gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2019 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #define IN_TARGET_CODE 1
  22
  23 #include "config.h"
  24 #define INCLUDE_STRING
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "memmodel.h"
  32 #include "gimple.h"
  33 #include "cfghooks.h"
  34 #include "cfgloop.h"
  35 #include "df.h"
  36 #include "tm_p.h"
  37 #include "stringpool.h"
  38 #include "attribs.h"
  39 #include "optabs.h"
  40 #include "regs.h"
  41 #include "emit-rtl.h"
  42 #include "recog.h"
  43 #include "cgraph.h"
  44 #include "diagnostic.h"
  45 #include "insn-attr.h"
  46 #include "alias.h"
  47 #include "fold-const.h"
  48 #include "stor-layout.h"
  49 #include "calls.h"
  50 #include "varasm.h"
  51 #include "output.h"
  52 #include "flags.h"
  53 #include "explow.h"
  54 #include "expr.h"
  55 #include "reload.h"
  56 #include "langhooks.h"
  57 #include "opts.h"
  58 #include "params.h"
  59 #include "gimplify.h"
  60 #include "dwarf2.h"
  61 #include "gimple-iterator.h"
  62 #include "tree-vectorizer.h"
  63 #include "aarch64-cost-tables.h"
  64 #include "dumpfile.h"
  65 #include "builtins.h"
  66 #include "rtl-iter.h"
  67 #include "tm-constrs.h"
  68 #include "sched-int.h"
  69 #include "target-globals.h"
  70 #include "common/common-target.h"
  71 #include "cfgrtl.h"
  72 #include "selftest.h"
  73 #include "selftest-rtl.h"
  74 #include "rtx-vector-builder.h"
  75 #include "intl.h"
  76 #include "expmed.h"
  77
  78 /* This file should be included last.  */
  79 #include "target-def.h"
  80
  81 /* Defined for convenience.  */
  82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  83
  84 /* Information about a legitimate vector immediate operand.  */
  85 struct simd_immediate_info
  86 {
  87   enum insn_type { MOV, MVN, INDEX, PTRUE };
  88   enum modifier_type { LSL, MSL };
  89
  90   simd_immediate_info () {}
  91   simd_immediate_info (scalar_float_mode, rtx);
  92   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
  93                        insn_type = MOV, modifier_type = LSL,
  94                        unsigned int = 0);
  95   simd_immediate_info (scalar_mode, rtx, rtx);
  96   simd_immediate_info (scalar_int_mode, aarch64_svpattern);
  97
  98   /* The mode of the elements.  */
  99   scalar_mode elt_mode;
 100
 101   /* The instruction to use to move the immediate into a vector.  */
 102   insn_type insn;
 103
 104   union
 105   {
 106     /* For MOV and MVN.  */
 107     struct
 108     {
 109       /* The value of each element.  */
 110       rtx value;
 111
 112       /* The kind of shift modifier to use, and the number of bits to shift.
 113          This is (LSL, 0) if no shift is needed.  */
 114       modifier_type modifier;
 115       unsigned int shift;
 116     } mov;
 117
 118     /* For INDEX.  */
 119     struct
 120     {
 121       /* The value of the first element and the step to be added for each
 122          subsequent element.  */
 123       rtx base, step;
 124     } index;
 125
 126     /* For PTRUE.  */
 127     aarch64_svpattern pattern;
 128   } u;
 129 };
 130
 131 /* Construct a floating-point immediate in which each element has mode
 132    ELT_MODE_IN and value VALUE_IN.  */
 133 inline simd_immediate_info
 134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
 135   : elt_mode (elt_mode_in), insn (MOV)
 136 {
 137   u.mov.value = value_in;
 138   u.mov.modifier = LSL;
 139   u.mov.shift = 0;
 140 }
 141
 142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 143    and value VALUE_IN.  The other parameters are as for the structure
 144    fields.  */
 145 inline simd_immediate_info
 146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 147                        unsigned HOST_WIDE_INT value_in,
 148                        insn_type insn_in, modifier_type modifier_in,
 149                        unsigned int shift_in)
 150   : elt_mode (elt_mode_in), insn (insn_in)
 151 {
 152   u.mov.value = gen_int_mode (value_in, elt_mode_in);
 153   u.mov.modifier = modifier_in;
 154   u.mov.shift = shift_in;
 155 }
 156
 157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 158    and where element I is equal to BASE_IN + I * STEP_IN.  */
 159 inline simd_immediate_info
 160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
 161   : elt_mode (elt_mode_in), insn (INDEX)
 162 {
 163   u.index.base = base_in;
 164   u.index.step = step_in;
 165 }
 166
 167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
 168    and has PTRUE pattern PATTERN_IN.  */
 169 inline simd_immediate_info
 170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 171                        aarch64_svpattern pattern_in)
 172   : elt_mode (elt_mode_in), insn (PTRUE)
 173 {
 174   u.pattern = pattern_in;
 175 }
 176
 177 /* The current code model.  */
 178 enum aarch64_code_model aarch64_cmodel;
 179
 180 /* The number of 64-bit elements in an SVE vector.  */
 181 poly_uint16 aarch64_sve_vg;
 182
 183 #ifdef HAVE_AS_TLS
 184 #undef TARGET_HAVE_TLS
 185 #define TARGET_HAVE_TLS 1
 186 #endif
 187
 188 static bool aarch64_composite_type_p (const_tree, machine_mode);
 189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 190                                                      const_tree,
 191                                                      machine_mode *, int *,
 192                                                      bool *);
 193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 195 static void aarch64_override_options_after_change (void);
 196 static bool aarch64_vector_mode_supported_p (machine_mode);
 197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 199                                                          const_tree type,
 200                                                          int misalignment,
 201                                                          bool is_packed);
 202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 204                                             aarch64_addr_query_type);
 205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 206
 207 /* Major revision number of the ARM Architecture implemented by the target.  */
 208 unsigned aarch64_architecture_version;
 209
 210 /* The processor for which instructions should be scheduled.  */
 211 enum aarch64_processor aarch64_tune = cortexa53;
 212
 213 /* Mask to specify which instruction scheduling options should be used.  */
 214 uint64_t aarch64_tune_flags = 0;
 215
 216 /* Global flag for PC relative loads.  */
 217 bool aarch64_pcrelative_literal_loads;
 218
 219 /* Global flag for whether frame pointer is enabled.  */
 220 bool aarch64_use_frame_pointer;
 221
 222 #define BRANCH_PROTECT_STR_MAX 255
 223 char *accepted_branch_protection_string = NULL;
 224
 225 static enum aarch64_parse_opt_result
 226 aarch64_parse_branch_protection (const char*, char**);
 227
 228 /* Support for command line parsing of boolean flags in the tuning
 229    structures.  */
 230 struct aarch64_flag_desc
 231 {
 232   const char* name;
 233   unsigned int flag;
 234 };
 235
 236 #define AARCH64_FUSION_PAIR(name, internal_name) \
 237   { name, AARCH64_FUSE_##internal_name },
 238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 239 {
 240   { "none", AARCH64_FUSE_NOTHING },
 241 #include "aarch64-fusion-pairs.def"
 242   { "all", AARCH64_FUSE_ALL },
 243   { NULL, AARCH64_FUSE_NOTHING }
 244 };
 245
 246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 247   { name, AARCH64_EXTRA_TUNE_##internal_name },
 248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 249 {
 250   { "none", AARCH64_EXTRA_TUNE_NONE },
 251 #include "aarch64-tuning-flags.def"
 252   { "all", AARCH64_EXTRA_TUNE_ALL },
 253   { NULL, AARCH64_EXTRA_TUNE_NONE }
 254 };
 255
 256 /* Tuning parameters.  */
 257
 258 static const struct cpu_addrcost_table generic_addrcost_table =
 259 {
 260     {
 261       1, /* hi  */
 262       0, /* si  */
 263       0, /* di  */
 264       1, /* ti  */
 265     },
 266   0, /* pre_modify  */
 267   0, /* post_modify  */
 268   0, /* register_offset  */
 269   0, /* register_sextend  */
 270   0, /* register_zextend  */
 271   0 /* imm_offset  */
 272 };
 273
 274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 275 {
 276     {
 277       0, /* hi  */
 278       0, /* si  */
 279       0, /* di  */
 280       2, /* ti  */
 281     },
 282   0, /* pre_modify  */
 283   0, /* post_modify  */
 284   1, /* register_offset  */
 285   1, /* register_sextend  */
 286   2, /* register_zextend  */
 287   0, /* imm_offset  */
 288 };
 289
 290 static const struct cpu_addrcost_table xgene1_addrcost_table =
 291 {
 292     {
 293       1, /* hi  */
 294       0, /* si  */
 295       0, /* di  */
 296       1, /* ti  */
 297     },
 298   1, /* pre_modify  */
 299   1, /* post_modify  */
 300   0, /* register_offset  */
 301   1, /* register_sextend  */
 302   1, /* register_zextend  */
 303   0, /* imm_offset  */
 304 };
 305
 306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 307 {
 308     {
 309       1, /* hi  */
 310       1, /* si  */
 311       1, /* di  */
 312       2, /* ti  */
 313     },
 314   0, /* pre_modify  */
 315   0, /* post_modify  */
 316   2, /* register_offset  */
 317   3, /* register_sextend  */
 318   3, /* register_zextend  */
 319   0, /* imm_offset  */
 320 };
 321
 322 static const struct cpu_addrcost_table tsv110_addrcost_table =
 323 {
 324     {
 325       1, /* hi  */
 326       0, /* si  */
 327       0, /* di  */
 328       1, /* ti  */
 329     },
 330   0, /* pre_modify  */
 331   0, /* post_modify  */
 332   0, /* register_offset  */
 333   1, /* register_sextend  */
 334   1, /* register_zextend  */
 335   0, /* imm_offset  */
 336 };
 337
 338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 339 {
 340     {
 341       1, /* hi  */
 342       1, /* si  */
 343       1, /* di  */
 344       2, /* ti  */
 345     },
 346   1, /* pre_modify  */
 347   1, /* post_modify  */
 348   3, /* register_offset  */
 349   3, /* register_sextend  */
 350   3, /* register_zextend  */
 351   2, /* imm_offset  */
 352 };
 353
 354 static const struct cpu_regmove_cost generic_regmove_cost =
 355 {
 356   1, /* GP2GP  */
 357   /* Avoid the use of slow int<->fp moves for spilling by setting
 358      their cost higher than memmov_cost.  */
 359   5, /* GP2FP  */
 360   5, /* FP2GP  */
 361   2 /* FP2FP  */
 362 };
 363
 364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 365 {
 366   1, /* GP2GP  */
 367   /* Avoid the use of slow int<->fp moves for spilling by setting
 368      their cost higher than memmov_cost.  */
 369   5, /* GP2FP  */
 370   5, /* FP2GP  */
 371   2 /* FP2FP  */
 372 };
 373
 374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 375 {
 376   1, /* GP2GP  */
 377   /* Avoid the use of slow int<->fp moves for spilling by setting
 378      their cost higher than memmov_cost.  */
 379   5, /* GP2FP  */
 380   5, /* FP2GP  */
 381   2 /* FP2FP  */
 382 };
 383
 384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 385 {
 386   1, /* GP2GP  */
 387   /* Avoid the use of slow int<->fp moves for spilling by setting
 388      their cost higher than memmov_cost (actual, 4 and 9).  */
 389   9, /* GP2FP  */
 390   9, /* FP2GP  */
 391   1 /* FP2FP  */
 392 };
 393
 394 static const struct cpu_regmove_cost thunderx_regmove_cost =
 395 {
 396   2, /* GP2GP  */
 397   2, /* GP2FP  */
 398   6, /* FP2GP  */
 399   4 /* FP2FP  */
 400 };
 401
 402 static const struct cpu_regmove_cost xgene1_regmove_cost =
 403 {
 404   1, /* GP2GP  */
 405   /* Avoid the use of slow int<->fp moves for spilling by setting
 406      their cost higher than memmov_cost.  */
 407   8, /* GP2FP  */
 408   8, /* FP2GP  */
 409   2 /* FP2FP  */
 410 };
 411
 412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 413 {
 414   2, /* GP2GP  */
 415   /* Avoid the use of int<->fp moves for spilling.  */
 416   6, /* GP2FP  */
 417   6, /* FP2GP  */
 418   4 /* FP2FP  */
 419 };
 420
 421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 422 {
 423   1, /* GP2GP  */
 424   /* Avoid the use of int<->fp moves for spilling.  */
 425   8, /* GP2FP  */
 426   8, /* FP2GP  */
 427   4  /* FP2FP  */
 428 };
 429
 430 static const struct cpu_regmove_cost tsv110_regmove_cost =
 431 {
 432   1, /* GP2GP  */
 433   /* Avoid the use of slow int<->fp moves for spilling by setting
 434      their cost higher than memmov_cost.  */
 435   2, /* GP2FP  */
 436   3, /* FP2GP  */
 437   2  /* FP2FP  */
 438 };
 439
 440 /* Generic costs for vector insn classes.  */
 441 static const struct cpu_vector_cost generic_vector_cost =
 442 {
 443   1, /* scalar_int_stmt_cost  */
 444   1, /* scalar_fp_stmt_cost  */
 445   1, /* scalar_load_cost  */
 446   1, /* scalar_store_cost  */
 447   1, /* vec_int_stmt_cost  */
 448   1, /* vec_fp_stmt_cost  */
 449   2, /* vec_permute_cost  */
 450   1, /* vec_to_scalar_cost  */
 451   1, /* scalar_to_vec_cost  */
 452   1, /* vec_align_load_cost  */
 453   1, /* vec_unalign_load_cost  */
 454   1, /* vec_unalign_store_cost  */
 455   1, /* vec_store_cost  */
 456   3, /* cond_taken_branch_cost  */
 457   1 /* cond_not_taken_branch_cost  */
 458 };
 459
 460 /* QDF24XX costs for vector insn classes.  */
 461 static const struct cpu_vector_cost qdf24xx_vector_cost =
 462 {
 463   1, /* scalar_int_stmt_cost  */
 464   1, /* scalar_fp_stmt_cost  */
 465   1, /* scalar_load_cost  */
 466   1, /* scalar_store_cost  */
 467   1, /* vec_int_stmt_cost  */
 468   3, /* vec_fp_stmt_cost  */
 469   2, /* vec_permute_cost  */
 470   1, /* vec_to_scalar_cost  */
 471   1, /* scalar_to_vec_cost  */
 472   1, /* vec_align_load_cost  */
 473   1, /* vec_unalign_load_cost  */
 474   1, /* vec_unalign_store_cost  */
 475   1, /* vec_store_cost  */
 476   3, /* cond_taken_branch_cost  */
 477   1 /* cond_not_taken_branch_cost  */
 478 };
 479
 480 /* ThunderX costs for vector insn classes.  */
 481 static const struct cpu_vector_cost thunderx_vector_cost =
 482 {
 483   1, /* scalar_int_stmt_cost  */
 484   1, /* scalar_fp_stmt_cost  */
 485   3, /* scalar_load_cost  */
 486   1, /* scalar_store_cost  */
 487   4, /* vec_int_stmt_cost  */
 488   1, /* vec_fp_stmt_cost  */
 489   4, /* vec_permute_cost  */
 490   2, /* vec_to_scalar_cost  */
 491   2, /* scalar_to_vec_cost  */
 492   3, /* vec_align_load_cost  */
 493   5, /* vec_unalign_load_cost  */
 494   5, /* vec_unalign_store_cost  */
 495   1, /* vec_store_cost  */
 496   3, /* cond_taken_branch_cost  */
 497   3 /* cond_not_taken_branch_cost  */
 498 };
 499
 500 static const struct cpu_vector_cost tsv110_vector_cost =
 501 {
 502   1, /* scalar_int_stmt_cost  */
 503   1, /* scalar_fp_stmt_cost  */
 504   5, /* scalar_load_cost  */
 505   1, /* scalar_store_cost  */
 506   2, /* vec_int_stmt_cost  */
 507   2, /* vec_fp_stmt_cost  */
 508   2, /* vec_permute_cost  */
 509   3, /* vec_to_scalar_cost  */
 510   2, /* scalar_to_vec_cost  */
 511   5, /* vec_align_load_cost  */
 512   5, /* vec_unalign_load_cost  */
 513   1, /* vec_unalign_store_cost  */
 514   1, /* vec_store_cost  */
 515   1, /* cond_taken_branch_cost  */
 516   1 /* cond_not_taken_branch_cost  */
 517 };
 518
 519 /* Generic costs for vector insn classes.  */
 520 static const struct cpu_vector_cost cortexa57_vector_cost =
 521 {
 522   1, /* scalar_int_stmt_cost  */
 523   1, /* scalar_fp_stmt_cost  */
 524   4, /* scalar_load_cost  */
 525   1, /* scalar_store_cost  */
 526   2, /* vec_int_stmt_cost  */
 527   2, /* vec_fp_stmt_cost  */
 528   3, /* vec_permute_cost  */
 529   8, /* vec_to_scalar_cost  */
 530   8, /* scalar_to_vec_cost  */
 531   4, /* vec_align_load_cost  */
 532   4, /* vec_unalign_load_cost  */
 533   1, /* vec_unalign_store_cost  */
 534   1, /* vec_store_cost  */
 535   1, /* cond_taken_branch_cost  */
 536   1 /* cond_not_taken_branch_cost  */
 537 };
 538
 539 static const struct cpu_vector_cost exynosm1_vector_cost =
 540 {
 541   1, /* scalar_int_stmt_cost  */
 542   1, /* scalar_fp_stmt_cost  */
 543   5, /* scalar_load_cost  */
 544   1, /* scalar_store_cost  */
 545   3, /* vec_int_stmt_cost  */
 546   3, /* vec_fp_stmt_cost  */
 547   3, /* vec_permute_cost  */
 548   3, /* vec_to_scalar_cost  */
 549   3, /* scalar_to_vec_cost  */
 550   5, /* vec_align_load_cost  */
 551   5, /* vec_unalign_load_cost  */
 552   1, /* vec_unalign_store_cost  */
 553   1, /* vec_store_cost  */
 554   1, /* cond_taken_branch_cost  */
 555   1 /* cond_not_taken_branch_cost  */
 556 };
 557
 558 /* Generic costs for vector insn classes.  */
 559 static const struct cpu_vector_cost xgene1_vector_cost =
 560 {
 561   1, /* scalar_int_stmt_cost  */
 562   1, /* scalar_fp_stmt_cost  */
 563   5, /* scalar_load_cost  */
 564   1, /* scalar_store_cost  */
 565   2, /* vec_int_stmt_cost  */
 566   2, /* vec_fp_stmt_cost  */
 567   2, /* vec_permute_cost  */
 568   4, /* vec_to_scalar_cost  */
 569   4, /* scalar_to_vec_cost  */
 570   10, /* vec_align_load_cost  */
 571   10, /* vec_unalign_load_cost  */
 572   2, /* vec_unalign_store_cost  */
 573   2, /* vec_store_cost  */
 574   2, /* cond_taken_branch_cost  */
 575   1 /* cond_not_taken_branch_cost  */
 576 };
 577
 578 /* Costs for vector insn classes for Vulcan.  */
 579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 580 {
 581   1, /* scalar_int_stmt_cost  */
 582   6, /* scalar_fp_stmt_cost  */
 583   4, /* scalar_load_cost  */
 584   1, /* scalar_store_cost  */
 585   5, /* vec_int_stmt_cost  */
 586   6, /* vec_fp_stmt_cost  */
 587   3, /* vec_permute_cost  */
 588   6, /* vec_to_scalar_cost  */
 589   5, /* scalar_to_vec_cost  */
 590   8, /* vec_align_load_cost  */
 591   8, /* vec_unalign_load_cost  */
 592   4, /* vec_unalign_store_cost  */
 593   4, /* vec_store_cost  */
 594   2, /* cond_taken_branch_cost  */
 595   1  /* cond_not_taken_branch_cost  */
 596 };
 597
 598 /* Generic costs for branch instructions.  */
 599 static const struct cpu_branch_cost generic_branch_cost =
 600 {
 601   1,  /* Predictable.  */
 602   3   /* Unpredictable.  */
 603 };
 604
 605 /* Generic approximation modes.  */
 606 static const cpu_approx_modes generic_approx_modes =
 607 {
 608   AARCH64_APPROX_NONE,  /* division  */
 609   AARCH64_APPROX_NONE,  /* sqrt  */
 610   AARCH64_APPROX_NONE   /* recip_sqrt  */
 611 };
 612
 613 /* Approximation modes for Exynos M1.  */
 614 static const cpu_approx_modes exynosm1_approx_modes =
 615 {
 616   AARCH64_APPROX_NONE,  /* division  */
 617   AARCH64_APPROX_ALL,   /* sqrt  */
 618   AARCH64_APPROX_ALL    /* recip_sqrt  */
 619 };
 620
 621 /* Approximation modes for X-Gene 1.  */
 622 static const cpu_approx_modes xgene1_approx_modes =
 623 {
 624   AARCH64_APPROX_NONE,  /* division  */
 625   AARCH64_APPROX_NONE,  /* sqrt  */
 626   AARCH64_APPROX_ALL    /* recip_sqrt  */
 627 };
 628
 629 /* Generic prefetch settings (which disable prefetch).  */
 630 static const cpu_prefetch_tune generic_prefetch_tune =
 631 {
 632   0,                    /* num_slots  */
 633   -1,                   /* l1_cache_size  */
 634   -1,                   /* l1_cache_line_size  */
 635   -1,                   /* l2_cache_size  */
 636   true,                 /* prefetch_dynamic_strides */
 637   -1,                   /* minimum_stride */
 638   -1                    /* default_opt_level  */
 639 };
 640
 641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 642 {
 643   0,                    /* num_slots  */
 644   -1,                   /* l1_cache_size  */
 645   64,                   /* l1_cache_line_size  */
 646   -1,                   /* l2_cache_size  */
 647   true,                 /* prefetch_dynamic_strides */
 648   -1,                   /* minimum_stride */
 649   -1                    /* default_opt_level  */
 650 };
 651
 652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 653 {
 654   4,                    /* num_slots  */
 655   32,                   /* l1_cache_size  */
 656   64,                   /* l1_cache_line_size  */
 657   512,                  /* l2_cache_size  */
 658   false,                /* prefetch_dynamic_strides */
 659   2048,                 /* minimum_stride */
 660   3                     /* default_opt_level  */
 661 };
 662
 663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 664 {
 665   8,                    /* num_slots  */
 666   32,                   /* l1_cache_size  */
 667   128,                  /* l1_cache_line_size  */
 668   16*1024,              /* l2_cache_size  */
 669   true,                 /* prefetch_dynamic_strides */
 670   -1,                   /* minimum_stride */
 671   3                     /* default_opt_level  */
 672 };
 673
 674 static const cpu_prefetch_tune thunderx_prefetch_tune =
 675 {
 676   8,                    /* num_slots  */
 677   32,                   /* l1_cache_size  */
 678   128,                  /* l1_cache_line_size  */
 679   -1,                   /* l2_cache_size  */
 680   true,                 /* prefetch_dynamic_strides */
 681   -1,                   /* minimum_stride */
 682   -1                    /* default_opt_level  */
 683 };
 684
 685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 686 {
 687   8,                    /* num_slots  */
 688   32,                   /* l1_cache_size  */
 689   64,                   /* l1_cache_line_size  */
 690   256,                  /* l2_cache_size  */
 691   true,                 /* prefetch_dynamic_strides */
 692   -1,                   /* minimum_stride */
 693   -1                    /* default_opt_level  */
 694 };
 695
 696 static const cpu_prefetch_tune tsv110_prefetch_tune =
 697 {
 698   0,                    /* num_slots  */
 699   64,                   /* l1_cache_size  */
 700   64,                   /* l1_cache_line_size  */
 701   512,                  /* l2_cache_size  */
 702   true,                 /* prefetch_dynamic_strides */
 703   -1,                   /* minimum_stride */
 704   -1                    /* default_opt_level  */
 705 };
 706
 707 static const cpu_prefetch_tune xgene1_prefetch_tune =
 708 {
 709   8,                    /* num_slots  */
 710   32,                   /* l1_cache_size  */
 711   64,                   /* l1_cache_line_size  */
 712   256,                  /* l2_cache_size  */
 713   true,                 /* prefetch_dynamic_strides */
 714   -1,                   /* minimum_stride */
 715   -1                    /* default_opt_level  */
 716 };
 717
 718 static const struct tune_params generic_tunings =
 719 {
 720   &cortexa57_extra_costs,
 721   &generic_addrcost_table,
 722   &generic_regmove_cost,
 723   &generic_vector_cost,
 724   &generic_branch_cost,
 725   &generic_approx_modes,
 726   SVE_NOT_IMPLEMENTED, /* sve_width  */
 727   4, /* memmov_cost  */
 728   2, /* issue_rate  */
 729   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 730   "16:12",      /* function_align.  */
 731   "4",  /* jump_align.  */
 732   "8",  /* loop_align.  */
 733   2,    /* int_reassoc_width.  */
 734   4,    /* fp_reassoc_width.  */
 735   1,    /* vec_reassoc_width.  */
 736   2,    /* min_div_recip_mul_sf.  */
 737   2,    /* min_div_recip_mul_df.  */
 738   0,    /* max_case_values.  */
 739   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 740   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 741   &generic_prefetch_tune
 742 };
 743
 744 static const struct tune_params cortexa35_tunings =
 745 {
 746   &cortexa53_extra_costs,
 747   &generic_addrcost_table,
 748   &cortexa53_regmove_cost,
 749   &generic_vector_cost,
 750   &generic_branch_cost,
 751   &generic_approx_modes,
 752   SVE_NOT_IMPLEMENTED, /* sve_width  */
 753   4, /* memmov_cost  */
 754   1, /* issue_rate  */
 755   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 756    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 757   "16", /* function_align.  */
 758   "4",  /* jump_align.  */
 759   "8",  /* loop_align.  */
 760   2,    /* int_reassoc_width.  */
 761   4,    /* fp_reassoc_width.  */
 762   1,    /* vec_reassoc_width.  */
 763   2,    /* min_div_recip_mul_sf.  */
 764   2,    /* min_div_recip_mul_df.  */
 765   0,    /* max_case_values.  */
 766   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 767   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 768   &generic_prefetch_tune
 769 };
 770
 771 static const struct tune_params cortexa53_tunings =
 772 {
 773   &cortexa53_extra_costs,
 774   &generic_addrcost_table,
 775   &cortexa53_regmove_cost,
 776   &generic_vector_cost,
 777   &generic_branch_cost,
 778   &generic_approx_modes,
 779   SVE_NOT_IMPLEMENTED, /* sve_width  */
 780   4, /* memmov_cost  */
 781   2, /* issue_rate  */
 782   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 783    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 784   "16", /* function_align.  */
 785   "4",  /* jump_align.  */
 786   "8",  /* loop_align.  */
 787   2,    /* int_reassoc_width.  */
 788   4,    /* fp_reassoc_width.  */
 789   1,    /* vec_reassoc_width.  */
 790   2,    /* min_div_recip_mul_sf.  */
 791   2,    /* min_div_recip_mul_df.  */
 792   0,    /* max_case_values.  */
 793   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 794   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 795   &generic_prefetch_tune
 796 };
 797
 798 static const struct tune_params cortexa57_tunings =
 799 {
 800   &cortexa57_extra_costs,
 801   &generic_addrcost_table,
 802   &cortexa57_regmove_cost,
 803   &cortexa57_vector_cost,
 804   &generic_branch_cost,
 805   &generic_approx_modes,
 806   SVE_NOT_IMPLEMENTED, /* sve_width  */
 807   4, /* memmov_cost  */
 808   3, /* issue_rate  */
 809   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 811   "16", /* function_align.  */
 812   "4",  /* jump_align.  */
 813   "8",  /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 822   &generic_prefetch_tune
 823 };
 824
 825 static const struct tune_params cortexa72_tunings =
 826 {
 827   &cortexa57_extra_costs,
 828   &generic_addrcost_table,
 829   &cortexa57_regmove_cost,
 830   &cortexa57_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   SVE_NOT_IMPLEMENTED, /* sve_width  */
 834   4, /* memmov_cost  */
 835   3, /* issue_rate  */
 836   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 837    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 838   "16", /* function_align.  */
 839   "4",  /* jump_align.  */
 840   "8",  /* loop_align.  */
 841   2,    /* int_reassoc_width.  */
 842   4,    /* fp_reassoc_width.  */
 843   1,    /* vec_reassoc_width.  */
 844   2,    /* min_div_recip_mul_sf.  */
 845   2,    /* min_div_recip_mul_df.  */
 846   0,    /* max_case_values.  */
 847   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 848   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 849   &generic_prefetch_tune
 850 };
 851
 852 static const struct tune_params cortexa73_tunings =
 853 {
 854   &cortexa57_extra_costs,
 855   &generic_addrcost_table,
 856   &cortexa57_regmove_cost,
 857   &cortexa57_vector_cost,
 858   &generic_branch_cost,
 859   &generic_approx_modes,
 860   SVE_NOT_IMPLEMENTED, /* sve_width  */
 861   4, /* memmov_cost.  */
 862   2, /* issue_rate.  */
 863   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 864    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 865   "16", /* function_align.  */
 866   "4",  /* jump_align.  */
 867   "8",  /* loop_align.  */
 868   2,    /* int_reassoc_width.  */
 869   4,    /* fp_reassoc_width.  */
 870   1,    /* vec_reassoc_width.  */
 871   2,    /* min_div_recip_mul_sf.  */
 872   2,    /* min_div_recip_mul_df.  */
 873   0,    /* max_case_values.  */
 874   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 875   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 876   &generic_prefetch_tune
 877 };
 878
 879
 880
 881 static const struct tune_params exynosm1_tunings =
 882 {
 883   &exynosm1_extra_costs,
 884   &exynosm1_addrcost_table,
 885   &exynosm1_regmove_cost,
 886   &exynosm1_vector_cost,
 887   &generic_branch_cost,
 888   &exynosm1_approx_modes,
 889   SVE_NOT_IMPLEMENTED, /* sve_width  */
 890   4,    /* memmov_cost  */
 891   3,    /* issue_rate  */
 892   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 893   "4",  /* function_align.  */
 894   "4",  /* jump_align.  */
 895   "4",  /* loop_align.  */
 896   2,    /* int_reassoc_width.  */
 897   4,    /* fp_reassoc_width.  */
 898   1,    /* vec_reassoc_width.  */
 899   2,    /* min_div_recip_mul_sf.  */
 900   2,    /* min_div_recip_mul_df.  */
 901   48,   /* max_case_values.  */
 902   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 903   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 904   &exynosm1_prefetch_tune
 905 };
 906
 907 static const struct tune_params thunderxt88_tunings =
 908 {
 909   &thunderx_extra_costs,
 910   &generic_addrcost_table,
 911   &thunderx_regmove_cost,
 912   &thunderx_vector_cost,
 913   &generic_branch_cost,
 914   &generic_approx_modes,
 915   SVE_NOT_IMPLEMENTED, /* sve_width  */
 916   6, /* memmov_cost  */
 917   2, /* issue_rate  */
 918   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 919   "8",  /* function_align.  */
 920   "8",  /* jump_align.  */
 921   "8",  /* loop_align.  */
 922   2,    /* int_reassoc_width.  */
 923   4,    /* fp_reassoc_width.  */
 924   1,    /* vec_reassoc_width.  */
 925   2,    /* min_div_recip_mul_sf.  */
 926   2,    /* min_div_recip_mul_df.  */
 927   0,    /* max_case_values.  */
 928   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 929   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 930   &thunderxt88_prefetch_tune
 931 };
 932
 933 static const struct tune_params thunderx_tunings =
 934 {
 935   &thunderx_extra_costs,
 936   &generic_addrcost_table,
 937   &thunderx_regmove_cost,
 938   &thunderx_vector_cost,
 939   &generic_branch_cost,
 940   &generic_approx_modes,
 941   SVE_NOT_IMPLEMENTED, /* sve_width  */
 942   6, /* memmov_cost  */
 943   2, /* issue_rate  */
 944   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 945   "8",  /* function_align.  */
 946   "8",  /* jump_align.  */
 947   "8",  /* loop_align.  */
 948   2,    /* int_reassoc_width.  */
 949   4,    /* fp_reassoc_width.  */
 950   1,    /* vec_reassoc_width.  */
 951   2,    /* min_div_recip_mul_sf.  */
 952   2,    /* min_div_recip_mul_df.  */
 953   0,    /* max_case_values.  */
 954   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 955   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 956    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 957   &thunderx_prefetch_tune
 958 };
 959
 960 static const struct tune_params tsv110_tunings =
 961 {
 962   &tsv110_extra_costs,
 963   &tsv110_addrcost_table,
 964   &tsv110_regmove_cost,
 965   &tsv110_vector_cost,
 966   &generic_branch_cost,
 967   &generic_approx_modes,
 968   SVE_NOT_IMPLEMENTED, /* sve_width  */
 969   4,    /* memmov_cost  */
 970   4,    /* issue_rate  */
 971   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
 972    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 973   "16", /* function_align.  */
 974   "4",  /* jump_align.  */
 975   "8",  /* loop_align.  */
 976   2,    /* int_reassoc_width.  */
 977   4,    /* fp_reassoc_width.  */
 978   1,    /* vec_reassoc_width.  */
 979   2,    /* min_div_recip_mul_sf.  */
 980   2,    /* min_div_recip_mul_df.  */
 981   0,    /* max_case_values.  */
 982   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 983   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
 984   &tsv110_prefetch_tune
 985 };
 986
 987 static const struct tune_params xgene1_tunings =
 988 {
 989   &xgene1_extra_costs,
 990   &xgene1_addrcost_table,
 991   &xgene1_regmove_cost,
 992   &xgene1_vector_cost,
 993   &generic_branch_cost,
 994   &xgene1_approx_modes,
 995   SVE_NOT_IMPLEMENTED, /* sve_width  */
 996   6, /* memmov_cost  */
 997   4, /* issue_rate  */
 998   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 999   "16", /* function_align.  */
1000   "16", /* jump_align.  */
1001   "16", /* loop_align.  */
1002   2,    /* int_reassoc_width.  */
1003   4,    /* fp_reassoc_width.  */
1004   1,    /* vec_reassoc_width.  */
1005   2,    /* min_div_recip_mul_sf.  */
1006   2,    /* min_div_recip_mul_df.  */
1007   17,   /* max_case_values.  */
1008   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
1009   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
1010   &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015   &xgene1_extra_costs,
1016   &xgene1_addrcost_table,
1017   &xgene1_regmove_cost,
1018   &xgene1_vector_cost,
1019   &generic_branch_cost,
1020   &xgene1_approx_modes,
1021   SVE_NOT_IMPLEMENTED,
1022   6, /* memmov_cost  */
1023   4, /* issue_rate  */
1024   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1025   "16", /* function_align.  */
1026   "16", /* jump_align.  */
1027   "16", /* loop_align.  */
1028   2,    /* int_reassoc_width.  */
1029   4,    /* fp_reassoc_width.  */
1030   1,    /* vec_reassoc_width.  */
1031   2,    /* min_div_recip_mul_sf.  */
1032   2,    /* min_div_recip_mul_df.  */
1033   17,   /* max_case_values.  */
1034   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
1035   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
1036   &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041   &qdf24xx_extra_costs,
1042   &qdf24xx_addrcost_table,
1043   &qdf24xx_regmove_cost,
1044   &qdf24xx_vector_cost,
1045   &generic_branch_cost,
1046   &generic_approx_modes,
1047   SVE_NOT_IMPLEMENTED, /* sve_width  */
1048   4, /* memmov_cost  */
1049   4, /* issue_rate  */
1050   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1052   "16", /* function_align.  */
1053   "8",  /* jump_align.  */
1054   "16", /* loop_align.  */
1055   2,    /* int_reassoc_width.  */
1056   4,    /* fp_reassoc_width.  */
1057   1,    /* vec_reassoc_width.  */
1058   2,    /* min_div_recip_mul_sf.  */
1059   2,    /* min_div_recip_mul_df.  */
1060   0,    /* max_case_values.  */
1061   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1062   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1063   &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
1067    for now.  */
1068 static const struct tune_params saphira_tunings =
1069 {
1070   &generic_extra_costs,
1071   &generic_addrcost_table,
1072   &generic_regmove_cost,
1073   &generic_vector_cost,
1074   &generic_branch_cost,
1075   &generic_approx_modes,
1076   SVE_NOT_IMPLEMENTED, /* sve_width  */
1077   4, /* memmov_cost  */
1078   4, /* issue_rate  */
1079   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1081   "16", /* function_align.  */
1082   "8",  /* jump_align.  */
1083   "16", /* loop_align.  */
1084   2,    /* int_reassoc_width.  */
1085   4,    /* fp_reassoc_width.  */
1086   1,    /* vec_reassoc_width.  */
1087   2,    /* min_div_recip_mul_sf.  */
1088   2,    /* min_div_recip_mul_df.  */
1089   0,    /* max_case_values.  */
1090   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1091   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
1092   &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097   &thunderx2t99_extra_costs,
1098   &thunderx2t99_addrcost_table,
1099   &thunderx2t99_regmove_cost,
1100   &thunderx2t99_vector_cost,
1101   &generic_branch_cost,
1102   &generic_approx_modes,
1103   SVE_NOT_IMPLEMENTED, /* sve_width  */
1104   4, /* memmov_cost.  */
1105   4, /* issue_rate.  */
1106   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
1108   "16", /* function_align.  */
1109   "8",  /* jump_align.  */
1110   "16", /* loop_align.  */
1111   3,    /* int_reassoc_width.  */
1112   2,    /* fp_reassoc_width.  */
1113   2,    /* vec_reassoc_width.  */
1114   2,    /* min_div_recip_mul_sf.  */
1115   2,    /* min_div_recip_mul_df.  */
1116   0,    /* max_case_values.  */
1117   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1118   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1119   &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124   &cortexa57_extra_costs,
1125   &generic_addrcost_table,
1126   &generic_regmove_cost,
1127   &cortexa57_vector_cost,
1128   &generic_branch_cost,
1129   &generic_approx_modes,
1130   SVE_NOT_IMPLEMENTED, /* sve_width  */
1131   4, /* memmov_cost  */
1132   3, /* issue_rate  */
1133   AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
1134   "32:16",      /* function_align.  */
1135   "32:16",      /* jump_align.  */
1136   "32:16",      /* loop_align.  */
1137   2,    /* int_reassoc_width.  */
1138   4,    /* fp_reassoc_width.  */
1139   2,    /* vec_reassoc_width.  */
1140   2,    /* min_div_recip_mul_sf.  */
1141   2,    /* min_div_recip_mul_df.  */
1142   0,    /* max_case_values.  */
1143   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1144   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1145   &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures.  */
1149 struct aarch64_tuning_override_function
1150 {
1151   const char* name;
1152   void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162   { "fuse", aarch64_parse_fuse_string },
1163   { "tune", aarch64_parse_tune_string },
1164   { "sve_width", aarch64_parse_sve_width_string },
1165   { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64.  */
1169 struct processor
1170 {
1171   const char *const name;
1172   enum aarch64_processor ident;
1173   enum aarch64_processor sched_core;
1174   enum aarch64_arch arch;
1175   unsigned architecture_version;
1176   const uint64_t flags;
1177   const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64.  */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64.  */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
1194   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
1195   FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1204    handling code or by target attributes.  */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set.  */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes.  */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218        affects_type_identity, handler, exclude } */
1219   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,  NULL, NULL },
1220   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space.  */
1226 struct aarch64_option_extension
1227 {
1228   const char *const name;
1229   const unsigned long flags_on;
1230   const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1242
1243 struct aarch64_branch_protect_type
1244 {
1245   /* The type's name that the user passes to the branch-protection option
1246     string.  */
1247   const char* name;
1248   /* Function to handle the protection type and set global variables.
1249     First argument is the string token corresponding with this type and the
1250     second argument is the next token in the option string.
1251     Return values:
1252     * AARCH64_PARSE_OK: Handling was sucessful.
1253     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1254       should print an error.
1255     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1256       own error.  */
1257   enum aarch64_parse_opt_result (*handler)(char*, char*);
1258   /* A list of types that can follow this type in the option string.  */
1259   const aarch64_branch_protect_type* subtypes;
1260   unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267   aarch64_enable_bti = 0;
1268   if (rest)
1269     {
1270       error ("unexpected %<%s%> after %<%s%>", rest, str);
1271       return AARCH64_PARSE_INVALID_FEATURE;
1272     }
1273   return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280   aarch64_ra_sign_key = AARCH64_KEY_A;
1281   aarch64_enable_bti = 1;
1282   if (rest)
1283     {
1284       error ("unexpected %<%s%> after %<%s%>", rest, str);
1285       return AARCH64_PARSE_INVALID_FEATURE;
1286     }
1287   return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292                                     char* rest ATTRIBUTE_UNUSED)
1293 {
1294   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295   aarch64_ra_sign_key = AARCH64_KEY_A;
1296   return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301                               char* rest ATTRIBUTE_UNUSED)
1302 {
1303   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304   return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309                               char* rest ATTRIBUTE_UNUSED)
1310 {
1311   aarch64_ra_sign_key = AARCH64_KEY_B;
1312   return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317                                     char* rest ATTRIBUTE_UNUSED)
1318 {
1319   aarch64_enable_bti = 1;
1320   return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325   { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326   { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335   { NULL, NULL, NULL, 0 }
1336 };
1337
1338 /* The condition codes of the processor, and the inverse function.  */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions.  */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348   "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349   "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
1351
1352 /* Return the assembly token for svpattern value VALUE.  */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357   switch (pattern)
1358     {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360     AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362     case AARCH64_NUM_SVPATTERNS:
1363       break;
1364     }
1365   gcc_unreachable ();
1366 }
1367
1368 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371                         const char * branch_format)
1372 {
1373     rtx_code_label * tmp_label = gen_label_rtx ();
1374     char label_buf[256];
1375     char buffer[128];
1376     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377                                  CODE_LABEL_NUMBER (tmp_label));
1378     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379     rtx dest_label = operands[pos_label];
1380     operands[pos_label] = tmp_label;
1381
1382     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383     output_asm_insn (buffer, operands);
1384
1385     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386     operands[pos_label] = dest_label;
1387     output_asm_insn (buffer, operands);
1388     return "";
1389 }
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394   if (TARGET_GENERAL_REGS_ONLY)
1395     if (FLOAT_MODE_P (mode))
1396       error ("%qs is incompatible with the use of floating-point types",
1397              "-mgeneral-regs-only");
1398     else
1399       error ("%qs is incompatible with the use of vector types",
1400              "-mgeneral-regs-only");
1401   else
1402     if (FLOAT_MODE_P (mode))
1403       error ("%qs feature modifier is incompatible with the use of"
1404              " floating-point types", "+nofp");
1405     else
1406       error ("%qs feature modifier is incompatible with the use of"
1407              " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414    and GENERAL_REGS is lower than the memory cost (in this case the best class
1415    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
1416    cost results in bad allocations with many redundant int<->FP moves which
1417    are expensive on various cores.
1418    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1420    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1421    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1422    The result of this is that it is no longer inefficient to have a higher
1423    memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428                                          reg_class_t best_class)
1429 {
1430   machine_mode mode;
1431
1432   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433       || !reg_class_subset_p (FP_REGS, allocno_class))
1434     return allocno_class;
1435
1436   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437       || !reg_class_subset_p (FP_REGS, best_class))
1438     return best_class;
1439
1440   mode = PSEUDO_REGNO_MODE (regno);
1441   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447   if (GET_MODE_UNIT_SIZE (mode) == 4)
1448     return aarch64_tune_params.min_div_recip_mul_sf;
1449   return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE.  */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456   if (VECTOR_MODE_P (mode))
1457     return aarch64_tune_params.vec_reassoc_width;
1458   if (INTEGRAL_MODE_P (mode))
1459     return aarch64_tune_params.int_reassoc_width;
1460   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1461   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462     return aarch64_tune_params.fp_reassoc_width;
1463   return 1;
1464 }
1465
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470    if (GP_REGNUM_P (regno))
1471      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472    else if (regno == SP_REGNUM)
1473      return AARCH64_DWARF_SP;
1474    else if (FP_REGNUM_P (regno))
1475      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476    else if (PR_REGNUM_P (regno))
1477      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478    else if (regno == VG_REGNUM)
1479      return AARCH64_DWARF_VG;
1480
1481    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482       equivalent DWARF register.  */
1483    return DWARF_FRAME_REGISTERS;
1484 }
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487    integer, otherwise return X unmodified.  */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491   if (CONST_DOUBLE_P (x))
1492     x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493   return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500   return (TARGET_SIMD
1501           && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode.  */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508   return (TARGET_SVE
1509           && (mode == VNx16BImode
1510               || mode == VNx8BImode
1511               || mode == VNx4BImode
1512               || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1516 const unsigned int VEC_ADVSIMD  = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520    a structure of 2, 3 or 4 vectors.  */
1521 const unsigned int VEC_STRUCT   = 8;
1522 /* Useful combinations of the above.  */
1523 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527    Ignore modes that are not supported by the current target.  */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531   if (aarch64_advsimd_struct_mode_p (mode))
1532     return VEC_ADVSIMD | VEC_STRUCT;
1533
1534   if (aarch64_sve_pred_mode_p (mode))
1535     return VEC_SVE_PRED;
1536
1537   /* Make the decision based on the mode's enum value rather than its
1538      properties, so that we keep the correct classification regardless
1539      of -msve-vector-bits.  */
1540   switch (mode)
1541     {
1542     /* Single SVE vectors.  */
1543     case E_VNx16QImode:
1544     case E_VNx8HImode:
1545     case E_VNx4SImode:
1546     case E_VNx2DImode:
1547     case E_VNx8HFmode:
1548     case E_VNx4SFmode:
1549     case E_VNx2DFmode:
1550       return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552     /* x2 SVE vectors.  */
1553     case E_VNx32QImode:
1554     case E_VNx16HImode:
1555     case E_VNx8SImode:
1556     case E_VNx4DImode:
1557     case E_VNx16HFmode:
1558     case E_VNx8SFmode:
1559     case E_VNx4DFmode:
1560     /* x3 SVE vectors.  */
1561     case E_VNx48QImode:
1562     case E_VNx24HImode:
1563     case E_VNx12SImode:
1564     case E_VNx6DImode:
1565     case E_VNx24HFmode:
1566     case E_VNx12SFmode:
1567     case E_VNx6DFmode:
1568     /* x4 SVE vectors.  */
1569     case E_VNx64QImode:
1570     case E_VNx32HImode:
1571     case E_VNx16SImode:
1572     case E_VNx8DImode:
1573     case E_VNx32HFmode:
1574     case E_VNx16SFmode:
1575     case E_VNx8DFmode:
1576       return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578     /* 64-bit Advanced SIMD vectors.  */
1579     case E_V8QImode:
1580     case E_V4HImode:
1581     case E_V2SImode:
1582     /* ...E_V1DImode doesn't exist.  */
1583     case E_V4HFmode:
1584     case E_V2SFmode:
1585     case E_V1DFmode:
1586     /* 128-bit Advanced SIMD vectors.  */
1587     case E_V16QImode:
1588     case E_V8HImode:
1589     case E_V4SImode:
1590     case E_V2DImode:
1591     case E_V8HFmode:
1592     case E_V4SFmode:
1593     case E_V2DFmode:
1594       return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596     default:
1597       return 0;
1598     }
1599 }
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602    structure modes.  */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610    vectors and structures.  */
1611 bool
1612 aarch64_sve_mode_p (machine_mode mode)
1613 {
1614   return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1615 }
1616
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618    or a structure of vectors.  */
1619 static bool
1620 aarch64_sve_data_mode_p (machine_mode mode)
1621 {
1622   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1623 }
1624
1625 /* Implement target hook TARGET_ARRAY_MODE.  */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1628 {
1629   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1630       && IN_RANGE (nelems, 2, 4))
1631     return mode_for_vector (GET_MODE_INNER (mode),
1632                             GET_MODE_NUNITS (mode) * nelems);
1633
1634   return opt_machine_mode ();
1635 }
1636
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1638 static bool
1639 aarch64_array_mode_supported_p (machine_mode mode,
1640                                 unsigned HOST_WIDE_INT nelems)
1641 {
1642   if (TARGET_SIMD
1643       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1644           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1645       && (nelems >= 2 && nelems <= 4))
1646     return true;
1647
1648   return false;
1649 }
1650
1651 /* Return the SVE predicate mode to use for elements that have
1652    ELEM_NBYTES bytes, if such a mode exists.  */
1653
1654 opt_machine_mode
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1656 {
1657   if (TARGET_SVE)
1658     {
1659       if (elem_nbytes == 1)
1660         return VNx16BImode;
1661       if (elem_nbytes == 2)
1662         return VNx8BImode;
1663       if (elem_nbytes == 4)
1664         return VNx4BImode;
1665       if (elem_nbytes == 8)
1666         return VNx2BImode;
1667     }
1668   return opt_machine_mode ();
1669 }
1670
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1672
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1675 {
1676   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1677     {
1678       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1679       machine_mode pred_mode;
1680       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1681         return pred_mode;
1682     }
1683
1684   return default_get_mask_mode (nunits, nbytes);
1685 }
1686
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
1688
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1691 {
1692   enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1693                             ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1694   machine_mode mode;
1695   FOR_EACH_MODE_IN_CLASS (mode, mclass)
1696     if (inner_mode == GET_MODE_INNER (mode)
1697         && known_eq (nunits, GET_MODE_NUNITS (mode))
1698         && aarch64_sve_data_mode_p (mode))
1699       return mode;
1700   return opt_machine_mode ();
1701 }
1702
1703 /* Return the integer element mode associated with SVE mode MODE.  */
1704
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode)
1707 {
1708   unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1709                                                GET_MODE_NUNITS (mode));
1710   return int_mode_for_size (elt_bits, 0).require ();
1711 }
1712
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714    Unlike mode_for_int_vector, this can handle the case in which
1715    MODE is a predicate (and thus has a different total size).  */
1716
1717 static machine_mode
1718 aarch64_sve_int_mode (machine_mode mode)
1719 {
1720   scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1721   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1722 }
1723
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
1725    prefer to use the first arithmetic operand as the else value if
1726    the else value doesn't matter, since that exactly matches the SVE
1727    destructive merging form.  For ternary operations we could either
1728    pick the first operand and use FMAD-like instructions or the last
1729    operand and use FMLA-like instructions; the latter seems more
1730    natural.  */
1731
1732 static tree
1733 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1734 {
1735   return nops == 3 ? ops[2] : ops[0];
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_NREGS.  */
1739
1740 static unsigned int
1741 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1742 {
1743   /* ??? Logically we should only need to provide a value when
1744      HARD_REGNO_MODE_OK says that the combination is valid,
1745      but at the moment we need to handle all modes.  Just ignore
1746      any runtime parts for registers that can't store them.  */
1747   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1748   switch (aarch64_regno_regclass (regno))
1749     {
1750     case FP_REGS:
1751     case FP_LO_REGS:
1752     case FP_LO8_REGS:
1753       if (aarch64_sve_data_mode_p (mode))
1754         return exact_div (GET_MODE_SIZE (mode),
1755                           BYTES_PER_SVE_VECTOR).to_constant ();
1756       return CEIL (lowest_size, UNITS_PER_VREG);
1757     case PR_REGS:
1758     case PR_LO_REGS:
1759     case PR_HI_REGS:
1760       return 1;
1761     default:
1762       return CEIL (lowest_size, UNITS_PER_WORD);
1763     }
1764   gcc_unreachable ();
1765 }
1766
1767 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1768
1769 static bool
1770 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1771 {
1772   if (GET_MODE_CLASS (mode) == MODE_CC)
1773     return regno == CC_REGNUM;
1774
1775   if (regno == VG_REGNUM)
1776     /* This must have the same size as _Unwind_Word.  */
1777     return mode == DImode;
1778
1779   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1780   if (vec_flags & VEC_SVE_PRED)
1781     return PR_REGNUM_P (regno);
1782
1783   if (PR_REGNUM_P (regno))
1784     return 0;
1785
1786   if (regno == SP_REGNUM)
1787     /* The purpose of comparing with ptr_mode is to support the
1788        global register variable associated with the stack pointer
1789        register via the syntax of asm ("wsp") in ILP32.  */
1790     return mode == Pmode || mode == ptr_mode;
1791
1792   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1793     return mode == Pmode;
1794
1795   if (GP_REGNUM_P (regno))
1796     {
1797       if (known_le (GET_MODE_SIZE (mode), 8))
1798         return true;
1799       else if (known_le (GET_MODE_SIZE (mode), 16))
1800         return (regno & 1) == 0;
1801     }
1802   else if (FP_REGNUM_P (regno))
1803     {
1804       if (vec_flags & VEC_STRUCT)
1805         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1806       else
1807         return !VECTOR_MODE_P (mode) || vec_flags != 0;
1808     }
1809
1810   return false;
1811 }
1812
1813 /* Return true if this is a definition of a vectorized simd function.  */
1814
1815 static bool
1816 aarch64_simd_decl_p (tree fndecl)
1817 {
1818   tree fntype;
1819
1820   if (fndecl == NULL)
1821     return false;
1822   fntype = TREE_TYPE (fndecl);
1823   if (fntype == NULL)
1824     return false;
1825
1826   /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
1827   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1828     return true;
1829
1830   return false;
1831 }
1832
1833 /* Return the mode a register save/restore should use.  DImode for integer
1834    registers, DFmode for FP registers in non-SIMD functions (they only save
1835    the bottom half of a 128 bit register), or TFmode for FP registers in
1836    SIMD functions.  */
1837
1838 static machine_mode
1839 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1840 {
1841   return GP_REGNUM_P (regno)
1842            ? E_DImode
1843            : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1844 }
1845
1846 /* Return true if the instruction is a call to a SIMD function, false
1847    if it is not a SIMD function or if we do not know anything about
1848    the function.  */
1849
1850 static bool
1851 aarch64_simd_call_p (rtx_insn *insn)
1852 {
1853   rtx symbol;
1854   rtx call;
1855   tree fndecl;
1856
1857   gcc_assert (CALL_P (insn));
1858   call = get_call_rtx_from (insn);
1859   symbol = XEXP (XEXP (call, 0), 0);
1860   if (GET_CODE (symbol) != SYMBOL_REF)
1861     return false;
1862   fndecl = SYMBOL_REF_DECL (symbol);
1863   if (!fndecl)
1864     return false;
1865
1866   return aarch64_simd_decl_p (fndecl);
1867 }
1868
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS.  If INSN calls
1870    a function that uses the SIMD ABI, take advantage of the extra
1871    call-preserved registers that the ABI provides.  */
1872
1873 void
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1875                                           HARD_REG_SET *return_set)
1876 {
1877   if (aarch64_simd_call_p (insn))
1878     {
1879       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1880         if (FP_SIMD_SAVED_REGNUM_P (regno))
1881           CLEAR_HARD_REG_BIT (*return_set, regno);
1882     }
1883 }
1884
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1886    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1887    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1888
1889 static bool
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1891                                         machine_mode mode)
1892 {
1893   bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1894   return FP_REGNUM_P (regno)
1895          && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1896 }
1897
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS.  */
1899
1900 rtx_insn *
1901 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1902 {
1903   gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1904
1905   if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1906     return call_1;
1907   else
1908     return call_2;
1909 }
1910
1911 /* Implement REGMODE_NATURAL_SIZE.  */
1912 poly_uint64
1913 aarch64_regmode_natural_size (machine_mode mode)
1914 {
1915   /* The natural size for SVE data modes is one SVE data vector,
1916      and similarly for predicates.  We can't independently modify
1917      anything smaller than that.  */
1918   /* ??? For now, only do this for variable-width SVE registers.
1919      Doing it for constant-sized registers breaks lower-subreg.c.  */
1920   /* ??? And once that's fixed, we should probably have similar
1921      code for Advanced SIMD.  */
1922   if (!aarch64_sve_vg.is_constant ())
1923     {
1924       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1925       if (vec_flags & VEC_SVE_PRED)
1926         return BYTES_PER_SVE_PRED;
1927       if (vec_flags & VEC_SVE_DATA)
1928         return BYTES_PER_SVE_VECTOR;
1929     }
1930   return UNITS_PER_WORD;
1931 }
1932
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1934 machine_mode
1935 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1936                                      machine_mode mode)
1937 {
1938   /* The predicate mode determines which bits are significant and
1939      which are "don't care".  Decreasing the number of lanes would
1940      lose data while increasing the number of lanes would make bits
1941      unnecessarily significant.  */
1942   if (PR_REGNUM_P (regno))
1943     return mode;
1944   if (known_ge (GET_MODE_SIZE (mode), 4))
1945     return mode;
1946   else
1947     return SImode;
1948 }
1949
1950 /* Return true if I's bits are consecutive ones from the MSB.  */
1951 bool
1952 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1953 {
1954   return exact_log2 (-i) != HOST_WIDE_INT_M1;
1955 }
1956
1957 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1958    that strcpy from constants will be faster.  */
1959
1960 static HOST_WIDE_INT
1961 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1962 {
1963   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1964     return MAX (align, BITS_PER_WORD);
1965   return align;
1966 }
1967
1968 /* Return true if calls to DECL should be treated as
1969    long-calls (ie called via a register).  */
1970 static bool
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1972 {
1973   return false;
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977    long-calls (ie called via a register).  */
1978 bool
1979 aarch64_is_long_call_p (rtx sym)
1980 {
1981   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1982 }
1983
1984 /* Return true if calls to symbol-ref SYM should not go through
1985    plt stubs.  */
1986
1987 bool
1988 aarch64_is_noplt_call_p (rtx sym)
1989 {
1990   const_tree decl = SYMBOL_REF_DECL (sym);
1991
1992   if (flag_pic
1993       && decl
1994       && (!flag_plt
1995           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1996       && !targetm.binds_local_p (decl))
1997     return true;
1998
1999   return false;
2000 }
2001
2002 /* Return true if the offsets to a zero/sign-extract operation
2003    represent an expression that matches an extend operation.  The
2004    operands represent the paramters from
2005
2006    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
2007 bool
2008 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2009                                 rtx extract_imm)
2010 {
2011   HOST_WIDE_INT mult_val, extract_val;
2012
2013   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2014     return false;
2015
2016   mult_val = INTVAL (mult_imm);
2017   extract_val = INTVAL (extract_imm);
2018
2019   if (extract_val > 8
2020       && extract_val < GET_MODE_BITSIZE (mode)
2021       && exact_log2 (extract_val & ~7) > 0
2022       && (extract_val & 7) <= 4
2023       && mult_val == (1 << (extract_val & 7)))
2024     return true;
2025
2026   return false;
2027 }
2028
2029 /* Emit an insn that's a simple single-set.  Both the operands must be
2030    known to be valid.  */
2031 inline static rtx_insn *
2032 emit_set_insn (rtx x, rtx y)
2033 {
2034   return emit_insn (gen_rtx_SET (x, y));
2035 }
2036
2037 /* X and Y are two things to compare using CODE.  Emit the compare insn and
2038    return the rtx for register 0 in the proper mode.  */
2039 rtx
2040 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2041 {
2042   machine_mode mode = SELECT_CC_MODE (code, x, y);
2043   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2044
2045   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2046   return cc_reg;
2047 }
2048
2049 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
2050
2051 static rtx
2052 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2053                                   machine_mode y_mode)
2054 {
2055   if (y_mode == E_QImode || y_mode == E_HImode)
2056     {
2057       if (CONST_INT_P (y))
2058         y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2059       else
2060         {
2061           rtx t, cc_reg;
2062           machine_mode cc_mode;
2063
2064           t = gen_rtx_ZERO_EXTEND (SImode, y);
2065           t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2066           cc_mode = CC_SWPmode;
2067           cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2068           emit_set_insn (cc_reg, t);
2069           return cc_reg;
2070         }
2071     }
2072
2073   return aarch64_gen_compare_reg (code, x, y);
2074 }
2075
2076 /* Build the SYMBOL_REF for __tls_get_addr.  */
2077
2078 static GTY(()) rtx tls_get_addr_libfunc;
2079
2080 rtx
2081 aarch64_tls_get_addr (void)
2082 {
2083   if (!tls_get_addr_libfunc)
2084     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2085   return tls_get_addr_libfunc;
2086 }
2087
2088 /* Return the TLS model to use for ADDR.  */
2089
2090 static enum tls_model
2091 tls_symbolic_operand_type (rtx addr)
2092 {
2093   enum tls_model tls_kind = TLS_MODEL_NONE;
2094   if (GET_CODE (addr) == CONST)
2095     {
2096       poly_int64 addend;
2097       rtx sym = strip_offset (addr, &addend);
2098       if (GET_CODE (sym) == SYMBOL_REF)
2099         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2100     }
2101   else if (GET_CODE (addr) == SYMBOL_REF)
2102     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2103
2104   return tls_kind;
2105 }
2106
2107 /* We'll allow lo_sum's in addresses in our legitimate addresses
2108    so that combine would take care of combining addresses where
2109    necessary, but for generation purposes, we'll generate the address
2110    as :
2111    RTL                               Absolute
2112    tmp = hi (symbol_ref);            adrp  x1, foo
2113    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
2114                                      nop
2115
2116    PIC                               TLS
2117    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
2118    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
2119                                      bl   __tls_get_addr
2120                                      nop
2121
2122    Load TLS symbol, depending on TLS mechanism and TLS access model.
2123
2124    Global Dynamic - Traditional TLS:
2125    adrp tmp, :tlsgd:imm
2126    add  dest, tmp, #:tlsgd_lo12:imm
2127    bl   __tls_get_addr
2128
2129    Global Dynamic - TLS Descriptors:
2130    adrp dest, :tlsdesc:imm
2131    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
2132    add  dest, dest, #:tlsdesc_lo12:imm
2133    blr  tmp
2134    mrs  tp, tpidr_el0
2135    add  dest, dest, tp
2136
2137    Initial Exec:
2138    mrs  tp, tpidr_el0
2139    adrp tmp, :gottprel:imm
2140    ldr  dest, [tmp, #:gottprel_lo12:imm]
2141    add  dest, dest, tp
2142
2143    Local Exec:
2144    mrs  tp, tpidr_el0
2145    add  t0, tp, #:tprel_hi12:imm, lsl #12
2146    add  t0, t0, #:tprel_lo12_nc:imm
2147 */
2148
2149 static void
2150 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2151                                    enum aarch64_symbol_type type)
2152 {
2153   switch (type)
2154     {
2155     case SYMBOL_SMALL_ABSOLUTE:
2156       {
2157         /* In ILP32, the mode of dest can be either SImode or DImode.  */
2158         rtx tmp_reg = dest;
2159         machine_mode mode = GET_MODE (dest);
2160
2161         gcc_assert (mode == Pmode || mode == ptr_mode);
2162
2163         if (can_create_pseudo_p ())
2164           tmp_reg = gen_reg_rtx (mode);
2165
2166         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2167         emit_insn (gen_add_losym (dest, tmp_reg, imm));
2168         return;
2169       }
2170
2171     case SYMBOL_TINY_ABSOLUTE:
2172       emit_insn (gen_rtx_SET (dest, imm));
2173       return;
2174
2175     case SYMBOL_SMALL_GOT_28K:
2176       {
2177         machine_mode mode = GET_MODE (dest);
2178         rtx gp_rtx = pic_offset_table_rtx;
2179         rtx insn;
2180         rtx mem;
2181
2182         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2183            here before rtl expand.  Tree IVOPT will generate rtl pattern to
2184            decide rtx costs, in which case pic_offset_table_rtx is not
2185            initialized.  For that case no need to generate the first adrp
2186            instruction as the final cost for global variable access is
2187            one instruction.  */
2188         if (gp_rtx != NULL)
2189           {
2190             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2191                using the page base as GOT base, the first page may be wasted,
2192                in the worst scenario, there is only 28K space for GOT).
2193
2194                The generate instruction sequence for accessing global variable
2195                is:
2196
2197                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2198
2199                Only one instruction needed. But we must initialize
2200                pic_offset_table_rtx properly.  We generate initialize insn for
2201                every global access, and allow CSE to remove all redundant.
2202
2203                The final instruction sequences will look like the following
2204                for multiply global variables access.
2205
2206                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2207
2208                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2209                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2210                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2211                  ...  */
2212
2213             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2214             crtl->uses_pic_offset_table = 1;
2215             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2216
2217             if (mode != GET_MODE (gp_rtx))
2218              gp_rtx = gen_lowpart (mode, gp_rtx);
2219
2220           }
2221
2222         if (mode == ptr_mode)
2223           {
2224             if (mode == DImode)
2225               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2226             else
2227               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2228
2229             mem = XVECEXP (SET_SRC (insn), 0, 0);
2230           }
2231         else
2232           {
2233             gcc_assert (mode == Pmode);
2234
2235             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2236             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2237           }
2238
2239         /* The operand is expected to be MEM.  Whenever the related insn
2240            pattern changed, above code which calculate mem should be
2241            updated.  */
2242         gcc_assert (GET_CODE (mem) == MEM);
2243         MEM_READONLY_P (mem) = 1;
2244         MEM_NOTRAP_P (mem) = 1;
2245         emit_insn (insn);
2246         return;
2247       }
2248
2249     case SYMBOL_SMALL_GOT_4G:
2250       {
2251         /* In ILP32, the mode of dest can be either SImode or DImode,
2252            while the got entry is always of SImode size.  The mode of
2253            dest depends on how dest is used: if dest is assigned to a
2254            pointer (e.g. in the memory), it has SImode; it may have
2255            DImode if dest is dereferenced to access the memeory.
2256            This is why we have to handle three different ldr_got_small
2257            patterns here (two patterns for ILP32).  */
2258
2259         rtx insn;
2260         rtx mem;
2261         rtx tmp_reg = dest;
2262         machine_mode mode = GET_MODE (dest);
2263
2264         if (can_create_pseudo_p ())
2265           tmp_reg = gen_reg_rtx (mode);
2266
2267         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2268         if (mode == ptr_mode)
2269           {
2270             if (mode == DImode)
2271               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2272             else
2273               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2274
2275             mem = XVECEXP (SET_SRC (insn), 0, 0);
2276           }
2277         else
2278           {
2279             gcc_assert (mode == Pmode);
2280
2281             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2282             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2283           }
2284
2285         gcc_assert (GET_CODE (mem) == MEM);
2286         MEM_READONLY_P (mem) = 1;
2287         MEM_NOTRAP_P (mem) = 1;
2288         emit_insn (insn);
2289         return;
2290       }
2291
2292     case SYMBOL_SMALL_TLSGD:
2293       {
2294         rtx_insn *insns;
2295         machine_mode mode = GET_MODE (dest);
2296         rtx result = gen_rtx_REG (mode, R0_REGNUM);
2297
2298         start_sequence ();
2299         if (TARGET_ILP32)
2300           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2301         else
2302           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2303         insns = get_insns ();
2304         end_sequence ();
2305
2306         RTL_CONST_CALL_P (insns) = 1;
2307         emit_libcall_block (insns, dest, result, imm);
2308         return;
2309       }
2310
2311     case SYMBOL_SMALL_TLSDESC:
2312       {
2313         machine_mode mode = GET_MODE (dest);
2314         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2315         rtx tp;
2316
2317         gcc_assert (mode == Pmode || mode == ptr_mode);
2318
2319         /* In ILP32, the got entry is always of SImode size.  Unlike
2320            small GOT, the dest is fixed at reg 0.  */
2321         if (TARGET_ILP32)
2322           emit_insn (gen_tlsdesc_small_si (imm));
2323         else
2324           emit_insn (gen_tlsdesc_small_di (imm));
2325         tp = aarch64_load_tp (NULL);
2326
2327         if (mode != Pmode)
2328           tp = gen_lowpart (mode, tp);
2329
2330         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2331         if (REG_P (dest))
2332           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2333         return;
2334       }
2335
2336     case SYMBOL_SMALL_TLSIE:
2337       {
2338         /* In ILP32, the mode of dest can be either SImode or DImode,
2339            while the got entry is always of SImode size.  The mode of
2340            dest depends on how dest is used: if dest is assigned to a
2341            pointer (e.g. in the memory), it has SImode; it may have
2342            DImode if dest is dereferenced to access the memeory.
2343            This is why we have to handle three different tlsie_small
2344            patterns here (two patterns for ILP32).  */
2345         machine_mode mode = GET_MODE (dest);
2346         rtx tmp_reg = gen_reg_rtx (mode);
2347         rtx tp = aarch64_load_tp (NULL);
2348
2349         if (mode == ptr_mode)
2350           {
2351             if (mode == DImode)
2352               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2353             else
2354               {
2355                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2356                 tp = gen_lowpart (mode, tp);
2357               }
2358           }
2359         else
2360           {
2361             gcc_assert (mode == Pmode);
2362             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2363           }
2364
2365         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2366         if (REG_P (dest))
2367           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2368         return;
2369       }
2370
2371     case SYMBOL_TLSLE12:
2372     case SYMBOL_TLSLE24:
2373     case SYMBOL_TLSLE32:
2374     case SYMBOL_TLSLE48:
2375       {
2376         machine_mode mode = GET_MODE (dest);
2377         rtx tp = aarch64_load_tp (NULL);
2378
2379         if (mode != Pmode)
2380           tp = gen_lowpart (mode, tp);
2381
2382         switch (type)
2383           {
2384           case SYMBOL_TLSLE12:
2385             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2386                         (dest, tp, imm));
2387             break;
2388           case SYMBOL_TLSLE24:
2389             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2390                         (dest, tp, imm));
2391           break;
2392           case SYMBOL_TLSLE32:
2393             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2394                         (dest, imm));
2395             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2396                         (dest, dest, tp));
2397           break;
2398           case SYMBOL_TLSLE48:
2399             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2400                         (dest, imm));
2401             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2402                         (dest, dest, tp));
2403             break;
2404           default:
2405             gcc_unreachable ();
2406           }
2407
2408         if (REG_P (dest))
2409           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2410         return;
2411       }
2412
2413     case SYMBOL_TINY_GOT:
2414       emit_insn (gen_ldr_got_tiny (dest, imm));
2415       return;
2416
2417     case SYMBOL_TINY_TLSIE:
2418       {
2419         machine_mode mode = GET_MODE (dest);
2420         rtx tp = aarch64_load_tp (NULL);
2421
2422         if (mode == ptr_mode)
2423           {
2424             if (mode == DImode)
2425               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2426             else
2427               {
2428                 tp = gen_lowpart (mode, tp);
2429                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2430               }
2431           }
2432         else
2433           {
2434             gcc_assert (mode == Pmode);
2435             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2436           }
2437
2438         if (REG_P (dest))
2439           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2440         return;
2441       }
2442
2443     default:
2444       gcc_unreachable ();
2445     }
2446 }
2447
2448 /* Emit a move from SRC to DEST.  Assume that the move expanders can
2449    handle all moves if !can_create_pseudo_p ().  The distinction is
2450    important because, unlike emit_move_insn, the move expanders know
2451    how to force Pmode objects into the constant pool even when the
2452    constant pool address is not itself legitimate.  */
2453 static rtx
2454 aarch64_emit_move (rtx dest, rtx src)
2455 {
2456   return (can_create_pseudo_p ()
2457           ? emit_move_insn (dest, src)
2458           : emit_move_insn_1 (dest, src));
2459 }
2460
2461 /* Apply UNOPTAB to OP and store the result in DEST.  */
2462
2463 static void
2464 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2465 {
2466   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2467   if (dest != tmp)
2468     emit_move_insn (dest, tmp);
2469 }
2470
2471 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
2472
2473 static void
2474 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2475 {
2476   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2477                           OPTAB_DIRECT);
2478   if (dest != tmp)
2479     emit_move_insn (dest, tmp);
2480 }
2481
2482 /* Split a 128-bit move operation into two 64-bit move operations,
2483    taking care to handle partial overlap of register to register
2484    copies.  Special cases are needed when moving between GP regs and
2485    FP regs.  SRC can be a register, constant or memory; DST a register
2486    or memory.  If either operand is memory it must not have any side
2487    effects.  */
2488 void
2489 aarch64_split_128bit_move (rtx dst, rtx src)
2490 {
2491   rtx dst_lo, dst_hi;
2492   rtx src_lo, src_hi;
2493
2494   machine_mode mode = GET_MODE (dst);
2495
2496   gcc_assert (mode == TImode || mode == TFmode);
2497   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2498   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2499
2500   if (REG_P (dst) && REG_P (src))
2501     {
2502       int src_regno = REGNO (src);
2503       int dst_regno = REGNO (dst);
2504
2505       /* Handle FP <-> GP regs.  */
2506       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2507         {
2508           src_lo = gen_lowpart (word_mode, src);
2509           src_hi = gen_highpart (word_mode, src);
2510
2511           emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2512           emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2513           return;
2514         }
2515       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2516         {
2517           dst_lo = gen_lowpart (word_mode, dst);
2518           dst_hi = gen_highpart (word_mode, dst);
2519
2520           emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2521           emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2522           return;
2523         }
2524     }
2525
2526   dst_lo = gen_lowpart (word_mode, dst);
2527   dst_hi = gen_highpart (word_mode, dst);
2528   src_lo = gen_lowpart (word_mode, src);
2529   src_hi = gen_highpart_mode (word_mode, mode, src);
2530
2531   /* At most one pairing may overlap.  */
2532   if (reg_overlap_mentioned_p (dst_lo, src_hi))
2533     {
2534       aarch64_emit_move (dst_hi, src_hi);
2535       aarch64_emit_move (dst_lo, src_lo);
2536     }
2537   else
2538     {
2539       aarch64_emit_move (dst_lo, src_lo);
2540       aarch64_emit_move (dst_hi, src_hi);
2541     }
2542 }
2543
2544 bool
2545 aarch64_split_128bit_move_p (rtx dst, rtx src)
2546 {
2547   return (! REG_P (src)
2548           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2549 }
2550
2551 /* Split a complex SIMD combine.  */
2552
2553 void
2554 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2555 {
2556   machine_mode src_mode = GET_MODE (src1);
2557   machine_mode dst_mode = GET_MODE (dst);
2558
2559   gcc_assert (VECTOR_MODE_P (dst_mode));
2560   gcc_assert (register_operand (dst, dst_mode)
2561               && register_operand (src1, src_mode)
2562               && register_operand (src2, src_mode));
2563
2564   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2565   return;
2566 }
2567
2568 /* Split a complex SIMD move.  */
2569
2570 void
2571 aarch64_split_simd_move (rtx dst, rtx src)
2572 {
2573   machine_mode src_mode = GET_MODE (src);
2574   machine_mode dst_mode = GET_MODE (dst);
2575
2576   gcc_assert (VECTOR_MODE_P (dst_mode));
2577
2578   if (REG_P (dst) && REG_P (src))
2579     {
2580       gcc_assert (VECTOR_MODE_P (src_mode));
2581       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2582     }
2583 }
2584
2585 bool
2586 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2587                               machine_mode ymode, rtx y)
2588 {
2589   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2590   gcc_assert (r != NULL);
2591   return rtx_equal_p (x, r);
2592 }
2593
2594
2595 /* Return TARGET if it is nonnull and a register of mode MODE.
2596    Otherwise, return a fresh register of mode MODE if we can,
2597    or TARGET reinterpreted as MODE if we can't.  */
2598
2599 static rtx
2600 aarch64_target_reg (rtx target, machine_mode mode)
2601 {
2602   if (target && REG_P (target) && GET_MODE (target) == mode)
2603     return target;
2604   if (!can_create_pseudo_p ())
2605     {
2606       gcc_assert (target);
2607       return gen_lowpart (mode, target);
2608     }
2609   return gen_reg_rtx (mode);
2610 }
2611
2612 /* Return a register that contains the constant in BUILDER, given that
2613    the constant is a legitimate move operand.  Use TARGET as the register
2614    if it is nonnull and convenient.  */
2615
2616 static rtx
2617 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2618 {
2619   rtx src = builder.build ();
2620   target = aarch64_target_reg (target, GET_MODE (src));
2621   emit_insn (gen_rtx_SET (target, src));
2622   return target;
2623 }
2624
2625 static rtx
2626 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2627 {
2628   if (can_create_pseudo_p ())
2629     return force_reg (mode, value);
2630   else
2631     {
2632       gcc_assert (x);
2633       aarch64_emit_move (x, value);
2634       return x;
2635     }
2636 }
2637
2638 /* Return true if predicate value X is a constant in which every element
2639    is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
2640    value, i.e. as a predicate in which all bits are significant.  */
2641
2642 static bool
2643 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2644 {
2645   if (GET_CODE (x) != CONST_VECTOR)
2646     return false;
2647
2648   unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2649                                              GET_MODE_NUNITS (GET_MODE (x)));
2650   unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2651   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2652   builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2653
2654   unsigned int nelts = const_vector_encoded_nelts (x);
2655   for (unsigned int i = 0; i < nelts; ++i)
2656     {
2657       rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2658       if (!CONST_INT_P (elt))
2659         return false;
2660
2661       builder.quick_push (elt);
2662       for (unsigned int j = 1; j < factor; ++j)
2663         builder.quick_push (const0_rtx);
2664     }
2665   builder.finalize ();
2666   return true;
2667 }
2668
2669 /* BUILDER contains a predicate constant of mode VNx16BI.  Return the
2670    widest predicate element size it can have (that is, the largest size
2671    for which each element would still be 0 or 1).  */
2672
2673 unsigned int
2674 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2675 {
2676   /* Start with the most optimistic assumption: that we only need
2677      one bit per pattern.  This is what we will use if only the first
2678      bit in each pattern is ever set.  */
2679   unsigned int mask = GET_MODE_SIZE (DImode);
2680   mask |= builder.npatterns ();
2681
2682   /* Look for set bits.  */
2683   unsigned int nelts = builder.encoded_nelts ();
2684   for (unsigned int i = 1; i < nelts; ++i)
2685     if (INTVAL (builder.elt (i)) != 0)
2686       {
2687         if (i & 1)
2688           return 1;
2689         mask |= i;
2690       }
2691   return mask & -mask;
2692 }
2693
2694 /* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
2695    that the constant would have with predicate element size ELT_SIZE
2696    (ignoring the upper bits in each element) and return:
2697
2698    * -1 if all bits are set
2699    * N if the predicate has N leading set bits followed by all clear bits
2700    * 0 if the predicate does not have any of these forms.  */
2701
2702 int
2703 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2704                               unsigned int elt_size)
2705 {
2706   /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2707      followed by set bits.  */
2708   if (builder.nelts_per_pattern () == 3)
2709     return 0;
2710
2711   /* Skip over leading set bits.  */
2712   unsigned int nelts = builder.encoded_nelts ();
2713   unsigned int i = 0;
2714   for (; i < nelts; i += elt_size)
2715     if (INTVAL (builder.elt (i)) == 0)
2716       break;
2717   unsigned int vl = i / elt_size;
2718
2719   /* Check for the all-true case.  */
2720   if (i == nelts)
2721     return -1;
2722
2723   /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2724      repeating pattern of set bits followed by clear bits.  */
2725   if (builder.nelts_per_pattern () != 2)
2726     return 0;
2727
2728   /* We have a "foreground" value and a duplicated "background" value.
2729      If the background might repeat and the last set bit belongs to it,
2730      we might have set bits followed by clear bits followed by set bits.  */
2731   if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2732     return 0;
2733
2734   /* Make sure that the rest are all clear.  */
2735   for (; i < nelts; i += elt_size)
2736     if (INTVAL (builder.elt (i)) != 0)
2737       return 0;
2738
2739   return vl;
2740 }
2741
2742 /* See if there is an svpattern that encodes an SVE predicate of mode
2743    PRED_MODE in which the first VL bits are set and the rest are clear.
2744    Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2745    A VL of -1 indicates an all-true vector.  */
2746
2747 aarch64_svpattern
2748 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2749 {
2750   if (vl < 0)
2751     return AARCH64_SV_ALL;
2752
2753   if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2754     return AARCH64_NUM_SVPATTERNS;
2755
2756   if (vl >= 1 && vl <= 8)
2757     return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2758
2759   if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2760     return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2761
2762   int max_vl;
2763   if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2764     {
2765       if (vl == (max_vl / 3) * 3)
2766         return AARCH64_SV_MUL3;
2767       /* These would only trigger for non-power-of-2 lengths.  */
2768       if (vl == (max_vl & -4))
2769         return AARCH64_SV_MUL4;
2770       if (vl == (1 << floor_log2 (max_vl)))
2771         return AARCH64_SV_POW2;
2772       if (vl == max_vl)
2773         return AARCH64_SV_ALL;
2774     }
2775   return AARCH64_NUM_SVPATTERNS;
2776 }
2777
2778 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2779    bits has the lowest bit set and the upper bits clear.  This is the
2780    VNx16BImode equivalent of a PTRUE for controlling elements of
2781    ELT_SIZE bytes.  However, because the constant is VNx16BImode,
2782    all bits are significant, even the upper zeros.  */
2783
2784 rtx
2785 aarch64_ptrue_all (unsigned int elt_size)
2786 {
2787   rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2788   builder.quick_push (const1_rtx);
2789   for (unsigned int i = 1; i < elt_size; ++i)
2790     builder.quick_push (const0_rtx);
2791   return builder.build ();
2792 }
2793
2794 /* Return an all-true predicate register of mode MODE.  */
2795
2796 rtx
2797 aarch64_ptrue_reg (machine_mode mode)
2798 {
2799   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2800   rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2801   return gen_lowpart (mode, reg);
2802 }
2803
2804 /* Return an all-false predicate register of mode MODE.  */
2805
2806 rtx
2807 aarch64_pfalse_reg (machine_mode mode)
2808 {
2809   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2810   rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2811   return gen_lowpart (mode, reg);
2812 }
2813
2814 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2815    true, or alternatively if we know that the operation predicated by
2816    PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is a
2817    aarch64_sve_gp_strictness operand that describes the operation
2818    predicated by PRED1[0].  */
2819
2820 bool
2821 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2822 {
2823   machine_mode mode = GET_MODE (pred2);
2824   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2825               && mode == GET_MODE (pred1[0])
2826               && aarch64_sve_gp_strictness (pred1[1], SImode));
2827   return (pred1[0] == CONSTM1_RTX (mode)
2828           || INTVAL (pred1[1]) == SVE_RELAXED_GP
2829           || rtx_equal_p (pred1[0], pred2));
2830 }
2831
2832 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2833    for it.  PRED2[0] is the predicate for the instruction whose result
2834    is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2835    for it.  Return true if we can prove that the two predicates are
2836    equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2837    with PRED1[0] without changing behavior.  */
2838
2839 bool
2840 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2841 {
2842   machine_mode mode = GET_MODE (pred1[0]);
2843   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2844               && mode == GET_MODE (pred2[0])
2845               && aarch64_sve_ptrue_flag (pred1[1], SImode)
2846               && aarch64_sve_ptrue_flag (pred2[1], SImode));
2847
2848   bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2849                    || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2850   bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2851                    || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2852   return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2853 }
2854
2855 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2856    DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2857    Use TARGET as the target register if nonnull and convenient.  */
2858
2859 static rtx
2860 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2861                           machine_mode data_mode, rtx op1, rtx op2)
2862 {
2863   insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2864   expand_operand ops[5];
2865   create_output_operand (&ops[0], target, pred_mode);
2866   create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2867   create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2868   create_input_operand (&ops[3], op1, data_mode);
2869   create_input_operand (&ops[4], op2, data_mode);
2870   expand_insn (icode, 5, ops);
2871   return ops[0].value;
2872 }
2873
2874 /* Use a comparison to convert integer vector SRC into MODE, which is
2875    the corresponding SVE predicate mode.  Use TARGET for the result
2876    if it's nonnull and convenient.  */
2877
2878 static rtx
2879 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2880 {
2881   machine_mode src_mode = GET_MODE (src);
2882   return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2883                                    src, CONST0_RTX (src_mode));
2884 }
2885
2886 /* Return true if we can move VALUE into a register using a single
2887    CNT[BHWD] instruction.  */
2888
2889 static bool
2890 aarch64_sve_cnt_immediate_p (poly_int64 value)
2891 {
2892   HOST_WIDE_INT factor = value.coeffs[0];
2893   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2894   return (value.coeffs[1] == factor
2895           && IN_RANGE (factor, 2, 16 * 16)
2896           && (factor & 1) == 0
2897           && factor <= 16 * (factor & -factor));
2898 }
2899
2900 /* Likewise for rtx X.  */
2901
2902 bool
2903 aarch64_sve_cnt_immediate_p (rtx x)
2904 {
2905   poly_int64 value;
2906   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2907 }
2908
2909 /* Return the asm string for an instruction with a CNT-like vector size
2910    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2911    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2912    first part of the operands template (the part that comes before the
2913    vector size itself).  PATTERN is the pattern to use.  FACTOR is the
2914    number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
2915    in each quadword.  If it is zero, we can use any element size.  */
2916
2917 static char *
2918 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2919                                   aarch64_svpattern pattern,
2920                                   unsigned int factor,
2921                                   unsigned int nelts_per_vq)
2922 {
2923   static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2924
2925   if (nelts_per_vq == 0)
2926     /* There is some overlap in the ranges of the four CNT instructions.
2927        Here we always use the smallest possible element size, so that the
2928        multiplier is 1 whereever possible.  */
2929     nelts_per_vq = factor & -factor;
2930   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2931   gcc_assert (IN_RANGE (shift, 1, 4));
2932   char suffix = "dwhb"[shift - 1];
2933
2934   factor >>= shift;
2935   unsigned int written;
2936   if (pattern == AARCH64_SV_ALL && factor == 1)
2937     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2938                         prefix, suffix, operands);
2939   else if (factor == 1)
2940     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2941                         prefix, suffix, operands, svpattern_token (pattern));
2942   else
2943     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2944                         prefix, suffix, operands, svpattern_token (pattern),
2945                         factor);
2946   gcc_assert (written < sizeof (buffer));
2947   return buffer;
2948 }
2949
2950 /* Return the asm string for an instruction with a CNT-like vector size
2951    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2952    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2953    first part of the operands template (the part that comes before the
2954    vector size itself).  X is the value of the vector size operand,
2955    as a polynomial integer rtx; we need to convert this into an "all"
2956    pattern with a multiplier.  */
2957
2958 char *
2959 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2960                                   rtx x)
2961 {
2962   poly_int64 value = rtx_to_poly_int64 (x);
2963   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2964   return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2965                                            value.coeffs[1], 0);
2966 }
2967
2968 /* Return true if we can add X using a single SVE INC or DEC instruction.  */
2969
2970 bool
2971 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2972 {
2973   poly_int64 value;
2974   return (poly_int_rtx_p (x, &value)
2975           && (aarch64_sve_cnt_immediate_p (value)
2976               || aarch64_sve_cnt_immediate_p (-value)));
2977 }
2978
2979 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
2980    operand 0.  */
2981
2982 char *
2983 aarch64_output_sve_scalar_inc_dec (rtx offset)
2984 {
2985   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2986   gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
2987   if (offset_value.coeffs[1] > 0)
2988     return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
2989                                              offset_value.coeffs[1], 0);
2990   else
2991     return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
2992                                              -offset_value.coeffs[1], 0);
2993 }
2994
2995 /* Return true if we can add VALUE to a register using a single ADDVL
2996    or ADDPL instruction.  */
2997
2998 static bool
2999 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3000 {
3001   HOST_WIDE_INT factor = value.coeffs[0];
3002   if (factor == 0 || value.coeffs[1] != factor)
3003     return false;
3004   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3005      and a value of 16 is one vector width.  */
3006   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3007           || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3008 }
3009
3010 /* Likewise for rtx X.  */
3011
3012 bool
3013 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3014 {
3015   poly_int64 value;
3016   return (poly_int_rtx_p (x, &value)
3017           && aarch64_sve_addvl_addpl_immediate_p (value));
3018 }
3019
3020 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3021    to operand 1 and storing the result in operand 0.  */
3022
3023 char *
3024 aarch64_output_sve_addvl_addpl (rtx offset)
3025 {
3026   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3027   poly_int64 offset_value = rtx_to_poly_int64 (offset);
3028   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3029
3030   int factor = offset_value.coeffs[1];
3031   if ((factor & 15) == 0)
3032     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3033   else
3034     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3035   return buffer;
3036 }
3037
3038 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3039    instruction.  If it is, store the number of elements in each vector
3040    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3041    factor in *FACTOR_OUT (if nonnull).  */
3042
3043 bool
3044 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3045                                         unsigned int *nelts_per_vq_out)
3046 {
3047   rtx elt;
3048   poly_int64 value;
3049
3050   if (!const_vec_duplicate_p (x, &elt)
3051       || !poly_int_rtx_p (elt, &value))
3052     return false;
3053
3054   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3055   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3056     /* There's no vector INCB.  */
3057     return false;
3058
3059   HOST_WIDE_INT factor = value.coeffs[0];
3060   if (value.coeffs[1] != factor)
3061     return false;
3062
3063   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
3064   if ((factor % nelts_per_vq) != 0
3065       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3066     return false;
3067
3068   if (factor_out)
3069     *factor_out = factor;
3070   if (nelts_per_vq_out)
3071     *nelts_per_vq_out = nelts_per_vq;
3072   return true;
3073 }
3074
3075 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3076    instruction.  */
3077
3078 bool
3079 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3080 {
3081   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3082 }
3083
3084 /* Return the asm template for an SVE vector INC or DEC instruction.
3085    OPERANDS gives the operands before the vector count and X is the
3086    value of the vector count operand itself.  */
3087
3088 char *
3089 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3090 {
3091   int factor;
3092   unsigned int nelts_per_vq;
3093   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3094     gcc_unreachable ();
3095   if (factor < 0)
3096     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3097                                              -factor, nelts_per_vq);
3098   else
3099     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3100                                              factor, nelts_per_vq);
3101 }
3102
3103 static int
3104 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3105                                 scalar_int_mode mode)
3106 {
3107   int i;
3108   unsigned HOST_WIDE_INT val, val2, mask;
3109   int one_match, zero_match;
3110   int num_insns;
3111
3112   val = INTVAL (imm);
3113
3114   if (aarch64_move_imm (val, mode))
3115     {
3116       if (generate)
3117         emit_insn (gen_rtx_SET (dest, imm));
3118       return 1;
3119     }
3120
3121   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3122      (with XXXX non-zero). In that case check to see if the move can be done in
3123      a smaller mode.  */
3124   val2 = val & 0xffffffff;
3125   if (mode == DImode
3126       && aarch64_move_imm (val2, SImode)
3127       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3128     {
3129       if (generate)
3130         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3131
3132       /* Check if we have to emit a second instruction by checking to see
3133          if any of the upper 32 bits of the original DI mode value is set.  */
3134       if (val == val2)
3135         return 1;
3136
3137       i = (val >> 48) ? 48 : 32;
3138
3139       if (generate)
3140          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3141                                     GEN_INT ((val >> i) & 0xffff)));
3142
3143       return 2;
3144     }
3145
3146   if ((val >> 32) == 0 || mode == SImode)
3147     {
3148       if (generate)
3149         {
3150           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3151           if (mode == SImode)
3152             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3153                                        GEN_INT ((val >> 16) & 0xffff)));
3154           else
3155             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3156                                        GEN_INT ((val >> 16) & 0xffff)));
3157         }
3158       return 2;
3159     }
3160
3161   /* Remaining cases are all for DImode.  */
3162
3163   mask = 0xffff;
3164   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3165     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3166   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3167     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3168
3169   if (zero_match != 2 && one_match != 2)
3170     {
3171       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3172          For a 64-bit bitmask try whether changing 16 bits to all ones or
3173          zeroes creates a valid bitmask.  To check any repeated bitmask,
3174          try using 16 bits from the other 32-bit half of val.  */
3175
3176       for (i = 0; i < 64; i += 16, mask <<= 16)
3177         {
3178           val2 = val & ~mask;
3179           if (val2 != val && aarch64_bitmask_imm (val2, mode))
3180             break;
3181           val2 = val | mask;
3182           if (val2 != val && aarch64_bitmask_imm (val2, mode))
3183             break;
3184           val2 = val2 & ~mask;
3185           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3186           if (val2 != val && aarch64_bitmask_imm (val2, mode))
3187             break;
3188         }
3189       if (i != 64)
3190         {
3191           if (generate)
3192             {
3193               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3194               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3195                                          GEN_INT ((val >> i) & 0xffff)));
3196             }
3197           return 2;
3198         }
3199     }
3200
3201   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3202      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
3203      otherwise skip zero bits.  */
3204
3205   num_insns = 1;
3206   mask = 0xffff;
3207   val2 = one_match > zero_match ? ~val : val;
3208   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3209
3210   if (generate)
3211     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3212                                            ? (val | ~(mask << i))
3213                                            : (val & (mask << i)))));
3214   for (i += 16; i < 64; i += 16)
3215     {
3216       if ((val2 & (mask << i)) == 0)
3217         continue;
3218       if (generate)
3219         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3220                                    GEN_INT ((val >> i) & 0xffff)));
3221       num_insns ++;
3222     }
3223
3224   return num_insns;
3225 }
3226
3227 /* Return whether imm is a 128-bit immediate which is simple enough to
3228    expand inline.  */
3229 bool
3230 aarch64_mov128_immediate (rtx imm)
3231 {
3232   if (GET_CODE (imm) == CONST_INT)
3233     return true;
3234
3235   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3236
3237   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3238   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3239
3240   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3241          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3242 }
3243
3244
3245 /* Return the number of temporary registers that aarch64_add_offset_1
3246    would need to add OFFSET to a register.  */
3247
3248 static unsigned int
3249 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3250 {
3251   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3252 }
3253
3254 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
3255    a non-polynomial OFFSET.  MODE is the mode of the addition.
3256    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3257    be set and CFA adjustments added to the generated instructions.
3258
3259    TEMP1, if nonnull, is a register of mode MODE that can be used as a
3260    temporary if register allocation is already complete.  This temporary
3261    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
3262    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3263    the immediate again.
3264
3265    Since this function may be used to adjust the stack pointer, we must
3266    ensure that it cannot cause transient stack deallocation (for example
3267    by first incrementing SP and then decrementing when adjusting by a
3268    large immediate).  */
3269
3270 static void
3271 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3272                       rtx src, HOST_WIDE_INT offset, rtx temp1,
3273                       bool frame_related_p, bool emit_move_imm)
3274 {
3275   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3276   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3277
3278   HOST_WIDE_INT moffset = abs_hwi (offset);
3279   rtx_insn *insn;
3280
3281   if (!moffset)
3282     {
3283       if (!rtx_equal_p (dest, src))
3284         {
3285           insn = emit_insn (gen_rtx_SET (dest, src));
3286           RTX_FRAME_RELATED_P (insn) = frame_related_p;
3287         }
3288       return;
3289     }
3290
3291   /* Single instruction adjustment.  */
3292   if (aarch64_uimm12_shift (moffset))
3293     {
3294       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3295       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3296       return;
3297     }
3298
3299   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3300      and either:
3301
3302      a) the offset cannot be loaded by a 16-bit move or
3303      b) there is no spare register into which we can move it.  */
3304   if (moffset < 0x1000000
3305       && ((!temp1 && !can_create_pseudo_p ())
3306           || !aarch64_move_imm (moffset, mode)))
3307     {
3308       HOST_WIDE_INT low_off = moffset & 0xfff;
3309
3310       low_off = offset < 0 ? -low_off : low_off;
3311       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3312       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3313       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3314       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3315       return;
3316     }
3317
3318   /* Emit a move immediate if required and an addition/subtraction.  */
3319   if (emit_move_imm)
3320     {
3321       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3322       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3323     }
3324   insn = emit_insn (offset < 0
3325                     ? gen_sub3_insn (dest, src, temp1)
3326                     : gen_add3_insn (dest, src, temp1));
3327   if (frame_related_p)
3328     {
3329       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3330       rtx adj = plus_constant (mode, src, offset);
3331       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3332     }
3333 }
3334
3335 /* Return the number of temporary registers that aarch64_add_offset
3336    would need to move OFFSET into a register or add OFFSET to a register;
3337    ADD_P is true if we want the latter rather than the former.  */
3338
3339 static unsigned int
3340 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3341 {
3342   /* This follows the same structure as aarch64_add_offset.  */
3343   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3344     return 0;
3345
3346   unsigned int count = 0;
3347   HOST_WIDE_INT factor = offset.coeffs[1];
3348   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3349   poly_int64 poly_offset (factor, factor);
3350   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3351     /* Need one register for the ADDVL/ADDPL result.  */
3352     count += 1;
3353   else if (factor != 0)
3354     {
3355       factor = abs (factor);
3356       if (factor > 16 * (factor & -factor))
3357         /* Need one register for the CNT result and one for the multiplication
3358            factor.  If necessary, the second temporary can be reused for the
3359            constant part of the offset.  */
3360         return 2;
3361       /* Need one register for the CNT result (which might then
3362          be shifted).  */
3363       count += 1;
3364     }
3365   return count + aarch64_add_offset_1_temporaries (constant);
3366 }
3367
3368 /* If X can be represented as a poly_int64, return the number
3369    of temporaries that are required to add it to a register.
3370    Return -1 otherwise.  */
3371
3372 int
3373 aarch64_add_offset_temporaries (rtx x)
3374 {
3375   poly_int64 offset;
3376   if (!poly_int_rtx_p (x, &offset))
3377     return -1;
3378   return aarch64_offset_temporaries (true, offset);
3379 }
3380
3381 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
3382    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3383    be set and CFA adjustments added to the generated instructions.
3384
3385    TEMP1, if nonnull, is a register of mode MODE that can be used as a
3386    temporary if register allocation is already complete.  This temporary
3387    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3388    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3389    false to avoid emitting the immediate again.
3390
3391    TEMP2, if nonnull, is a second temporary register that doesn't
3392    overlap either DEST or REG.
3393
3394    Since this function may be used to adjust the stack pointer, we must
3395    ensure that it cannot cause transient stack deallocation (for example
3396    by first incrementing SP and then decrementing when adjusting by a
3397    large immediate).  */
3398
3399 static void
3400 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3401                     poly_int64 offset, rtx temp1, rtx temp2,
3402                     bool frame_related_p, bool emit_move_imm = true)
3403 {
3404   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3405   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3406   gcc_assert (temp1 == NULL_RTX
3407               || !frame_related_p
3408               || !reg_overlap_mentioned_p (temp1, dest));
3409   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3410
3411   /* Try using ADDVL or ADDPL to add the whole value.  */
3412   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3413     {
3414       rtx offset_rtx = gen_int_mode (offset, mode);
3415       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3416       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3417       return;
3418     }
3419
3420   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3421      SVE vector register, over and above the minimum size of 128 bits.
3422      This is equivalent to half the value returned by CNTD with a
3423      vector shape of ALL.  */
3424   HOST_WIDE_INT factor = offset.coeffs[1];
3425   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3426
3427   /* Try using ADDVL or ADDPL to add the VG-based part.  */
3428   poly_int64 poly_offset (factor, factor);
3429   if (src != const0_rtx
3430       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3431     {
3432       rtx offset_rtx = gen_int_mode (poly_offset, mode);
3433       if (frame_related_p)
3434         {
3435           rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3436           RTX_FRAME_RELATED_P (insn) = true;
3437           src = dest;
3438         }
3439       else
3440         {
3441           rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3442           src = aarch64_force_temporary (mode, temp1, addr);
3443           temp1 = temp2;
3444           temp2 = NULL_RTX;
3445         }
3446     }
3447   /* Otherwise use a CNT-based sequence.  */
3448   else if (factor != 0)
3449     {
3450       /* Use a subtraction if we have a negative factor.  */
3451       rtx_code code = PLUS;
3452       if (factor < 0)
3453         {
3454           factor = -factor;
3455           code = MINUS;
3456         }
3457
3458       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
3459          into the multiplication.  */
3460       rtx val;
3461       int shift = 0;
3462       if (factor & 1)
3463         /* Use a right shift by 1.  */
3464         shift = -1;
3465       else
3466         factor /= 2;
3467       HOST_WIDE_INT low_bit = factor & -factor;
3468       if (factor <= 16 * low_bit)
3469         {
3470           if (factor > 16 * 8)
3471             {
3472               /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3473                  the value with the minimum multiplier and shift it into
3474                  position.  */
3475               int extra_shift = exact_log2 (low_bit);
3476               shift += extra_shift;
3477               factor >>= extra_shift;
3478             }
3479           val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3480         }
3481       else
3482         {
3483           /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3484              directly, since that should increase the chances of being
3485              able to use a shift and add sequence.  If LOW_BIT itself
3486              is out of range, just use CNTD.  */
3487           if (low_bit <= 16 * 8)
3488             factor /= low_bit;
3489           else
3490             low_bit = 1;
3491
3492           val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3493           val = aarch64_force_temporary (mode, temp1, val);
3494
3495           if (can_create_pseudo_p ())
3496             {
3497               rtx coeff1 = gen_int_mode (factor, mode);
3498               val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3499             }
3500           else
3501             {
3502               /* Go back to using a negative multiplication factor if we have
3503                  no register from which to subtract.  */
3504               if (code == MINUS && src == const0_rtx)
3505                 {
3506                   factor = -factor;
3507                   code = PLUS;
3508                 }
3509               rtx coeff1 = gen_int_mode (factor, mode);
3510               coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3511               val = gen_rtx_MULT (mode, val, coeff1);
3512             }
3513         }
3514
3515       if (shift > 0)
3516         {
3517           /* Multiply by 1 << SHIFT.  */
3518           val = aarch64_force_temporary (mode, temp1, val);
3519           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3520         }
3521       else if (shift == -1)
3522         {
3523           /* Divide by 2.  */
3524           val = aarch64_force_temporary (mode, temp1, val);
3525           val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3526         }
3527
3528       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
3529       if (src != const0_rtx)
3530         {
3531           val = aarch64_force_temporary (mode, temp1, val);
3532           val = gen_rtx_fmt_ee (code, mode, src, val);
3533         }
3534       else if (code == MINUS)
3535         {
3536           val = aarch64_force_temporary (mode, temp1, val);
3537           val = gen_rtx_NEG (mode, val);
3538         }
3539
3540       if (constant == 0 || frame_related_p)
3541         {
3542           rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3543           if (frame_related_p)
3544             {
3545               RTX_FRAME_RELATED_P (insn) = true;
3546               add_reg_note (insn, REG_CFA_ADJUST_CFA,
3547                             gen_rtx_SET (dest, plus_constant (Pmode, src,
3548                                                               poly_offset)));
3549             }
3550           src = dest;
3551           if (constant == 0)
3552             return;
3553         }
3554       else
3555         {
3556           src = aarch64_force_temporary (mode, temp1, val);
3557           temp1 = temp2;
3558           temp2 = NULL_RTX;
3559         }
3560
3561       emit_move_imm = true;
3562     }
3563
3564   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3565                         frame_related_p, emit_move_imm);
3566 }
3567
3568 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3569    than a poly_int64.  */
3570
3571 void
3572 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3573                           rtx offset_rtx, rtx temp1, rtx temp2)
3574 {
3575   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3576                       temp1, temp2, false);
3577 }
3578
3579 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3580    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
3581    if TEMP1 already contains abs (DELTA).  */
3582
3583 static inline void
3584 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3585 {
3586   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3587                       temp1, temp2, true, emit_move_imm);
3588 }
3589
3590 /* Subtract DELTA from the stack pointer, marking the instructions
3591    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
3592    if nonnull.  */
3593
3594 static inline void
3595 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3596                 bool emit_move_imm = true)
3597 {
3598   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3599                       temp1, temp2, frame_related_p, emit_move_imm);
3600 }
3601
3602 /* Set DEST to (vec_series BASE STEP).  */
3603
3604 static void
3605 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3606 {
3607   machine_mode mode = GET_MODE (dest);
3608   scalar_mode inner = GET_MODE_INNER (mode);
3609
3610   /* Each operand can be a register or an immediate in the range [-16, 15].  */
3611   if (!aarch64_sve_index_immediate_p (base))
3612     base = force_reg (inner, base);
3613   if (!aarch64_sve_index_immediate_p (step))
3614     step = force_reg (inner, step);
3615
3616   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3617 }
3618
3619 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3620    register of mode MODE.  Use TARGET for the result if it's nonnull
3621    and convenient.
3622
3623    The two vector modes must have the same element mode.  The behavior
3624    is to duplicate architectural lane N of SRC into architectural lanes
3625    N + I * STEP of the result.  On big-endian targets, architectural
3626    lane 0 of an Advanced SIMD vector is the last element of the vector
3627    in memory layout, so for big-endian targets this operation has the
3628    effect of reversing SRC before duplicating it.  Callers need to
3629    account for this.  */
3630
3631 rtx
3632 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3633 {
3634   machine_mode src_mode = GET_MODE (src);
3635   gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3636   insn_code icode = (BYTES_BIG_ENDIAN
3637                      ? code_for_aarch64_vec_duplicate_vq_be (mode)
3638                      : code_for_aarch64_vec_duplicate_vq_le (mode));
3639
3640   unsigned int i = 0;
3641   expand_operand ops[3];
3642   create_output_operand (&ops[i++], target, mode);
3643   create_output_operand (&ops[i++], src, src_mode);
3644   if (BYTES_BIG_ENDIAN)
3645     {
3646       /* Create a PARALLEL describing the reversal of SRC.  */
3647       unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3648       rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3649                                                   nelts_per_vq - 1, -1);
3650       create_fixed_operand (&ops[i++], sel);
3651     }
3652   expand_insn (icode, i, ops);
3653   return ops[0].value;
3654 }
3655
3656 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3657    the memory image into DEST.  Return true on success.  */
3658
3659 static bool
3660 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3661 {
3662   src = force_const_mem (GET_MODE (src), src);
3663   if (!src)
3664     return false;
3665
3666   /* Make sure that the address is legitimate.  */
3667   if (!aarch64_sve_ld1rq_operand_p (src))
3668     {
3669       rtx addr = force_reg (Pmode, XEXP (src, 0));
3670       src = replace_equiv_address (src, addr);
3671     }
3672
3673   machine_mode mode = GET_MODE (dest);
3674   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3675   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3676   rtx ptrue = aarch64_ptrue_reg (pred_mode);
3677   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3678   return true;
3679 }
3680
3681 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3682    SVE data mode and isn't a legitimate constant.  Use TARGET for the
3683    result if convenient.
3684
3685    The returned register can have whatever mode seems most natural
3686    given the contents of SRC.  */
3687
3688 static rtx
3689 aarch64_expand_sve_const_vector (rtx target, rtx src)
3690 {
3691   machine_mode mode = GET_MODE (src);
3692   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3693   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3694   scalar_mode elt_mode = GET_MODE_INNER (mode);
3695   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3696   unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3697
3698   if (nelts_per_pattern == 1 && encoded_bits == 128)
3699     {
3700       /* The constant is a duplicated quadword but can't be narrowed
3701          beyond a quadword.  Get the memory image of the first quadword
3702          as a 128-bit vector and try using LD1RQ to load it from memory.
3703
3704          The effect for both endiannesses is to load memory lane N into
3705          architectural lanes N + I * STEP of the result.  On big-endian
3706          targets, the layout of the 128-bit vector in an Advanced SIMD
3707          register would be different from its layout in an SVE register,
3708          but this 128-bit vector is a memory value only.  */
3709       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3710       rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3711       if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3712         return target;
3713     }
3714
3715   if (nelts_per_pattern == 1 && encoded_bits < 128)
3716     {
3717       /* The vector is a repeating sequence of 64 bits or fewer.
3718          See if we can load them using an Advanced SIMD move and then
3719          duplicate it to fill a vector.  This is better than using a GPR
3720          move because it keeps everything in the same register file.  */
3721       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3722       rtx_vector_builder builder (vq_mode, npatterns, 1);
3723       for (unsigned int i = 0; i < npatterns; ++i)
3724         {
3725           /* We want memory lane N to go into architectural lane N,
3726              so reverse for big-endian targets.  The DUP .Q pattern
3727              has a compensating reverse built-in.  */
3728           unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3729           builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3730         }
3731       rtx vq_src = builder.build ();
3732       if (aarch64_simd_valid_immediate (vq_src, NULL))
3733         {
3734           vq_src = force_reg (vq_mode, vq_src);
3735           return aarch64_expand_sve_dupq (target, mode, vq_src);
3736         }
3737
3738       /* Get an integer representation of the repeating part of Advanced
3739          SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
3740          which for big-endian targets is lane-swapped wrt a normal
3741          Advanced SIMD vector.  This means that for both endiannesses,
3742          memory lane N of SVE vector SRC corresponds to architectural
3743          lane N of a register holding VQ_SRC.  This in turn means that
3744          memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3745          as a single 128-bit value) and thus that memory lane 0 of SRC is
3746          in the lsb of the integer.  Duplicating the integer therefore
3747          ensures that memory lane N of SRC goes into architectural lane
3748          N + I * INDEX of the SVE register.  */
3749       scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3750       rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3751       if (elt_value)
3752         {
3753           /* Pretend that we had a vector of INT_MODE to start with.  */
3754           elt_mode = int_mode;
3755           mode = aarch64_full_sve_mode (int_mode).require ();
3756
3757           /* If the integer can be moved into a general register by a
3758              single instruction, do that and duplicate the result.  */
3759           if (CONST_INT_P (elt_value)
3760               && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3761             {
3762               elt_value = force_reg (elt_mode, elt_value);
3763               return expand_vector_broadcast (mode, elt_value);
3764             }
3765         }
3766       else if (npatterns == 1)
3767         /* We're duplicating a single value, but can't do better than
3768            force it to memory and load from there.  This handles things
3769            like symbolic constants.  */
3770         elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3771
3772       if (elt_value)
3773         {
3774           /* Load the element from memory if we can, otherwise move it into
3775              a register and use a DUP.  */
3776           rtx op = force_const_mem (elt_mode, elt_value);
3777           if (!op)
3778             op = force_reg (elt_mode, elt_value);
3779           return expand_vector_broadcast (mode, op);
3780         }
3781     }
3782
3783   /* Try using INDEX.  */
3784   rtx base, step;
3785   if (const_vec_series_p (src, &base, &step))
3786     {
3787       aarch64_expand_vec_series (target, base, step);
3788       return target;
3789     }
3790
3791   /* From here on, it's better to force the whole constant to memory
3792      if we can.  */
3793   if (GET_MODE_NUNITS (mode).is_constant ())
3794     return NULL_RTX;
3795
3796   /* Expand each pattern individually.  */
3797   gcc_assert (npatterns > 1);
3798   rtx_vector_builder builder;
3799   auto_vec<rtx, 16> vectors (npatterns);
3800   for (unsigned int i = 0; i < npatterns; ++i)
3801     {
3802       builder.new_vector (mode, 1, nelts_per_pattern);
3803       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3804         builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3805       vectors.quick_push (force_reg (mode, builder.build ()));
3806     }
3807
3808   /* Use permutes to interleave the separate vectors.  */
3809   while (npatterns > 1)
3810     {
3811       npatterns /= 2;
3812       for (unsigned int i = 0; i < npatterns; ++i)
3813         {
3814           rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3815           rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3816           emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3817           vectors[i] = tmp;
3818         }
3819     }
3820   gcc_assert (vectors[0] == target);
3821   return target;
3822 }
3823
3824 /* Use WHILE to set a predicate register of mode MODE in which the first
3825    VL bits are set and the rest are clear.  Use TARGET for the register
3826    if it's nonnull and convenient.  */
3827
3828 static rtx
3829 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3830                                  unsigned int vl)
3831 {
3832   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3833   target = aarch64_target_reg (target, mode);
3834   emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3835   return target;
3836 }
3837
3838 static rtx
3839 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3840
3841 /* BUILDER is a constant predicate in which the index of every set bit
3842    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
3843    by inverting every element at a multiple of ELT_SIZE and EORing the
3844    result with an ELT_SIZE PTRUE.
3845
3846    Return a register that contains the constant on success, otherwise
3847    return null.  Use TARGET as the register if it is nonnull and
3848    convenient.  */
3849
3850 static rtx
3851 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3852                                    unsigned int elt_size)
3853 {
3854   /* Invert every element at a multiple of ELT_SIZE, keeping the
3855      other bits zero.  */
3856   rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3857                                   builder.nelts_per_pattern ());
3858   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3859     if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3860       inv_builder.quick_push (const1_rtx);
3861     else
3862       inv_builder.quick_push (const0_rtx);
3863   inv_builder.finalize ();
3864
3865   /* See if we can load the constant cheaply.  */
3866   rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3867   if (!inv)
3868     return NULL_RTX;
3869
3870   /* EOR the result with an ELT_SIZE PTRUE.  */
3871   rtx mask = aarch64_ptrue_all (elt_size);
3872   mask = force_reg (VNx16BImode, mask);
3873   target = aarch64_target_reg (target, VNx16BImode);
3874   emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3875   return target;
3876 }
3877
3878 /* BUILDER is a constant predicate in which the index of every set bit
3879    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
3880    using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
3881    register on success, otherwise return null.  Use TARGET as the register
3882    if nonnull and convenient.  */
3883
3884 static rtx
3885 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3886                                    unsigned int elt_size,
3887                                    unsigned int permute_size)
3888 {
3889   /* We're going to split the constant into two new constants A and B,
3890      with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3891      and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3892
3893      A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3894      B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3895
3896      where _ indicates elements that will be discarded by the permute.
3897
3898      First calculate the ELT_SIZEs for A and B.  */
3899   unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3900   unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3901   for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3902     if (INTVAL (builder.elt (i)) != 0)
3903       {
3904         if (i & permute_size)
3905           b_elt_size |= i - permute_size;
3906         else
3907           a_elt_size |= i;
3908       }
3909   a_elt_size &= -a_elt_size;
3910   b_elt_size &= -b_elt_size;
3911
3912   /* Now construct the vectors themselves.  */
3913   rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3914                                 builder.nelts_per_pattern ());
3915   rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3916                                 builder.nelts_per_pattern ());
3917   unsigned int nelts = builder.encoded_nelts ();
3918   for (unsigned int i = 0; i < nelts; ++i)
3919     if (i & (elt_size - 1))
3920       {
3921         a_builder.quick_push (const0_rtx);
3922         b_builder.quick_push (const0_rtx);
3923       }
3924     else if ((i & permute_size) == 0)
3925       {
3926         /* The A and B elements are significant.  */
3927         a_builder.quick_push (builder.elt (i));
3928         b_builder.quick_push (builder.elt (i + permute_size));
3929       }
3930     else
3931       {
3932         /* The A and B elements are going to be discarded, so pick whatever
3933            is likely to give a nice constant.  We are targeting element
3934            sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3935            with the aim of each being a sequence of ones followed by
3936            a sequence of zeros.  So:
3937
3938            * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3939              duplicate the last X_ELT_SIZE element, to extend the
3940              current sequence of ones or zeros.
3941
3942            * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3943              zero, so that the constant really does have X_ELT_SIZE and
3944              not a smaller size.  */
3945         if (a_elt_size > permute_size)
3946           a_builder.quick_push (const0_rtx);
3947         else
3948           a_builder.quick_push (a_builder.elt (i - a_elt_size));
3949         if (b_elt_size > permute_size)
3950           b_builder.quick_push (const0_rtx);
3951         else
3952           b_builder.quick_push (b_builder.elt (i - b_elt_size));
3953       }
3954   a_builder.finalize ();
3955   b_builder.finalize ();
3956
3957   /* Try loading A into a register.  */
3958   rtx_insn *last = get_last_insn ();
3959   rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3960   if (!a)
3961     return NULL_RTX;
3962
3963   /* Try loading B into a register.  */
3964   rtx b = a;
3965   if (a_builder != b_builder)
3966     {
3967       b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3968       if (!b)
3969         {
3970           delete_insns_since (last);
3971           return NULL_RTX;
3972         }
3973     }
3974
3975   /* Emit the TRN1 itself.  */
3976   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3977   target = aarch64_target_reg (target, mode);
3978   emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3979                               gen_lowpart (mode, a),
3980                               gen_lowpart (mode, b)));
3981   return target;
3982 }
3983
3984 /* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
3985    constant in BUILDER into an SVE predicate register.  Return the register
3986    on success, otherwise return null.  Use TARGET for the register if
3987    nonnull and convenient.
3988
3989    ALLOW_RECURSE_P is true if we can use methods that would call this
3990    function recursively.  */
3991
3992 static rtx
3993 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3994                                  bool allow_recurse_p)
3995 {
3996   if (builder.encoded_nelts () == 1)
3997     /* A PFALSE or a PTRUE .B ALL.  */
3998     return aarch64_emit_set_immediate (target, builder);
3999
4000   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4001   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4002     {
4003       /* If we can load the constant using PTRUE, use it as-is.  */
4004       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4005       if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4006         return aarch64_emit_set_immediate (target, builder);
4007
4008       /* Otherwise use WHILE to set the first VL bits.  */
4009       return aarch64_sve_move_pred_via_while (target, mode, vl);
4010     }
4011
4012   if (!allow_recurse_p)
4013     return NULL_RTX;
4014
4015   /* Try inverting the vector in element size ELT_SIZE and then EORing
4016      the result with an ELT_SIZE PTRUE.  */
4017   if (INTVAL (builder.elt (0)) == 0)
4018     if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4019                                                      elt_size))
4020       return res;
4021
4022   /* Try using TRN1 to permute two simpler constants.  */
4023   for (unsigned int i = elt_size; i <= 8; i *= 2)
4024     if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4025                                                      elt_size, i))
4026       return res;
4027
4028   return NULL_RTX;
4029 }
4030
4031 /* Return an SVE predicate register that contains the VNx16BImode
4032    constant in BUILDER, without going through the move expanders.
4033
4034    The returned register can have whatever mode seems most natural
4035    given the contents of BUILDER.  Use TARGET for the result if
4036    convenient.  */
4037
4038 static rtx
4039 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4040 {
4041   /* Try loading the constant using pure predicate operations.  */
4042   if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4043     return res;
4044
4045   /* Try forcing the constant to memory.  */
4046   if (builder.full_nelts ().is_constant ())
4047     if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4048       {
4049         target = aarch64_target_reg (target, VNx16BImode);
4050         emit_move_insn (target, mem);
4051         return target;
4052       }
4053
4054   /* The last resort is to load the constant as an integer and then
4055      compare it against zero.  Use -1 for set bits in order to increase
4056      the changes of using SVE DUPM or an Advanced SIMD byte mask.  */
4057   rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4058                                   builder.nelts_per_pattern ());
4059   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4060     int_builder.quick_push (INTVAL (builder.elt (i))
4061                             ? constm1_rtx : const0_rtx);
4062   return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4063                                            int_builder.build ());
4064 }
4065
4066 /* Set DEST to immediate IMM.  */
4067
4068 void
4069 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4070 {
4071   machine_mode mode = GET_MODE (dest);
4072
4073   /* Check on what type of symbol it is.  */
4074   scalar_int_mode int_mode;
4075   if ((GET_CODE (imm) == SYMBOL_REF
4076        || GET_CODE (imm) == LABEL_REF
4077        || GET_CODE (imm) == CONST
4078        || GET_CODE (imm) == CONST_POLY_INT)
4079       && is_a <scalar_int_mode> (mode, &int_mode))
4080     {
4081       rtx mem;
4082       poly_int64 offset;
4083       HOST_WIDE_INT const_offset;
4084       enum aarch64_symbol_type sty;
4085
4086       /* If we have (const (plus symbol offset)), separate out the offset
4087          before we start classifying the symbol.  */
4088       rtx base = strip_offset (imm, &offset);
4089
4090       /* We must always add an offset involving VL separately, rather than
4091          folding it into the relocation.  */
4092       if (!offset.is_constant (&const_offset))
4093         {
4094           if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4095             emit_insn (gen_rtx_SET (dest, imm));
4096           else
4097             {
4098               /* Do arithmetic on 32-bit values if the result is smaller
4099                  than that.  */
4100               if (partial_subreg_p (int_mode, SImode))
4101                 {
4102                   /* It is invalid to do symbol calculations in modes
4103                      narrower than SImode.  */
4104                   gcc_assert (base == const0_rtx);
4105                   dest = gen_lowpart (SImode, dest);
4106                   int_mode = SImode;
4107                 }
4108               if (base != const0_rtx)
4109                 {
4110                   base = aarch64_force_temporary (int_mode, dest, base);
4111                   aarch64_add_offset (int_mode, dest, base, offset,
4112                                       NULL_RTX, NULL_RTX, false);
4113                 }
4114               else
4115                 aarch64_add_offset (int_mode, dest, base, offset,
4116                                     dest, NULL_RTX, false);
4117             }
4118           return;
4119         }
4120
4121       sty = aarch64_classify_symbol (base, const_offset);
4122       switch (sty)
4123         {
4124         case SYMBOL_FORCE_TO_MEM:
4125           if (const_offset != 0
4126               && targetm.cannot_force_const_mem (int_mode, imm))
4127             {
4128               gcc_assert (can_create_pseudo_p ());
4129               base = aarch64_force_temporary (int_mode, dest, base);
4130               aarch64_add_offset (int_mode, dest, base, const_offset,
4131                                   NULL_RTX, NULL_RTX, false);
4132               return;
4133             }
4134
4135           mem = force_const_mem (ptr_mode, imm);
4136           gcc_assert (mem);
4137
4138           /* If we aren't generating PC relative literals, then
4139              we need to expand the literal pool access carefully.
4140              This is something that needs to be done in a number
4141              of places, so could well live as a separate function.  */
4142           if (!aarch64_pcrelative_literal_loads)
4143             {
4144               gcc_assert (can_create_pseudo_p ());
4145               base = gen_reg_rtx (ptr_mode);
4146               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4147               if (ptr_mode != Pmode)
4148                 base = convert_memory_address (Pmode, base);
4149               mem = gen_rtx_MEM (ptr_mode, base);
4150             }
4151
4152           if (int_mode != ptr_mode)
4153             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4154
4155           emit_insn (gen_rtx_SET (dest, mem));
4156
4157           return;
4158
4159         case SYMBOL_SMALL_TLSGD:
4160         case SYMBOL_SMALL_TLSDESC:
4161         case SYMBOL_SMALL_TLSIE:
4162         case SYMBOL_SMALL_GOT_28K:
4163         case SYMBOL_SMALL_GOT_4G:
4164         case SYMBOL_TINY_GOT:
4165         case SYMBOL_TINY_TLSIE:
4166           if (const_offset != 0)
4167             {
4168               gcc_assert(can_create_pseudo_p ());
4169               base = aarch64_force_temporary (int_mode, dest, base);
4170               aarch64_add_offset (int_mode, dest, base, const_offset,
4171                                   NULL_RTX, NULL_RTX, false);
4172               return;
4173             }
4174           /* FALLTHRU */
4175
4176         case SYMBOL_SMALL_ABSOLUTE:
4177         case SYMBOL_TINY_ABSOLUTE:
4178         case SYMBOL_TLSLE12:
4179         case SYMBOL_TLSLE24:
4180         case SYMBOL_TLSLE32:
4181         case SYMBOL_TLSLE48:
4182           aarch64_load_symref_appropriately (dest, imm, sty);
4183           return;
4184
4185         default:
4186           gcc_unreachable ();
4187         }
4188     }
4189
4190   if (!CONST_INT_P (imm))
4191     {
4192       if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4193         {
4194           /* Only the low bit of each .H, .S and .D element is defined,
4195              so we can set the upper bits to whatever we like.  If the
4196              predicate is all-true in MODE, prefer to set all the undefined
4197              bits as well, so that we can share a single .B predicate for
4198              all modes.  */
4199           if (imm == CONSTM1_RTX (mode))
4200             imm = CONSTM1_RTX (VNx16BImode);
4201
4202           /* All methods for constructing predicate modes wider than VNx16BI
4203              will set the upper bits of each element to zero.  Expose this
4204              by moving such constants as a VNx16BI, so that all bits are
4205              significant and so that constants for different modes can be
4206              shared.  The wider constant will still be available as a
4207              REG_EQUAL note.  */
4208           rtx_vector_builder builder;
4209           if (aarch64_get_sve_pred_bits (builder, imm))
4210             {
4211               rtx res = aarch64_expand_sve_const_pred (dest, builder);
4212               if (dest != res)
4213                 emit_move_insn (dest, gen_lowpart (mode, res));
4214               return;
4215             }
4216         }
4217
4218       if (GET_CODE (imm) == HIGH
4219           || aarch64_simd_valid_immediate (imm, NULL))
4220         {
4221           emit_insn (gen_rtx_SET (dest, imm));
4222           return;
4223         }
4224
4225       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4226         if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4227           {
4228             if (dest != res)
4229               emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4230             return;
4231           }
4232
4233       rtx mem = force_const_mem (mode, imm);
4234       gcc_assert (mem);
4235       emit_move_insn (dest, mem);
4236       return;
4237     }
4238
4239   aarch64_internal_mov_immediate (dest, imm, true,
4240                                   as_a <scalar_int_mode> (mode));
4241 }
4242
4243 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
4244    that is known to contain PTRUE.  */
4245
4246 void
4247 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4248 {
4249   expand_operand ops[3];
4250   machine_mode mode = GET_MODE (dest);
4251   create_output_operand (&ops[0], dest, mode);
4252   create_input_operand (&ops[1], pred, GET_MODE(pred));
4253   create_input_operand (&ops[2], src, mode);
4254   temporary_volatile_ok v (true);
4255   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4256 }
4257
4258 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4259    operand is in memory.  In this case we need to use the predicated LD1
4260    and ST1 instead of LDR and STR, both for correctness on big-endian
4261    targets and because LD1 and ST1 support a wider range of addressing modes.
4262    PRED_MODE is the mode of the predicate.
4263
4264    See the comment at the head of aarch64-sve.md for details about the
4265    big-endian handling.  */
4266
4267 void
4268 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4269 {
4270   machine_mode mode = GET_MODE (dest);
4271   rtx ptrue = aarch64_ptrue_reg (pred_mode);
4272   if (!register_operand (src, mode)
4273       && !register_operand (dest, mode))
4274     {
4275       rtx tmp = gen_reg_rtx (mode);
4276       if (MEM_P (src))
4277         aarch64_emit_sve_pred_move (tmp, ptrue, src);
4278       else
4279         emit_move_insn (tmp, src);
4280       src = tmp;
4281     }
4282   aarch64_emit_sve_pred_move (dest, ptrue, src);
4283 }
4284
4285 /* Called only on big-endian targets.  See whether an SVE vector move
4286    from SRC to DEST is effectively a REV[BHW] instruction, because at
4287    least one operand is a subreg of an SVE vector that has wider or
4288    narrower elements.  Return true and emit the instruction if so.
4289
4290    For example:
4291
4292      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4293
4294    represents a VIEW_CONVERT between the following vectors, viewed
4295    in memory order:
4296
4297      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
4298      R1: { [0],      [1],      [2],      [3],     ... }
4299
4300    The high part of lane X in R2 should therefore correspond to lane X*2
4301    of R1, but the register representations are:
4302
4303          msb                                      lsb
4304      R2: ...... [1].high  [1].low   [0].high  [0].low
4305      R1: ...... [3]       [2]       [1]       [0]
4306
4307    where the low part of lane X in R2 corresponds to lane X*2 in R1.
4308    We therefore need a reverse operation to swap the high and low values
4309    around.
4310
4311    This is purely an optimization.  Without it we would spill the
4312    subreg operand to the stack in one mode and reload it in the
4313    other mode, which has the same effect as the REV.  */
4314
4315 bool
4316 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4317 {
4318   gcc_assert (BYTES_BIG_ENDIAN);
4319   if (GET_CODE (dest) == SUBREG)
4320     dest = SUBREG_REG (dest);
4321   if (GET_CODE (src) == SUBREG)
4322     src = SUBREG_REG (src);
4323
4324   /* The optimization handles two single SVE REGs with different element
4325      sizes.  */
4326   if (!REG_P (dest)
4327       || !REG_P (src)
4328       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4329       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4330       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4331           == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4332     return false;
4333
4334   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
4335   rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4336   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4337                                UNSPEC_REV_SUBREG);
4338   emit_insn (gen_rtx_SET (dest, unspec));
4339   return true;
4340 }
4341
4342 /* Return a copy of X with mode MODE, without changing its other
4343    attributes.  Unlike gen_lowpart, this doesn't care whether the
4344    mode change is valid.  */
4345
4346 static rtx
4347 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4348 {
4349   if (GET_MODE (x) == mode)
4350     return x;
4351
4352   x = shallow_copy_rtx (x);
4353   set_mode_and_regno (x, mode, REGNO (x));
4354   return x;
4355 }
4356
4357 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
4358    stored in wider integer containers.  */
4359
4360 static unsigned int
4361 aarch64_sve_rev_unspec (machine_mode mode)
4362 {
4363   switch (GET_MODE_UNIT_SIZE (mode))
4364     {
4365     case 1: return UNSPEC_REVB;
4366     case 2: return UNSPEC_REVH;
4367     case 4: return UNSPEC_REVW;
4368     }
4369   gcc_unreachable ();
4370 }
4371
4372 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4373    operands.  */
4374
4375 void
4376 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4377 {
4378   /* Decide which REV operation we need.  The mode with wider elements
4379      determines the mode of the operands and the mode with the narrower
4380      elements determines the reverse width.  */
4381   machine_mode mode_with_wider_elts = GET_MODE (dest);
4382   machine_mode mode_with_narrower_elts = GET_MODE (src);
4383   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4384       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4385     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4386
4387   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4388   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4389   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4390
4391   /* Get the operands in the appropriate modes and emit the instruction.  */
4392   ptrue = gen_lowpart (pred_mode, ptrue);
4393   dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4394   src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4395   emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4396                                dest, ptrue, src));
4397 }
4398
4399 static bool
4400 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4401                                  tree exp ATTRIBUTE_UNUSED)
4402 {
4403   if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4404     return false;
4405
4406   return true;
4407 }
4408
4409 /* Implement TARGET_PASS_BY_REFERENCE.  */
4410
4411 static bool
4412 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4413 {
4414   HOST_WIDE_INT size;
4415   machine_mode dummymode;
4416   int nregs;
4417
4418   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
4419   if (arg.mode == BLKmode && arg.type)
4420     size = int_size_in_bytes (arg.type);
4421   else
4422     /* No frontends can create types with variable-sized modes, so we
4423        shouldn't be asked to pass or return them.  */
4424     size = GET_MODE_SIZE (arg.mode).to_constant ();
4425
4426   /* Aggregates are passed by reference based on their size.  */
4427   if (arg.aggregate_type_p ())
4428     size = int_size_in_bytes (arg.type);
4429
4430   /* Variable sized arguments are always returned by reference.  */
4431   if (size < 0)
4432     return true;
4433
4434   /* Can this be a candidate to be passed in fp/simd register(s)?  */
4435   if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4436                                                &dummymode, &nregs,
4437                                                NULL))
4438     return false;
4439
4440   /* Arguments which are variable sized or larger than 2 registers are
4441      passed by reference unless they are a homogenous floating point
4442      aggregate.  */
4443   return size > 2 * UNITS_PER_WORD;
4444 }
4445
4446 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
4447 static bool
4448 aarch64_return_in_msb (const_tree valtype)
4449 {
4450   machine_mode dummy_mode;
4451   int dummy_int;
4452
4453   /* Never happens in little-endian mode.  */
4454   if (!BYTES_BIG_ENDIAN)
4455     return false;
4456
4457   /* Only composite types smaller than or equal to 16 bytes can
4458      be potentially returned in registers.  */
4459   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4460       || int_size_in_bytes (valtype) <= 0
4461       || int_size_in_bytes (valtype) > 16)
4462     return false;
4463
4464   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4465      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4466      is always passed/returned in the least significant bits of fp/simd
4467      register(s).  */
4468   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4469                                                &dummy_mode, &dummy_int, NULL))
4470     return false;
4471
4472   return true;
4473 }
4474
4475 /* Implement TARGET_FUNCTION_VALUE.
4476    Define how to find the value returned by a function.  */
4477
4478 static rtx
4479 aarch64_function_value (const_tree type, const_tree func,
4480                         bool outgoing ATTRIBUTE_UNUSED)
4481 {
4482   machine_mode mode;
4483   int unsignedp;
4484   int count;
4485   machine_mode ag_mode;
4486
4487   mode = TYPE_MODE (type);
4488   if (INTEGRAL_TYPE_P (type))
4489     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4490
4491   if (aarch64_return_in_msb (type))
4492     {
4493       HOST_WIDE_INT size = int_size_in_bytes (type);
4494
4495       if (size % UNITS_PER_WORD != 0)
4496         {
4497           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4498           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4499         }
4500     }
4501
4502   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4503                                                &ag_mode, &count, NULL))
4504     {
4505       if (!aarch64_composite_type_p (type, mode))
4506         {
4507           gcc_assert (count == 1 && mode == ag_mode);
4508           return gen_rtx_REG (mode, V0_REGNUM);
4509         }
4510       else
4511         {
4512           int i;
4513           rtx par;
4514
4515           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4516           for (i = 0; i < count; i++)
4517             {
4518               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4519               rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4520               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4521               XVECEXP (par, 0, i) = tmp;
4522             }
4523           return par;
4524         }
4525     }
4526   else
4527     return gen_rtx_REG (mode, R0_REGNUM);
4528 }
4529
4530 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4531    Return true if REGNO is the number of a hard register in which the values
4532    of called function may come back.  */
4533
4534 static bool
4535 aarch64_function_value_regno_p (const unsigned int regno)
4536 {
4537   /* Maximum of 16 bytes can be returned in the general registers.  Examples
4538      of 16-byte return values are: 128-bit integers and 16-byte small
4539      structures (excluding homogeneous floating-point aggregates).  */
4540   if (regno == R0_REGNUM || regno == R1_REGNUM)
4541     return true;
4542
4543   /* Up to four fp/simd registers can return a function value, e.g. a
4544      homogeneous floating-point aggregate having four members.  */
4545   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4546     return TARGET_FLOAT;
4547
4548   return false;
4549 }
4550
4551 /* Implement TARGET_RETURN_IN_MEMORY.
4552
4553    If the type T of the result of a function is such that
4554      void func (T arg)
4555    would require that arg be passed as a value in a register (or set of
4556    registers) according to the parameter passing rules, then the result
4557    is returned in the same registers as would be used for such an
4558    argument.  */
4559
4560 static bool
4561 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4562 {
4563   HOST_WIDE_INT size;
4564   machine_mode ag_mode;
4565   int count;
4566
4567   if (!AGGREGATE_TYPE_P (type)
4568       && TREE_CODE (type) != COMPLEX_TYPE
4569       && TREE_CODE (type) != VECTOR_TYPE)
4570     /* Simple scalar types always returned in registers.  */
4571     return false;
4572
4573   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4574                                                type,
4575                                                &ag_mode,
4576                                                &count,
4577                                                NULL))
4578     return false;
4579
4580   /* Types larger than 2 registers returned in memory.  */
4581   size = int_size_in_bytes (type);
4582   return (size < 0 || size > 2 * UNITS_PER_WORD);
4583 }
4584
4585 static bool
4586 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4587                                const_tree type, int *nregs)
4588 {
4589   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4590   return aarch64_vfp_is_call_or_return_candidate (mode,
4591                                                   type,
4592                                                   &pcum->aapcs_vfp_rmode,
4593                                                   nregs,
4594                                                   NULL);
4595 }
4596
4597 /* Given MODE and TYPE of a function argument, return the alignment in
4598    bits.  The idea is to suppress any stronger alignment requested by
4599    the user and opt for the natural alignment (specified in AAPCS64 \S
4600    4.1).  ABI_BREAK is set to true if the alignment was incorrectly
4601    calculated in versions of GCC prior to GCC-9.  This is a helper
4602    function for local use only.  */
4603
4604 static unsigned int
4605 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4606                                 bool *abi_break)
4607 {
4608   *abi_break = false;
4609   if (!type)
4610     return GET_MODE_ALIGNMENT (mode);
4611
4612   if (integer_zerop (TYPE_SIZE (type)))
4613     return 0;
4614
4615   gcc_assert (TYPE_MODE (type) == mode);
4616
4617   if (!AGGREGATE_TYPE_P (type))
4618     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4619
4620   if (TREE_CODE (type) == ARRAY_TYPE)
4621     return TYPE_ALIGN (TREE_TYPE (type));
4622
4623   unsigned int alignment = 0;
4624   unsigned int bitfield_alignment = 0;
4625   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4626     if (TREE_CODE (field) == FIELD_DECL)
4627       {
4628         alignment = std::max (alignment, DECL_ALIGN (field));
4629         if (DECL_BIT_FIELD_TYPE (field))
4630           bitfield_alignment
4631             = std::max (bitfield_alignment,
4632                         TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4633       }
4634
4635   if (bitfield_alignment > alignment)
4636     {
4637       *abi_break = true;
4638       return bitfield_alignment;
4639     }
4640
4641   return alignment;
4642 }
4643
4644 /* Layout a function argument according to the AAPCS64 rules.  The rule
4645    numbers refer to the rule numbers in the AAPCS64.  */
4646
4647 static void
4648 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4649                     const_tree type,
4650                     bool named ATTRIBUTE_UNUSED)
4651 {
4652   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4653   int ncrn, nvrn, nregs;
4654   bool allocate_ncrn, allocate_nvrn;
4655   HOST_WIDE_INT size;
4656   bool abi_break;
4657
4658   /* We need to do this once per argument.  */
4659   if (pcum->aapcs_arg_processed)
4660     return;
4661
4662   pcum->aapcs_arg_processed = true;
4663
4664   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
4665   if (type)
4666     size = int_size_in_bytes (type);
4667   else
4668     /* No frontends can create types with variable-sized modes, so we
4669        shouldn't be asked to pass or return them.  */
4670     size = GET_MODE_SIZE (mode).to_constant ();
4671   size = ROUND_UP (size, UNITS_PER_WORD);
4672
4673   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4674   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4675                                                  mode,
4676                                                  type,
4677                                                  &nregs);
4678
4679   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4680      The following code thus handles passing by SIMD/FP registers first.  */
4681
4682   nvrn = pcum->aapcs_nvrn;
4683
4684   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
4685      and homogenous short-vector aggregates (HVA).  */
4686   if (allocate_nvrn)
4687     {
4688       if (!TARGET_FLOAT)
4689         aarch64_err_no_fpadvsimd (mode);
4690
4691       if (nvrn + nregs <= NUM_FP_ARG_REGS)
4692         {
4693           pcum->aapcs_nextnvrn = nvrn + nregs;
4694           if (!aarch64_composite_type_p (type, mode))
4695             {
4696               gcc_assert (nregs == 1);
4697               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4698             }
4699           else
4700             {
4701               rtx par;
4702               int i;
4703               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4704               for (i = 0; i < nregs; i++)
4705                 {
4706                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4707                                          V0_REGNUM + nvrn + i);
4708                   rtx offset = gen_int_mode
4709                     (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4710                   tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4711                   XVECEXP (par, 0, i) = tmp;
4712                 }
4713               pcum->aapcs_reg = par;
4714             }
4715           return;
4716         }
4717       else
4718         {
4719           /* C.3 NSRN is set to 8.  */
4720           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4721           goto on_stack;
4722         }
4723     }
4724
4725   ncrn = pcum->aapcs_ncrn;
4726   nregs = size / UNITS_PER_WORD;
4727
4728   /* C6 - C9.  though the sign and zero extension semantics are
4729      handled elsewhere.  This is the case where the argument fits
4730      entirely general registers.  */
4731   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4732     {
4733       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4734
4735       /* C.8 if the argument has an alignment of 16 then the NGRN is
4736          rounded up to the next even number.  */
4737       if (nregs == 2
4738           && ncrn % 2
4739           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4740              comparison is there because for > 16 * BITS_PER_UNIT
4741              alignment nregs should be > 2 and therefore it should be
4742              passed by reference rather than value.  */
4743           && (aarch64_function_arg_alignment (mode, type, &abi_break)
4744               == 16 * BITS_PER_UNIT))
4745         {
4746           if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4747             inform (input_location, "parameter passing for argument of type "
4748                     "%qT changed in GCC 9.1", type);
4749           ++ncrn;
4750           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4751         }
4752
4753       /* NREGS can be 0 when e.g. an empty structure is to be passed.
4754          A reg is still generated for it, but the caller should be smart
4755          enough not to use it.  */
4756       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4757         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4758       else
4759         {
4760           rtx par;
4761           int i;
4762
4763           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4764           for (i = 0; i < nregs; i++)
4765             {
4766               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4767               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4768                                        GEN_INT (i * UNITS_PER_WORD));
4769               XVECEXP (par, 0, i) = tmp;
4770             }
4771           pcum->aapcs_reg = par;
4772         }
4773
4774       pcum->aapcs_nextncrn = ncrn + nregs;
4775       return;
4776     }
4777
4778   /* C.11  */
4779   pcum->aapcs_nextncrn = NUM_ARG_REGS;
4780
4781   /* The argument is passed on stack; record the needed number of words for
4782      this argument and align the total size if necessary.  */
4783 on_stack:
4784   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4785
4786   if (aarch64_function_arg_alignment (mode, type, &abi_break)
4787       == 16 * BITS_PER_UNIT)
4788     {
4789       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4790       if (pcum->aapcs_stack_size != new_size)
4791         {
4792           if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4793             inform (input_location, "parameter passing for argument of type "
4794                     "%qT changed in GCC 9.1", type);
4795           pcum->aapcs_stack_size = new_size;
4796         }
4797     }
4798   return;
4799 }
4800
4801 /* Implement TARGET_FUNCTION_ARG.  */
4802
4803 static rtx
4804 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4805 {
4806   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4807   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4808
4809   if (arg.end_marker_p ())
4810     return NULL_RTX;
4811
4812   aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4813   return pcum->aapcs_reg;
4814 }
4815
4816 void
4817 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4818                            const_tree fntype ATTRIBUTE_UNUSED,
4819                            rtx libname ATTRIBUTE_UNUSED,
4820                            const_tree fndecl ATTRIBUTE_UNUSED,
4821                            unsigned n_named ATTRIBUTE_UNUSED)
4822 {
4823   pcum->aapcs_ncrn = 0;
4824   pcum->aapcs_nvrn = 0;
4825   pcum->aapcs_nextncrn = 0;
4826   pcum->aapcs_nextnvrn = 0;
4827   pcum->pcs_variant = ARM_PCS_AAPCS64;
4828   pcum->aapcs_reg = NULL_RTX;
4829   pcum->aapcs_arg_processed = false;
4830   pcum->aapcs_stack_words = 0;
4831   pcum->aapcs_stack_size = 0;
4832
4833   if (!TARGET_FLOAT
4834       && fndecl && TREE_PUBLIC (fndecl)
4835       && fntype && fntype != error_mark_node)
4836     {
4837       const_tree type = TREE_TYPE (fntype);
4838       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
4839       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
4840       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4841                                                    &mode, &nregs, NULL))
4842         aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4843     }
4844   return;
4845 }
4846
4847 static void
4848 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4849                               machine_mode mode,
4850                               const_tree type,
4851                               bool named)
4852 {
4853   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4854   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4855     {
4856       aarch64_layout_arg (pcum_v, mode, type, named);
4857       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4858                   != (pcum->aapcs_stack_words != 0));
4859       pcum->aapcs_arg_processed = false;
4860       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4861       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4862       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4863       pcum->aapcs_stack_words = 0;
4864       pcum->aapcs_reg = NULL_RTX;
4865     }
4866 }
4867
4868 bool
4869 aarch64_function_arg_regno_p (unsigned regno)
4870 {
4871   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4872           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4873 }
4874
4875 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
4876    PARM_BOUNDARY bits of alignment, but will be given anything up
4877    to STACK_BOUNDARY bits if the type requires it.  This makes sure
4878    that both before and after the layout of each argument, the Next
4879    Stacked Argument Address (NSAA) will have a minimum alignment of
4880    8 bytes.  */
4881
4882 static unsigned int
4883 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4884 {
4885   bool abi_break;
4886   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4887                                                            &abi_break);
4888   if (abi_break & warn_psabi)
4889     inform (input_location, "parameter passing for argument of type "
4890             "%qT changed in GCC 9.1", type);
4891
4892   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4893 }
4894
4895 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
4896
4897 static fixed_size_mode
4898 aarch64_get_reg_raw_mode (int regno)
4899 {
4900   if (TARGET_SVE && FP_REGNUM_P (regno))
4901     /* Don't use the SVE part of the register for __builtin_apply and
4902        __builtin_return.  The SVE registers aren't used by the normal PCS,
4903        so using them there would be a waste of time.  The PCS extensions
4904        for SVE types are fundamentally incompatible with the
4905        __builtin_return/__builtin_apply interface.  */
4906     return as_a <fixed_size_mode> (V16QImode);
4907   return default_get_reg_raw_mode (regno);
4908 }
4909
4910 /* Implement TARGET_FUNCTION_ARG_PADDING.
4911
4912    Small aggregate types are placed in the lowest memory address.
4913
4914    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
4915
4916 static pad_direction
4917 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4918 {
4919   /* On little-endian targets, the least significant byte of every stack
4920      argument is passed at the lowest byte address of the stack slot.  */
4921   if (!BYTES_BIG_ENDIAN)
4922     return PAD_UPWARD;
4923
4924   /* Otherwise, integral, floating-point and pointer types are padded downward:
4925      the least significant byte of a stack argument is passed at the highest
4926      byte address of the stack slot.  */
4927   if (type
4928       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4929          || POINTER_TYPE_P (type))
4930       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4931     return PAD_DOWNWARD;
4932
4933   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
4934   return PAD_UPWARD;
4935 }
4936
4937 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4938
4939    It specifies padding for the last (may also be the only)
4940    element of a block move between registers and memory.  If
4941    assuming the block is in the memory, padding upward means that
4942    the last element is padded after its highest significant byte,
4943    while in downward padding, the last element is padded at the
4944    its least significant byte side.
4945
4946    Small aggregates and small complex types are always padded
4947    upwards.
4948
4949    We don't need to worry about homogeneous floating-point or
4950    short-vector aggregates; their move is not affected by the
4951    padding direction determined here.  Regardless of endianness,
4952    each element of such an aggregate is put in the least
4953    significant bits of a fp/simd register.
4954
4955    Return !BYTES_BIG_ENDIAN if the least significant byte of the
4956    register has useful data, and return the opposite if the most
4957    significant byte does.  */
4958
4959 bool
4960 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4961                      bool first ATTRIBUTE_UNUSED)
4962 {
4963
4964   /* Small composite types are always padded upward.  */
4965   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4966     {
4967       HOST_WIDE_INT size;
4968       if (type)
4969         size = int_size_in_bytes (type);
4970       else
4971         /* No frontends can create types with variable-sized modes, so we
4972            shouldn't be asked to pass or return them.  */
4973         size = GET_MODE_SIZE (mode).to_constant ();
4974       if (size < 2 * UNITS_PER_WORD)
4975         return true;
4976     }
4977
4978   /* Otherwise, use the default padding.  */
4979   return !BYTES_BIG_ENDIAN;
4980 }
4981
4982 static scalar_int_mode
4983 aarch64_libgcc_cmp_return_mode (void)
4984 {
4985   return SImode;
4986 }
4987
4988 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4989
4990 /* We use the 12-bit shifted immediate arithmetic instructions so values
4991    must be multiple of (1 << 12), i.e. 4096.  */
4992 #define ARITH_FACTOR 4096
4993
4994 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4995 #error Cannot use simple address calculation for stack probing
4996 #endif
4997
4998 /* The pair of scratch registers used for stack probing.  */
4999 #define PROBE_STACK_FIRST_REG  R9_REGNUM
5000 #define PROBE_STACK_SECOND_REG R10_REGNUM
5001
5002 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5003    inclusive.  These are offsets from the current stack pointer.  */
5004
5005 static void
5006 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5007 {
5008   HOST_WIDE_INT size;
5009   if (!poly_size.is_constant (&size))
5010     {
5011       sorry ("stack probes for SVE frames");
5012       return;
5013     }
5014
5015   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5016
5017   /* See the same assertion on PROBE_INTERVAL above.  */
5018   gcc_assert ((first % ARITH_FACTOR) == 0);
5019
5020   /* See if we have a constant small number of probes to generate.  If so,
5021      that's the easy case.  */
5022   if (size <= PROBE_INTERVAL)
5023     {
5024       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5025
5026       emit_set_insn (reg1,
5027                      plus_constant (Pmode,
5028                                     stack_pointer_rtx, -(first + base)));
5029       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5030     }
5031
5032   /* The run-time loop is made up of 8 insns in the generic case while the
5033      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
5034   else if (size <= 4 * PROBE_INTERVAL)
5035     {
5036       HOST_WIDE_INT i, rem;
5037
5038       emit_set_insn (reg1,
5039                      plus_constant (Pmode,
5040                                     stack_pointer_rtx,
5041                                     -(first + PROBE_INTERVAL)));
5042       emit_stack_probe (reg1);
5043
5044       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5045          it exceeds SIZE.  If only two probes are needed, this will not
5046          generate any code.  Then probe at FIRST + SIZE.  */
5047       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5048         {
5049           emit_set_insn (reg1,
5050                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5051           emit_stack_probe (reg1);
5052         }
5053
5054       rem = size - (i - PROBE_INTERVAL);
5055       if (rem > 256)
5056         {
5057           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5058
5059           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5060           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5061         }
5062       else
5063         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5064     }
5065
5066   /* Otherwise, do the same as above, but in a loop.  Note that we must be
5067      extra careful with variables wrapping around because we might be at
5068      the very top (or the very bottom) of the address space and we have
5069      to be able to handle this case properly; in particular, we use an
5070      equality test for the loop condition.  */
5071   else
5072     {
5073       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5074
5075       /* Step 1: round SIZE to the previous multiple of the interval.  */
5076
5077       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5078
5079
5080       /* Step 2: compute initial and final value of the loop counter.  */
5081
5082       /* TEST_ADDR = SP + FIRST.  */
5083       emit_set_insn (reg1,
5084                      plus_constant (Pmode, stack_pointer_rtx, -first));
5085
5086       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
5087       HOST_WIDE_INT adjustment = - (first + rounded_size);
5088       if (! aarch64_uimm12_shift (adjustment))
5089         {
5090           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5091                                           true, Pmode);
5092           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5093         }
5094       else
5095         emit_set_insn (reg2,
5096                        plus_constant (Pmode, stack_pointer_rtx, adjustment));
5097
5098       /* Step 3: the loop
5099
5100          do
5101            {
5102              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5103              probe at TEST_ADDR
5104            }
5105          while (TEST_ADDR != LAST_ADDR)
5106
5107          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5108          until it is equal to ROUNDED_SIZE.  */
5109
5110       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5111
5112
5113       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5114          that SIZE is equal to ROUNDED_SIZE.  */
5115
5116       if (size != rounded_size)
5117         {
5118           HOST_WIDE_INT rem = size - rounded_size;
5119
5120           if (rem > 256)
5121             {
5122               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5123
5124               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5125               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5126             }
5127           else
5128             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5129         }
5130     }
5131
5132   /* Make sure nothing is scheduled before we are done.  */
5133   emit_insn (gen_blockage ());
5134 }
5135
5136 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
5137    absolute addresses.  */
5138
5139 const char *
5140 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5141 {
5142   static int labelno = 0;
5143   char loop_lab[32];
5144   rtx xops[2];
5145
5146   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5147
5148   /* Loop.  */
5149   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5150
5151   HOST_WIDE_INT stack_clash_probe_interval
5152     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5153
5154   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
5155   xops[0] = reg1;
5156   HOST_WIDE_INT interval;
5157   if (flag_stack_clash_protection)
5158     interval = stack_clash_probe_interval;
5159   else
5160     interval = PROBE_INTERVAL;
5161
5162   gcc_assert (aarch64_uimm12_shift (interval));
5163   xops[1] = GEN_INT (interval);
5164
5165   output_asm_insn ("sub\t%0, %0, %1", xops);
5166
5167   /* If doing stack clash protection then we probe up by the ABI specified
5168      amount.  We do this because we're dropping full pages at a time in the
5169      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
5170   if (flag_stack_clash_protection)
5171     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5172   else
5173     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5174
5175   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
5176      by this amount for each iteration.  */
5177   output_asm_insn ("str\txzr, [%0, %1]", xops);
5178
5179   /* Test if TEST_ADDR == LAST_ADDR.  */
5180   xops[1] = reg2;
5181   output_asm_insn ("cmp\t%0, %1", xops);
5182
5183   /* Branch.  */
5184   fputs ("\tb.ne\t", asm_out_file);
5185   assemble_name_raw (asm_out_file, loop_lab);
5186   fputc ('\n', asm_out_file);
5187
5188   return "";
5189 }
5190
5191 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5192    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5193    of GUARD_SIZE.  When a probe is emitted it is done at most
5194    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5195    at most MIN_PROBE_THRESHOLD.  By the end of this function
5196    BASE = BASE - ADJUSTMENT.  */
5197
5198 const char *
5199 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5200                                       rtx min_probe_threshold, rtx guard_size)
5201 {
5202   /* This function is not allowed to use any instruction generation function
5203      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
5204      so instead emit the code you want using output_asm_insn.  */
5205   gcc_assert (flag_stack_clash_protection);
5206   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5207   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5208
5209   /* The minimum required allocation before the residual requires probing.  */
5210   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5211
5212   /* Clamp the value down to the nearest value that can be used with a cmp.  */
5213   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5214   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5215
5216   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5217   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5218
5219   static int labelno = 0;
5220   char loop_start_lab[32];
5221   char loop_end_lab[32];
5222   rtx xops[2];
5223
5224   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5225   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5226
5227   /* Emit loop start label.  */
5228   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5229
5230   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
5231   xops[0] = adjustment;
5232   xops[1] = probe_offset_value_rtx;
5233   output_asm_insn ("cmp\t%0, %1", xops);
5234
5235   /* Branch to end if not enough adjustment to probe.  */
5236   fputs ("\tb.lt\t", asm_out_file);
5237   assemble_name_raw (asm_out_file, loop_end_lab);
5238   fputc ('\n', asm_out_file);
5239
5240   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
5241   xops[0] = base;
5242   xops[1] = probe_offset_value_rtx;
5243   output_asm_insn ("sub\t%0, %0, %1", xops);
5244
5245   /* Probe at BASE.  */
5246   xops[1] = const0_rtx;
5247   output_asm_insn ("str\txzr, [%0, %1]", xops);
5248
5249   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
5250   xops[0] = adjustment;
5251   xops[1] = probe_offset_value_rtx;
5252   output_asm_insn ("sub\t%0, %0, %1", xops);
5253
5254   /* Branch to start if still more bytes to allocate.  */
5255   fputs ("\tb\t", asm_out_file);
5256   assemble_name_raw (asm_out_file, loop_start_lab);
5257   fputc ('\n', asm_out_file);
5258
5259   /* No probe leave.  */
5260   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5261
5262   /* BASE = BASE - ADJUSTMENT.  */
5263   xops[0] = base;
5264   xops[1] = adjustment;
5265   output_asm_insn ("sub\t%0, %0, %1", xops);
5266   return "";
5267 }
5268
5269 /* Determine whether a frame chain needs to be generated.  */
5270 static bool
5271 aarch64_needs_frame_chain (void)
5272 {
5273   /* Force a frame chain for EH returns so the return address is at FP+8.  */
5274   if (frame_pointer_needed || crtl->calls_eh_return)
5275     return true;
5276
5277   /* A leaf function cannot have calls or write LR.  */
5278   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5279
5280   /* Don't use a frame chain in leaf functions if leaf frame pointers
5281      are disabled.  */
5282   if (flag_omit_leaf_frame_pointer && is_leaf)
5283     return false;
5284
5285   return aarch64_use_frame_pointer;
5286 }
5287
5288 /* Mark the registers that need to be saved by the callee and calculate
5289    the size of the callee-saved registers area and frame record (both FP
5290    and LR may be omitted).  */
5291 static void
5292 aarch64_layout_frame (void)
5293 {
5294   HOST_WIDE_INT offset = 0;
5295   int regno, last_fp_reg = INVALID_REGNUM;
5296   bool simd_function = aarch64_simd_decl_p (cfun->decl);
5297
5298   cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5299
5300   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
5301      the mid-end is doing.  */
5302   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5303
5304 #define SLOT_NOT_REQUIRED (-2)
5305 #define SLOT_REQUIRED     (-1)
5306
5307   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5308   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5309
5310   /* If this is a non-leaf simd function with calls we assume that
5311      at least one of those calls is to a non-simd function and thus
5312      we must save V8 to V23 in the prologue.  */
5313
5314   if (simd_function && !crtl->is_leaf)
5315     {
5316       for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5317         if (FP_SIMD_SAVED_REGNUM_P (regno))
5318           df_set_regs_ever_live (regno, true);
5319     }
5320
5321   /* First mark all the registers that really need to be saved...  */
5322   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5323     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5324
5325   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5326     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5327
5328   /* ... that includes the eh data registers (if needed)...  */
5329   if (crtl->calls_eh_return)
5330     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5331       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5332         = SLOT_REQUIRED;
5333
5334   /* ... and any callee saved register that dataflow says is live.  */
5335   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5336     if (df_regs_ever_live_p (regno)
5337         && (regno == R30_REGNUM
5338             || !call_used_regs[regno]))
5339       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5340
5341   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5342     if (df_regs_ever_live_p (regno)
5343         && (!call_used_regs[regno]
5344             || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5345       {
5346         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5347         last_fp_reg = regno;
5348       }
5349
5350   if (cfun->machine->frame.emit_frame_chain)
5351     {
5352       /* FP and LR are placed in the linkage record.  */
5353       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5354       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5355       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5356       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5357       offset = 2 * UNITS_PER_WORD;
5358     }
5359
5360   /* With stack-clash, LR must be saved in non-leaf functions.  */
5361   gcc_assert (crtl->is_leaf
5362               || (cfun->machine->frame.reg_offset[R30_REGNUM]
5363                   != SLOT_NOT_REQUIRED));
5364
5365   /* Now assign stack slots for them.  */
5366   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5367     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5368       {
5369         cfun->machine->frame.reg_offset[regno] = offset;
5370         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5371           cfun->machine->frame.wb_candidate1 = regno;
5372         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5373           cfun->machine->frame.wb_candidate2 = regno;
5374         offset += UNITS_PER_WORD;
5375       }
5376
5377   HOST_WIDE_INT max_int_offset = offset;
5378   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5379   bool has_align_gap = offset != max_int_offset;
5380
5381   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5382     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5383       {
5384         /* If there is an alignment gap between integer and fp callee-saves,
5385            allocate the last fp register to it if possible.  */
5386         if (regno == last_fp_reg
5387             && has_align_gap
5388             && !simd_function
5389             && (offset & 8) == 0)
5390           {
5391             cfun->machine->frame.reg_offset[regno] = max_int_offset;
5392             break;
5393           }
5394
5395         cfun->machine->frame.reg_offset[regno] = offset;
5396         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5397           cfun->machine->frame.wb_candidate1 = regno;
5398         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5399                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5400           cfun->machine->frame.wb_candidate2 = regno;
5401         offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5402       }
5403
5404   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5405
5406   cfun->machine->frame.saved_regs_size = offset;
5407
5408   HOST_WIDE_INT varargs_and_saved_regs_size
5409     = offset + cfun->machine->frame.saved_varargs_size;
5410
5411   cfun->machine->frame.hard_fp_offset
5412     = aligned_upper_bound (varargs_and_saved_regs_size
5413                            + get_frame_size (),
5414                            STACK_BOUNDARY / BITS_PER_UNIT);
5415
5416   /* Both these values are already aligned.  */
5417   gcc_assert (multiple_p (crtl->outgoing_args_size,
5418                           STACK_BOUNDARY / BITS_PER_UNIT));
5419   cfun->machine->frame.frame_size
5420     = (cfun->machine->frame.hard_fp_offset
5421        + crtl->outgoing_args_size);
5422
5423   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5424
5425   cfun->machine->frame.initial_adjust = 0;
5426   cfun->machine->frame.final_adjust = 0;
5427   cfun->machine->frame.callee_adjust = 0;
5428   cfun->machine->frame.callee_offset = 0;
5429
5430   HOST_WIDE_INT max_push_offset = 0;
5431   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5432     max_push_offset = 512;
5433   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5434     max_push_offset = 256;
5435
5436   HOST_WIDE_INT const_size, const_fp_offset;
5437   if (cfun->machine->frame.frame_size.is_constant (&const_size)
5438       && const_size < max_push_offset
5439       && known_eq (crtl->outgoing_args_size, 0))
5440     {
5441       /* Simple, small frame with no outgoing arguments:
5442          stp reg1, reg2, [sp, -frame_size]!
5443          stp reg3, reg4, [sp, 16]  */
5444       cfun->machine->frame.callee_adjust = const_size;
5445     }
5446   else if (known_lt (crtl->outgoing_args_size
5447                      + cfun->machine->frame.saved_regs_size, 512)
5448            && !(cfun->calls_alloca
5449                 && known_lt (cfun->machine->frame.hard_fp_offset,
5450                              max_push_offset)))
5451     {
5452       /* Frame with small outgoing arguments:
5453          sub sp, sp, frame_size
5454          stp reg1, reg2, [sp, outgoing_args_size]
5455          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
5456       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5457       cfun->machine->frame.callee_offset
5458         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5459     }
5460   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5461            && const_fp_offset < max_push_offset)
5462     {
5463       /* Frame with large outgoing arguments but a small local area:
5464          stp reg1, reg2, [sp, -hard_fp_offset]!
5465          stp reg3, reg4, [sp, 16]
5466          sub sp, sp, outgoing_args_size  */
5467       cfun->machine->frame.callee_adjust = const_fp_offset;
5468       cfun->machine->frame.final_adjust
5469         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5470     }
5471   else
5472     {
5473       /* Frame with large local area and outgoing arguments using frame pointer:
5474          sub sp, sp, hard_fp_offset
5475          stp x29, x30, [sp, 0]
5476          add x29, sp, 0
5477          stp reg3, reg4, [sp, 16]
5478          sub sp, sp, outgoing_args_size  */
5479       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5480       cfun->machine->frame.final_adjust
5481         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5482     }
5483
5484   cfun->machine->frame.laid_out = true;
5485 }
5486
5487 /* Return true if the register REGNO is saved on entry to
5488    the current function.  */
5489
5490 static bool
5491 aarch64_register_saved_on_entry (int regno)
5492 {
5493   return cfun->machine->frame.reg_offset[regno] >= 0;
5494 }
5495
5496 /* Return the next register up from REGNO up to LIMIT for the callee
5497    to save.  */
5498
5499 static unsigned
5500 aarch64_next_callee_save (unsigned regno, unsigned limit)
5501 {
5502   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5503     regno ++;
5504   return regno;
5505 }
5506
5507 /* Push the register number REGNO of mode MODE to the stack with write-back
5508    adjusting the stack by ADJUSTMENT.  */
5509
5510 static void
5511 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5512                            HOST_WIDE_INT adjustment)
5513  {
5514   rtx base_rtx = stack_pointer_rtx;
5515   rtx insn, reg, mem;
5516
5517   reg = gen_rtx_REG (mode, regno);
5518   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5519                             plus_constant (Pmode, base_rtx, -adjustment));
5520   mem = gen_frame_mem (mode, mem);
5521
5522   insn = emit_move_insn (mem, reg);
5523   RTX_FRAME_RELATED_P (insn) = 1;
5524 }
5525
5526 /* Generate and return an instruction to store the pair of registers
5527    REG and REG2 of mode MODE to location BASE with write-back adjusting
5528    the stack location BASE by ADJUSTMENT.  */
5529
5530 static rtx
5531 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5532                           HOST_WIDE_INT adjustment)
5533 {
5534   switch (mode)
5535     {
5536     case E_DImode:
5537       return gen_storewb_pairdi_di (base, base, reg, reg2,
5538                                     GEN_INT (-adjustment),
5539                                     GEN_INT (UNITS_PER_WORD - adjustment));
5540     case E_DFmode:
5541       return gen_storewb_pairdf_di (base, base, reg, reg2,
5542                                     GEN_INT (-adjustment),
5543                                     GEN_INT (UNITS_PER_WORD - adjustment));
5544     case E_TFmode:
5545       return gen_storewb_pairtf_di (base, base, reg, reg2,
5546                                     GEN_INT (-adjustment),
5547                                     GEN_INT (UNITS_PER_VREG - adjustment));
5548     default:
5549       gcc_unreachable ();
5550     }
5551 }
5552
5553 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5554    stack pointer by ADJUSTMENT.  */
5555
5556 static void
5557 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5558 {
5559   rtx_insn *insn;
5560   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5561
5562   if (regno2 == INVALID_REGNUM)
5563     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5564
5565   rtx reg1 = gen_rtx_REG (mode, regno1);
5566   rtx reg2 = gen_rtx_REG (mode, regno2);
5567
5568   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5569                                               reg2, adjustment));
5570   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5571   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5572   RTX_FRAME_RELATED_P (insn) = 1;
5573 }
5574
5575 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5576    adjusting it by ADJUSTMENT afterwards.  */
5577
5578 static rtx
5579 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5580                          HOST_WIDE_INT adjustment)
5581 {
5582   switch (mode)
5583     {
5584     case E_DImode:
5585       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5586                                    GEN_INT (UNITS_PER_WORD));
5587     case E_DFmode:
5588       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5589                                    GEN_INT (UNITS_PER_WORD));
5590     case E_TFmode:
5591       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5592                                    GEN_INT (UNITS_PER_VREG));
5593     default:
5594       gcc_unreachable ();
5595     }
5596 }
5597
5598 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5599    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5600    into CFI_OPS.  */
5601
5602 static void
5603 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5604                   rtx *cfi_ops)
5605 {
5606   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5607   rtx reg1 = gen_rtx_REG (mode, regno1);
5608
5609   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5610
5611   if (regno2 == INVALID_REGNUM)
5612     {
5613       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5614       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5615       emit_move_insn (reg1, gen_frame_mem (mode, mem));
5616     }
5617   else
5618     {
5619       rtx reg2 = gen_rtx_REG (mode, regno2);
5620       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5621       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5622                                           reg2, adjustment));
5623     }
5624 }
5625
5626 /* Generate and return a store pair instruction of mode MODE to store
5627    register REG1 to MEM1 and register REG2 to MEM2.  */
5628
5629 static rtx
5630 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5631                         rtx reg2)
5632 {
5633   switch (mode)
5634     {
5635     case E_DImode:
5636       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5637
5638     case E_DFmode:
5639       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5640
5641     case E_TFmode:
5642       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5643
5644     default:
5645       gcc_unreachable ();
5646     }
5647 }
5648
5649 /* Generate and regurn a load pair isntruction of mode MODE to load register
5650    REG1 from MEM1 and register REG2 from MEM2.  */
5651
5652 static rtx
5653 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5654                        rtx mem2)
5655 {
5656   switch (mode)
5657     {
5658     case E_DImode:
5659       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5660
5661     case E_DFmode:
5662       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5663
5664     case E_TFmode:
5665       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5666
5667     default:
5668       gcc_unreachable ();
5669     }
5670 }
5671
5672 /* Return TRUE if return address signing should be enabled for the current
5673    function, otherwise return FALSE.  */
5674
5675 bool
5676 aarch64_return_address_signing_enabled (void)
5677 {
5678   /* This function should only be called after frame laid out.   */
5679   gcc_assert (cfun->machine->frame.laid_out);
5680
5681   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5682      if its LR is pushed onto stack.  */
5683   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5684           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5685               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5686 }
5687
5688 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
5689 bool
5690 aarch64_bti_enabled (void)
5691 {
5692   return (aarch64_enable_bti == 1);
5693 }
5694
5695 /* Emit code to save the callee-saved registers from register number START
5696    to LIMIT to the stack at the location starting at offset START_OFFSET,
5697    skipping any write-back candidates if SKIP_WB is true.  */
5698
5699 static void
5700 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5701                            unsigned start, unsigned limit, bool skip_wb)
5702 {
5703   rtx_insn *insn;
5704   unsigned regno;
5705   unsigned regno2;
5706
5707   for (regno = aarch64_next_callee_save (start, limit);
5708        regno <= limit;
5709        regno = aarch64_next_callee_save (regno + 1, limit))
5710     {
5711       rtx reg, mem;
5712       poly_int64 offset;
5713       int offset_diff;
5714
5715       if (skip_wb
5716           && (regno == cfun->machine->frame.wb_candidate1
5717               || regno == cfun->machine->frame.wb_candidate2))
5718         continue;
5719
5720       if (cfun->machine->reg_is_wrapped_separately[regno])
5721        continue;
5722
5723       reg = gen_rtx_REG (mode, regno);
5724       offset = start_offset + cfun->machine->frame.reg_offset[regno];
5725       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5726                                                 offset));
5727
5728       regno2 = aarch64_next_callee_save (regno + 1, limit);
5729       offset_diff = cfun->machine->frame.reg_offset[regno2]
5730                     - cfun->machine->frame.reg_offset[regno];
5731
5732       if (regno2 <= limit
5733           && !cfun->machine->reg_is_wrapped_separately[regno2]
5734           && known_eq (GET_MODE_SIZE (mode), offset_diff))
5735         {
5736           rtx reg2 = gen_rtx_REG (mode, regno2);
5737           rtx mem2;
5738
5739           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5740           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5741                                                      offset));
5742           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5743                                                     reg2));
5744
5745           /* The first part of a frame-related parallel insn is
5746              always assumed to be relevant to the frame
5747              calculations; subsequent parts, are only
5748              frame-related if explicitly marked.  */
5749           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5750           regno = regno2;
5751         }
5752       else
5753         insn = emit_move_insn (mem, reg);
5754
5755       RTX_FRAME_RELATED_P (insn) = 1;
5756     }
5757 }
5758
5759 /* Emit code to restore the callee registers of mode MODE from register
5760    number START up to and including LIMIT.  Restore from the stack offset
5761    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5762    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
5763
5764 static void
5765 aarch64_restore_callee_saves (machine_mode mode,
5766                               poly_int64 start_offset, unsigned start,
5767                               unsigned limit, bool skip_wb, rtx *cfi_ops)
5768 {
5769   rtx base_rtx = stack_pointer_rtx;
5770   unsigned regno;
5771   unsigned regno2;
5772   poly_int64 offset;
5773
5774   for (regno = aarch64_next_callee_save (start, limit);
5775        regno <= limit;
5776        regno = aarch64_next_callee_save (regno + 1, limit))
5777     {
5778       if (cfun->machine->reg_is_wrapped_separately[regno])
5779        continue;
5780
5781       rtx reg, mem;
5782       int offset_diff;
5783
5784       if (skip_wb
5785           && (regno == cfun->machine->frame.wb_candidate1
5786               || regno == cfun->machine->frame.wb_candidate2))
5787         continue;
5788
5789       reg = gen_rtx_REG (mode, regno);
5790       offset = start_offset + cfun->machine->frame.reg_offset[regno];
5791       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5792
5793       regno2 = aarch64_next_callee_save (regno + 1, limit);
5794       offset_diff = cfun->machine->frame.reg_offset[regno2]
5795                     - cfun->machine->frame.reg_offset[regno];
5796
5797       if (regno2 <= limit
5798           && !cfun->machine->reg_is_wrapped_separately[regno2]
5799           && known_eq (GET_MODE_SIZE (mode), offset_diff))
5800         {
5801           rtx reg2 = gen_rtx_REG (mode, regno2);
5802           rtx mem2;
5803
5804           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5805           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5806           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5807
5808           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5809           regno = regno2;
5810         }
5811       else
5812         emit_move_insn (reg, mem);
5813       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5814     }
5815 }
5816
5817 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5818    of MODE.  */
5819
5820 static inline bool
5821 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5822 {
5823   HOST_WIDE_INT multiple;
5824   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5825           && IN_RANGE (multiple, -8, 7));
5826 }
5827
5828 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5829    of MODE.  */
5830
5831 static inline bool
5832 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5833 {
5834   HOST_WIDE_INT multiple;
5835   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5836           && IN_RANGE (multiple, 0, 63));
5837 }
5838
5839 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5840    of MODE.  */
5841
5842 bool
5843 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5844 {
5845   HOST_WIDE_INT multiple;
5846   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5847           && IN_RANGE (multiple, -64, 63));
5848 }
5849
5850 /* Return true if OFFSET is a signed 9-bit value.  */
5851
5852 bool
5853 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5854                                        poly_int64 offset)
5855 {
5856   HOST_WIDE_INT const_offset;
5857   return (offset.is_constant (&const_offset)
5858           && IN_RANGE (const_offset, -256, 255));
5859 }
5860
5861 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5862    of MODE.  */
5863
5864 static inline bool
5865 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5866 {
5867   HOST_WIDE_INT multiple;
5868   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5869           && IN_RANGE (multiple, -256, 255));
5870 }
5871
5872 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5873    of MODE.  */
5874
5875 static inline bool
5876 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5877 {
5878   HOST_WIDE_INT multiple;
5879   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5880           && IN_RANGE (multiple, 0, 4095));
5881 }
5882
5883 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
5884
5885 static sbitmap
5886 aarch64_get_separate_components (void)
5887 {
5888   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5889   bitmap_clear (components);
5890
5891   /* The registers we need saved to the frame.  */
5892   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5893     if (aarch64_register_saved_on_entry (regno))
5894       {
5895         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5896         if (!frame_pointer_needed)
5897           offset += cfun->machine->frame.frame_size
5898                     - cfun->machine->frame.hard_fp_offset;
5899         /* Check that we can access the stack slot of the register with one
5900            direct load with no adjustments needed.  */
5901         if (offset_12bit_unsigned_scaled_p (DImode, offset))
5902           bitmap_set_bit (components, regno);
5903       }
5904
5905   /* Don't mess with the hard frame pointer.  */
5906   if (frame_pointer_needed)
5907     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5908
5909   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5910   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5911   /* If registers have been chosen to be stored/restored with
5912      writeback don't interfere with them to avoid having to output explicit
5913      stack adjustment instructions.  */
5914   if (reg2 != INVALID_REGNUM)
5915     bitmap_clear_bit (components, reg2);
5916   if (reg1 != INVALID_REGNUM)
5917     bitmap_clear_bit (components, reg1);
5918
5919   bitmap_clear_bit (components, LR_REGNUM);
5920   bitmap_clear_bit (components, SP_REGNUM);
5921
5922   return components;
5923 }
5924
5925 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
5926
5927 static sbitmap
5928 aarch64_components_for_bb (basic_block bb)
5929 {
5930   bitmap in = DF_LIVE_IN (bb);
5931   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5932   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5933   bool simd_function = aarch64_simd_decl_p (cfun->decl);
5934
5935   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5936   bitmap_clear (components);
5937
5938   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
5939   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5940     if ((!call_used_regs[regno]
5941         || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5942        && (bitmap_bit_p (in, regno)
5943            || bitmap_bit_p (gen, regno)
5944            || bitmap_bit_p (kill, regno)))
5945       {
5946         unsigned regno2, offset, offset2;
5947         bitmap_set_bit (components, regno);
5948
5949         /* If there is a callee-save at an adjacent offset, add it too
5950            to increase the use of LDP/STP.  */
5951         offset = cfun->machine->frame.reg_offset[regno];
5952         regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5953
5954         if (regno2 <= LAST_SAVED_REGNUM)
5955           {
5956             offset2 = cfun->machine->frame.reg_offset[regno2];
5957             if ((offset & ~8) == (offset2 & ~8))
5958               bitmap_set_bit (components, regno2);
5959           }
5960       }
5961
5962   return components;
5963 }
5964
5965 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5966    Nothing to do for aarch64.  */
5967
5968 static void
5969 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5970 {
5971 }
5972
5973 /* Return the next set bit in BMP from START onwards.  Return the total number
5974    of bits in BMP if no set bit is found at or after START.  */
5975
5976 static unsigned int
5977 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5978 {
5979   unsigned int nbits = SBITMAP_SIZE (bmp);
5980   if (start == nbits)
5981     return start;
5982
5983   gcc_assert (start < nbits);
5984   for (unsigned int i = start; i < nbits; i++)
5985     if (bitmap_bit_p (bmp, i))
5986       return i;
5987
5988   return nbits;
5989 }
5990
5991 /* Do the work for aarch64_emit_prologue_components and
5992    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
5993    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5994    for these components or the epilogue sequence.  That is, it determines
5995    whether we should emit stores or loads and what kind of CFA notes to attach
5996    to the insns.  Otherwise the logic for the two sequences is very
5997    similar.  */
5998
5999 static void
6000 aarch64_process_components (sbitmap components, bool prologue_p)
6001 {
6002   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6003                              ? HARD_FRAME_POINTER_REGNUM
6004                              : STACK_POINTER_REGNUM);
6005
6006   unsigned last_regno = SBITMAP_SIZE (components);
6007   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6008   rtx_insn *insn = NULL;
6009
6010   while (regno != last_regno)
6011     {
6012       /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6013          so DFmode for the vector registers is enough.  For simd functions
6014          we want to save the low 128 bits.  */
6015       machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6016
6017       rtx reg = gen_rtx_REG (mode, regno);
6018       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6019       if (!frame_pointer_needed)
6020         offset += cfun->machine->frame.frame_size
6021                   - cfun->machine->frame.hard_fp_offset;
6022       rtx addr = plus_constant (Pmode, ptr_reg, offset);
6023       rtx mem = gen_frame_mem (mode, addr);
6024
6025       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6026       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6027       /* No more registers to handle after REGNO.
6028          Emit a single save/restore and exit.  */
6029       if (regno2 == last_regno)
6030         {
6031           insn = emit_insn (set);
6032           RTX_FRAME_RELATED_P (insn) = 1;
6033           if (prologue_p)
6034             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6035           else
6036             add_reg_note (insn, REG_CFA_RESTORE, reg);
6037           break;
6038         }
6039
6040       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6041       /* The next register is not of the same class or its offset is not
6042          mergeable with the current one into a pair.  */
6043       if (!satisfies_constraint_Ump (mem)
6044           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6045           || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6046           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6047                        GET_MODE_SIZE (mode)))
6048         {
6049           insn = emit_insn (set);
6050           RTX_FRAME_RELATED_P (insn) = 1;
6051           if (prologue_p)
6052             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6053           else
6054             add_reg_note (insn, REG_CFA_RESTORE, reg);
6055
6056           regno = regno2;
6057           continue;
6058         }
6059
6060       /* REGNO2 can be saved/restored in a pair with REGNO.  */
6061       rtx reg2 = gen_rtx_REG (mode, regno2);
6062       if (!frame_pointer_needed)
6063         offset2 += cfun->machine->frame.frame_size
6064                   - cfun->machine->frame.hard_fp_offset;
6065       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6066       rtx mem2 = gen_frame_mem (mode, addr2);
6067       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6068                              : gen_rtx_SET (reg2, mem2);
6069
6070       if (prologue_p)
6071         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6072       else
6073         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6074
6075       RTX_FRAME_RELATED_P (insn) = 1;
6076       if (prologue_p)
6077         {
6078           add_reg_note (insn, REG_CFA_OFFSET, set);
6079           add_reg_note (insn, REG_CFA_OFFSET, set2);
6080         }
6081       else
6082         {
6083           add_reg_note (insn, REG_CFA_RESTORE, reg);
6084           add_reg_note (insn, REG_CFA_RESTORE, reg2);
6085         }
6086
6087       regno = aarch64_get_next_set_bit (components, regno2 + 1);
6088     }
6089 }
6090
6091 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
6092
6093 static void
6094 aarch64_emit_prologue_components (sbitmap components)
6095 {
6096   aarch64_process_components (components, true);
6097 }
6098
6099 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
6100
6101 static void
6102 aarch64_emit_epilogue_components (sbitmap components)
6103 {
6104   aarch64_process_components (components, false);
6105 }
6106
6107 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
6108
6109 static void
6110 aarch64_set_handled_components (sbitmap components)
6111 {
6112   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6113     if (bitmap_bit_p (components, regno))
6114       cfun->machine->reg_is_wrapped_separately[regno] = true;
6115 }
6116
6117 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
6118    determining the probe offset for alloca.  */
6119
6120 static HOST_WIDE_INT
6121 aarch64_stack_clash_protection_alloca_probe_range (void)
6122 {
6123   return STACK_CLASH_CALLER_GUARD;
6124 }
6125
6126
6127 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6128    registers.  If POLY_SIZE is not large enough to require a probe this function
6129    will only adjust the stack.  When allocating the stack space
6130    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6131    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6132    arguments.  If we are then we ensure that any allocation larger than the ABI
6133    defined buffer needs a probe so that the invariant of having a 1KB buffer is
6134    maintained.
6135
6136    We emit barriers after each stack adjustment to prevent optimizations from
6137    breaking the invariant that we never drop the stack more than a page.  This
6138    invariant is needed to make it easier to correctly handle asynchronous
6139    events, e.g. if we were to allow the stack to be dropped by more than a page
6140    and then have multiple probes up and we take a signal somewhere in between
6141    then the signal handler doesn't know the state of the stack and can make no
6142    assumptions about which pages have been probed.  */
6143
6144 static void
6145 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6146                                         poly_int64 poly_size,
6147                                         bool frame_related_p,
6148                                         bool final_adjustment_p)
6149 {
6150   HOST_WIDE_INT guard_size
6151     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6152   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6153   /* When doing the final adjustment for the outgoing argument size we can't
6154      assume that LR was saved at position 0.  So subtract it's offset from the
6155      ABI safe buffer so that we don't accidentally allow an adjustment that
6156      would result in an allocation larger than the ABI buffer without
6157      probing.  */
6158   HOST_WIDE_INT min_probe_threshold
6159     = final_adjustment_p
6160       ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6161       : guard_size - guard_used_by_caller;
6162
6163   poly_int64 frame_size = cfun->machine->frame.frame_size;
6164
6165   /* We should always have a positive probe threshold.  */
6166   gcc_assert (min_probe_threshold > 0);
6167
6168   if (flag_stack_clash_protection && !final_adjustment_p)
6169     {
6170       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6171       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6172
6173       if (known_eq (frame_size, 0))
6174         {
6175           dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6176         }
6177       else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6178                && known_lt (final_adjust, guard_used_by_caller))
6179         {
6180           dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6181         }
6182     }
6183
6184   /* If SIZE is not large enough to require probing, just adjust the stack and
6185      exit.  */
6186   if (known_lt (poly_size, min_probe_threshold)
6187       || !flag_stack_clash_protection)
6188     {
6189       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6190       return;
6191     }
6192
6193   HOST_WIDE_INT size;
6194   /* Handle the SVE non-constant case first.  */
6195   if (!poly_size.is_constant (&size))
6196     {
6197      if (dump_file)
6198       {
6199         fprintf (dump_file, "Stack clash SVE prologue: ");
6200         print_dec (poly_size, dump_file);
6201         fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6202       }
6203
6204       /* First calculate the amount of bytes we're actually spilling.  */
6205       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6206                           poly_size, temp1, temp2, false, true);
6207
6208       rtx_insn *insn = get_last_insn ();
6209
6210       if (frame_related_p)
6211         {
6212           /* This is done to provide unwinding information for the stack
6213              adjustments we're about to do, however to prevent the optimizers
6214              from removing the R11 move and leaving the CFA note (which would be
6215              very wrong) we tie the old and new stack pointer together.
6216              The tie will expand to nothing but the optimizers will not touch
6217              the instruction.  */
6218           rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6219           emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6220           emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6221
6222           /* We want the CFA independent of the stack pointer for the
6223              duration of the loop.  */
6224           add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6225           RTX_FRAME_RELATED_P (insn) = 1;
6226         }
6227
6228       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6229       rtx guard_const = gen_int_mode (guard_size, Pmode);
6230
6231       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6232                                                    stack_pointer_rtx, temp1,
6233                                                    probe_const, guard_const));
6234
6235       /* Now reset the CFA register if needed.  */
6236       if (frame_related_p)
6237         {
6238           add_reg_note (insn, REG_CFA_DEF_CFA,
6239                         gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6240                                       gen_int_mode (poly_size, Pmode)));
6241           RTX_FRAME_RELATED_P (insn) = 1;
6242         }
6243
6244       return;
6245     }
6246
6247   if (dump_file)
6248     fprintf (dump_file,
6249              "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6250              " bytes, probing will be required.\n", size);
6251
6252   /* Round size to the nearest multiple of guard_size, and calculate the
6253      residual as the difference between the original size and the rounded
6254      size.  */
6255   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6256   HOST_WIDE_INT residual = size - rounded_size;
6257
6258   /* We can handle a small number of allocations/probes inline.  Otherwise
6259      punt to a loop.  */
6260   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6261     {
6262       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6263         {
6264           aarch64_sub_sp (NULL, temp2, guard_size, true);
6265           emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6266                                            guard_used_by_caller));
6267           emit_insn (gen_blockage ());
6268         }
6269       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6270     }
6271   else
6272     {
6273       /* Compute the ending address.  */
6274       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6275                           temp1, NULL, false, true);
6276       rtx_insn *insn = get_last_insn ();
6277
6278       /* For the initial allocation, we don't have a frame pointer
6279          set up, so we always need CFI notes.  If we're doing the
6280          final allocation, then we may have a frame pointer, in which
6281          case it is the CFA, otherwise we need CFI notes.
6282
6283          We can determine which allocation we are doing by looking at
6284          the value of FRAME_RELATED_P since the final allocations are not
6285          frame related.  */
6286       if (frame_related_p)
6287         {
6288           /* We want the CFA independent of the stack pointer for the
6289              duration of the loop.  */
6290           add_reg_note (insn, REG_CFA_DEF_CFA,
6291                         plus_constant (Pmode, temp1, rounded_size));
6292           RTX_FRAME_RELATED_P (insn) = 1;
6293         }
6294
6295       /* This allocates and probes the stack.  Note that this re-uses some of
6296          the existing Ada stack protection code.  However we are guaranteed not
6297          to enter the non loop or residual branches of that code.
6298
6299          The non-loop part won't be entered because if our allocation amount
6300          doesn't require a loop, the case above would handle it.
6301
6302          The residual amount won't be entered because TEMP1 is a mutliple of
6303          the allocation size.  The residual will always be 0.  As such, the only
6304          part we are actually using from that code is the loop setup.  The
6305          actual probing is done in aarch64_output_probe_stack_range.  */
6306       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6307                                                stack_pointer_rtx, temp1));
6308
6309       /* Now reset the CFA register if needed.  */
6310       if (frame_related_p)
6311         {
6312           add_reg_note (insn, REG_CFA_DEF_CFA,
6313                         plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6314           RTX_FRAME_RELATED_P (insn) = 1;
6315         }
6316
6317       emit_insn (gen_blockage ());
6318       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6319     }
6320
6321   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
6322      be probed.  This maintains the requirement that each page is probed at
6323      least once.  For initial probing we probe only if the allocation is
6324      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6325      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
6326      GUARD_SIZE.  This works that for any allocation that is large enough to
6327      trigger a probe here, we'll have at least one, and if they're not large
6328      enough for this code to emit anything for them, The page would have been
6329      probed by the saving of FP/LR either by this function or any callees.  If
6330      we don't have any callees then we won't have more stack adjustments and so
6331      are still safe.  */
6332   if (residual)
6333     {
6334       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6335       /* If we're doing final adjustments, and we've done any full page
6336          allocations then any residual needs to be probed.  */
6337       if (final_adjustment_p && rounded_size != 0)
6338         min_probe_threshold = 0;
6339       /* If doing a small final adjustment, we always probe at offset 0.
6340          This is done to avoid issues when LR is not at position 0 or when
6341          the final adjustment is smaller than the probing offset.  */
6342       else if (final_adjustment_p && rounded_size == 0)
6343         residual_probe_offset = 0;
6344
6345       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6346       if (residual >= min_probe_threshold)
6347         {
6348           if (dump_file)
6349             fprintf (dump_file,
6350                      "Stack clash AArch64 prologue residuals: "
6351                      HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6352                      "\n", residual);
6353
6354             emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6355                                              residual_probe_offset));
6356           emit_insn (gen_blockage ());
6357         }
6358     }
6359 }
6360
6361 /* Return 1 if the register is used by the epilogue.  We need to say the
6362    return register is used, but only after epilogue generation is complete.
6363    Note that in the case of sibcalls, the values "used by the epilogue" are
6364    considered live at the start of the called function.
6365
6366    For SIMD functions we need to return 1 for FP registers that are saved and
6367    restored by a function but are not zero in call_used_regs.  If we do not do
6368    this optimizations may remove the restore of the register.  */
6369
6370 int
6371 aarch64_epilogue_uses (int regno)
6372 {
6373   if (epilogue_completed)
6374     {
6375       if (regno == LR_REGNUM)
6376         return 1;
6377       if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6378         return 1;
6379     }
6380   return 0;
6381 }
6382
6383 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6384    is saved at BASE + OFFSET.  */
6385
6386 static void
6387 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6388                             rtx base, poly_int64 offset)
6389 {
6390   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6391   add_reg_note (insn, REG_CFA_EXPRESSION,
6392                 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6393 }
6394
6395 /* AArch64 stack frames generated by this compiler look like:
6396
6397         +-------------------------------+
6398         |                               |
6399         |  incoming stack arguments     |
6400         |                               |
6401         +-------------------------------+
6402         |                               | <-- incoming stack pointer (aligned)
6403         |  callee-allocated save area   |
6404         |  for register varargs         |
6405         |                               |
6406         +-------------------------------+
6407         |  local variables              | <-- frame_pointer_rtx
6408         |                               |
6409         +-------------------------------+
6410         |  padding                      | \
6411         +-------------------------------+  |
6412         |  callee-saved registers       |  | frame.saved_regs_size
6413         +-------------------------------+  |
6414         |  LR'                          |  |
6415         +-------------------------------+  |
6416         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
6417         +-------------------------------+
6418         |  dynamic allocation           |
6419         +-------------------------------+
6420         |  padding                      |
6421         +-------------------------------+
6422         |  outgoing stack arguments     | <-- arg_pointer
6423         |                               |
6424         +-------------------------------+
6425         |                               | <-- stack_pointer_rtx (aligned)
6426
6427    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6428    but leave frame_pointer_rtx and hard_frame_pointer_rtx
6429    unchanged.
6430
6431    By default for stack-clash we assume the guard is at least 64KB, but this
6432    value is configurable to either 4KB or 64KB.  We also force the guard size to
6433    be the same as the probing interval and both values are kept in sync.
6434
6435    With those assumptions the callee can allocate up to 63KB (or 3KB depending
6436    on the guard size) of stack space without probing.
6437
6438    When probing is needed, we emit a probe at the start of the prologue
6439    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6440
6441    We have to track how much space has been allocated and the only stores
6442    to the stack we track as implicit probes are the FP/LR stores.
6443
6444    For outgoing arguments we probe if the size is larger than 1KB, such that
6445    the ABI specified buffer is maintained for the next callee.
6446
6447    The following registers are reserved during frame layout and should not be
6448    used for any other purpose:
6449
6450    - r11: Used by stack clash protection when SVE is enabled.
6451    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6452    - r14 and r15: Used for speculation tracking.
6453    - r16(IP0), r17(IP1): Used by indirect tailcalls.
6454    - r30(LR), r29(FP): Used by standard frame layout.
6455
6456    These registers must be avoided in frame layout related code unless the
6457    explicit intention is to interact with one of the features listed above.  */
6458
6459 /* Generate the prologue instructions for entry into a function.
6460    Establish the stack frame by decreasing the stack pointer with a
6461    properly calculated size and, if necessary, create a frame record
6462    filled with the values of LR and previous frame pointer.  The
6463    current FP is also set up if it is in use.  */
6464
6465 void
6466 aarch64_expand_prologue (void)
6467 {
6468   poly_int64 frame_size = cfun->machine->frame.frame_size;
6469   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6470   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6471   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6472   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6473   unsigned reg1 = cfun->machine->frame.wb_candidate1;
6474   unsigned reg2 = cfun->machine->frame.wb_candidate2;
6475   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6476   rtx_insn *insn;
6477
6478   /* Sign return address for functions.  */
6479   if (aarch64_return_address_signing_enabled ())
6480     {
6481       switch (aarch64_ra_sign_key)
6482         {
6483           case AARCH64_KEY_A:
6484             insn = emit_insn (gen_paciasp ());
6485             break;
6486           case AARCH64_KEY_B:
6487             insn = emit_insn (gen_pacibsp ());
6488             break;
6489           default:
6490             gcc_unreachable ();
6491         }
6492       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6493       RTX_FRAME_RELATED_P (insn) = 1;
6494     }
6495
6496   if (flag_stack_usage_info)
6497     current_function_static_stack_size = constant_lower_bound (frame_size);
6498
6499   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6500     {
6501       if (crtl->is_leaf && !cfun->calls_alloca)
6502         {
6503           if (maybe_gt (frame_size, PROBE_INTERVAL)
6504               && maybe_gt (frame_size, get_stack_check_protect ()))
6505             aarch64_emit_probe_stack_range (get_stack_check_protect (),
6506                                             (frame_size
6507                                              - get_stack_check_protect ()));
6508         }
6509       else if (maybe_gt (frame_size, 0))
6510         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6511     }
6512
6513   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6514   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6515
6516   /* In theory we should never have both an initial adjustment
6517      and a callee save adjustment.  Verify that is the case since the
6518      code below does not handle it for -fstack-clash-protection.  */
6519   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6520
6521   /* Will only probe if the initial adjustment is larger than the guard
6522      less the amount of the guard reserved for use by the caller's
6523      outgoing args.  */
6524   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6525                                           true, false);
6526
6527   if (callee_adjust != 0)
6528     aarch64_push_regs (reg1, reg2, callee_adjust);
6529
6530   if (emit_frame_chain)
6531     {
6532       poly_int64 reg_offset = callee_adjust;
6533       if (callee_adjust == 0)
6534         {
6535           reg1 = R29_REGNUM;
6536           reg2 = R30_REGNUM;
6537           reg_offset = callee_offset;
6538           aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6539         }
6540       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6541                           stack_pointer_rtx, callee_offset,
6542                           tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6543       if (frame_pointer_needed && !frame_size.is_constant ())
6544         {
6545           /* Variable-sized frames need to describe the save slot
6546              address using DW_CFA_expression rather than DW_CFA_offset.
6547              This means that, without taking further action, the
6548              locations of the registers that we've already saved would
6549              remain based on the stack pointer even after we redefine
6550              the CFA based on the frame pointer.  We therefore need new
6551              DW_CFA_expressions to re-express the save slots with addresses
6552              based on the frame pointer.  */
6553           rtx_insn *insn = get_last_insn ();
6554           gcc_assert (RTX_FRAME_RELATED_P (insn));
6555
6556           /* Add an explicit CFA definition if this was previously
6557              implicit.  */
6558           if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6559             {
6560               rtx src = plus_constant (Pmode, stack_pointer_rtx,
6561                                        callee_offset);
6562               add_reg_note (insn, REG_CFA_ADJUST_CFA,
6563                             gen_rtx_SET (hard_frame_pointer_rtx, src));
6564             }
6565
6566           /* Change the save slot expressions for the registers that
6567              we've already saved.  */
6568           reg_offset -= callee_offset;
6569           aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6570                                       reg_offset + UNITS_PER_WORD);
6571           aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6572                                       reg_offset);
6573         }
6574       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6575     }
6576
6577   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6578                              callee_adjust != 0 || emit_frame_chain);
6579   if (aarch64_simd_decl_p (cfun->decl))
6580     aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6581                                callee_adjust != 0 || emit_frame_chain);
6582   else
6583     aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6584                                callee_adjust != 0 || emit_frame_chain);
6585
6586   /* We may need to probe the final adjustment if it is larger than the guard
6587      that is assumed by the called.  */
6588   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6589                                           !frame_pointer_needed, true);
6590 }
6591
6592 /* Return TRUE if we can use a simple_return insn.
6593
6594    This function checks whether the callee saved stack is empty, which
6595    means no restore actions are need. The pro_and_epilogue will use
6596    this to check whether shrink-wrapping opt is feasible.  */
6597
6598 bool
6599 aarch64_use_return_insn_p (void)
6600 {
6601   if (!reload_completed)
6602     return false;
6603
6604   if (crtl->profile)
6605     return false;
6606
6607   return known_eq (cfun->machine->frame.frame_size, 0);
6608 }
6609
6610 /* Return false for non-leaf SIMD functions in order to avoid
6611    shrink-wrapping them.  Doing this will lose the necessary
6612    save/restore of FP registers.  */
6613
6614 bool
6615 aarch64_use_simple_return_insn_p (void)
6616 {
6617   if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6618     return false;
6619
6620   return true;
6621 }
6622
6623 /* Generate the epilogue instructions for returning from a function.
6624    This is almost exactly the reverse of the prolog sequence, except
6625    that we need to insert barriers to avoid scheduling loads that read
6626    from a deallocated stack, and we optimize the unwind records by
6627    emitting them all together if possible.  */
6628 void
6629 aarch64_expand_epilogue (bool for_sibcall)
6630 {
6631   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6632   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6633   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6634   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6635   unsigned reg1 = cfun->machine->frame.wb_candidate1;
6636   unsigned reg2 = cfun->machine->frame.wb_candidate2;
6637   rtx cfi_ops = NULL;
6638   rtx_insn *insn;
6639   /* A stack clash protection prologue may not have left EP0_REGNUM or
6640      EP1_REGNUM in a usable state.  The same is true for allocations
6641      with an SVE component, since we then need both temporary registers
6642      for each allocation.  For stack clash we are in a usable state if
6643      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
6644   HOST_WIDE_INT guard_size
6645     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6646   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6647
6648   /* We can re-use the registers when the allocation amount is smaller than
6649      guard_size - guard_used_by_caller because we won't be doing any probes
6650      then.  In such situations the register should remain live with the correct
6651      value.  */
6652   bool can_inherit_p = (initial_adjust.is_constant ()
6653                         && final_adjust.is_constant ())
6654                         && (!flag_stack_clash_protection
6655                             || known_lt (initial_adjust,
6656                                          guard_size - guard_used_by_caller));
6657
6658   /* We need to add memory barrier to prevent read from deallocated stack.  */
6659   bool need_barrier_p
6660     = maybe_ne (get_frame_size ()
6661                 + cfun->machine->frame.saved_varargs_size, 0);
6662
6663   /* Emit a barrier to prevent loads from a deallocated stack.  */
6664   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6665       || cfun->calls_alloca
6666       || crtl->calls_eh_return)
6667     {
6668       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6669       need_barrier_p = false;
6670     }
6671
6672   /* Restore the stack pointer from the frame pointer if it may not
6673      be the same as the stack pointer.  */
6674   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6675   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6676   if (frame_pointer_needed
6677       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6678     /* If writeback is used when restoring callee-saves, the CFA
6679        is restored on the instruction doing the writeback.  */
6680     aarch64_add_offset (Pmode, stack_pointer_rtx,
6681                         hard_frame_pointer_rtx, -callee_offset,
6682                         tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6683   else
6684      /* The case where we need to re-use the register here is very rare, so
6685         avoid the complicated condition and just always emit a move if the
6686         immediate doesn't fit.  */
6687      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6688
6689   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6690                                 callee_adjust != 0, &cfi_ops);
6691   if (aarch64_simd_decl_p (cfun->decl))
6692     aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6693                                   callee_adjust != 0, &cfi_ops);
6694   else
6695     aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6696                                   callee_adjust != 0, &cfi_ops);
6697
6698   if (need_barrier_p)
6699     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6700
6701   if (callee_adjust != 0)
6702     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6703
6704   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6705     {
6706       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
6707       insn = get_last_insn ();
6708       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6709       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6710       RTX_FRAME_RELATED_P (insn) = 1;
6711       cfi_ops = NULL;
6712     }
6713
6714   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
6715      add restriction on emit_move optimization to leaf functions.  */
6716   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6717                   (!can_inherit_p || !crtl->is_leaf
6718                    || df_regs_ever_live_p (EP0_REGNUM)));
6719
6720   if (cfi_ops)
6721     {
6722       /* Emit delayed restores and reset the CFA to be SP.  */
6723       insn = get_last_insn ();
6724       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6725       REG_NOTES (insn) = cfi_ops;
6726       RTX_FRAME_RELATED_P (insn) = 1;
6727     }
6728
6729   /* We prefer to emit the combined return/authenticate instruction RETAA,
6730      however there are three cases in which we must instead emit an explicit
6731      authentication instruction.
6732
6733         1) Sibcalls don't return in a normal way, so if we're about to call one
6734            we must authenticate.
6735
6736         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6737            generating code for !TARGET_ARMV8_3 we can't use it and must
6738            explicitly authenticate.
6739
6740         3) On an eh_return path we make extra stack adjustments to update the
6741            canonical frame address to be the exception handler's CFA.  We want
6742            to authenticate using the CFA of the function which calls eh_return.
6743     */
6744   if (aarch64_return_address_signing_enabled ()
6745       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6746     {
6747       switch (aarch64_ra_sign_key)
6748         {
6749           case AARCH64_KEY_A:
6750             insn = emit_insn (gen_autiasp ());
6751             break;
6752           case AARCH64_KEY_B:
6753             insn = emit_insn (gen_autibsp ());
6754             break;
6755           default:
6756             gcc_unreachable ();
6757         }
6758       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6759       RTX_FRAME_RELATED_P (insn) = 1;
6760     }
6761
6762   /* Stack adjustment for exception handler.  */
6763   if (crtl->calls_eh_return && !for_sibcall)
6764     {
6765       /* We need to unwind the stack by the offset computed by
6766          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
6767          to be SP; letting the CFA move during this adjustment
6768          is just as correct as retaining the CFA from the body
6769          of the function.  Therefore, do nothing special.  */
6770       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6771     }
6772
6773   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6774   if (!for_sibcall)
6775     emit_jump_insn (ret_rtx);
6776 }
6777
6778 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
6779    normally or return to a previous frame after unwinding.
6780
6781    An EH return uses a single shared return sequence.  The epilogue is
6782    exactly like a normal epilogue except that it has an extra input
6783    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6784    that must be applied after the frame has been destroyed.  An extra label
6785    is inserted before the epilogue which initializes this register to zero,
6786    and this is the entry point for a normal return.
6787
6788    An actual EH return updates the return address, initializes the stack
6789    adjustment and jumps directly into the epilogue (bypassing the zeroing
6790    of the adjustment).  Since the return address is typically saved on the
6791    stack when a function makes a call, the saved LR must be updated outside
6792    the epilogue.
6793
6794    This poses problems as the store is generated well before the epilogue,
6795    so the offset of LR is not known yet.  Also optimizations will remove the
6796    store as it appears dead, even after the epilogue is generated (as the
6797    base or offset for loading LR is different in many cases).
6798
6799    To avoid these problems this implementation forces the frame pointer
6800    in eh_return functions so that the location of LR is fixed and known early.
6801    It also marks the store volatile, so no optimization is permitted to
6802    remove the store.  */
6803 rtx
6804 aarch64_eh_return_handler_rtx (void)
6805 {
6806   rtx tmp = gen_frame_mem (Pmode,
6807     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6808
6809   /* Mark the store volatile, so no optimization is permitted to remove it.  */
6810   MEM_VOLATILE_P (tmp) = true;
6811   return tmp;
6812 }
6813
6814 /* Output code to add DELTA to the first argument, and then jump
6815    to FUNCTION.  Used for C++ multiple inheritance.  */
6816 static void
6817 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6818                          HOST_WIDE_INT delta,
6819                          HOST_WIDE_INT vcall_offset,
6820                          tree function)
6821 {
6822   /* The this pointer is always in x0.  Note that this differs from
6823      Arm where the this pointer maybe bumped to r1 if r0 is required
6824      to return a pointer to an aggregate.  On AArch64 a result value
6825      pointer will be in x8.  */
6826   int this_regno = R0_REGNUM;
6827   rtx this_rtx, temp0, temp1, addr, funexp;
6828   rtx_insn *insn;
6829   const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6830
6831   if (aarch64_bti_enabled ())
6832     emit_insn (gen_bti_c());
6833
6834   reload_completed = 1;
6835   emit_note (NOTE_INSN_PROLOGUE_END);
6836
6837   this_rtx = gen_rtx_REG (Pmode, this_regno);
6838   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6839   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6840
6841   if (vcall_offset == 0)
6842     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6843   else
6844     {
6845       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6846
6847       addr = this_rtx;
6848       if (delta != 0)
6849         {
6850           if (delta >= -256 && delta < 256)
6851             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6852                                        plus_constant (Pmode, this_rtx, delta));
6853           else
6854             aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6855                                 temp1, temp0, false);
6856         }
6857
6858       if (Pmode == ptr_mode)
6859         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6860       else
6861         aarch64_emit_move (temp0,
6862                            gen_rtx_ZERO_EXTEND (Pmode,
6863                                                 gen_rtx_MEM (ptr_mode, addr)));
6864
6865       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6866           addr = plus_constant (Pmode, temp0, vcall_offset);
6867       else
6868         {
6869           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6870                                           Pmode);
6871           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6872         }
6873
6874       if (Pmode == ptr_mode)
6875         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
6876       else
6877         aarch64_emit_move (temp1,
6878                            gen_rtx_SIGN_EXTEND (Pmode,
6879                                                 gen_rtx_MEM (ptr_mode, addr)));
6880
6881       emit_insn (gen_add2_insn (this_rtx, temp1));
6882     }
6883
6884   /* Generate a tail call to the target function.  */
6885   if (!TREE_USED (function))
6886     {
6887       assemble_external (function);
6888       TREE_USED (function) = 1;
6889     }
6890   funexp = XEXP (DECL_RTL (function), 0);
6891   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6892   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6893   SIBLING_CALL_P (insn) = 1;
6894
6895   insn = get_insns ();
6896   shorten_branches (insn);
6897
6898   assemble_start_function (thunk, fnname);
6899   final_start_function (insn, file, 1);
6900   final (insn, file, 1);
6901   final_end_function ();
6902   assemble_end_function (thunk, fnname);
6903
6904   /* Stop pretending to be a post-reload pass.  */
6905   reload_completed = 0;
6906 }
6907
6908 static bool
6909 aarch64_tls_referenced_p (rtx x)
6910 {
6911   if (!TARGET_HAVE_TLS)
6912     return false;
6913   subrtx_iterator::array_type array;
6914   FOR_EACH_SUBRTX (iter, array, x, ALL)
6915     {
6916       const_rtx x = *iter;
6917       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6918         return true;
6919       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6920          TLS offsets, not real symbol references.  */
6921       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6922         iter.skip_subrtxes ();
6923     }
6924   return false;
6925 }
6926
6927
6928 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6929    a left shift of 0 or 12 bits.  */
6930 bool
6931 aarch64_uimm12_shift (HOST_WIDE_INT val)
6932 {
6933   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6934           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6935           );
6936 }
6937
6938 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6939    that can be created with a left shift of 0 or 12.  */
6940 static HOST_WIDE_INT
6941 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6942 {
6943   /* Check to see if the value fits in 24 bits, as that is the maximum we can
6944      handle correctly.  */
6945   gcc_assert ((val & 0xffffff) == val);
6946
6947   if (((val & 0xfff) << 0) == val)
6948     return val;
6949
6950   return val & (0xfff << 12);
6951 }
6952
6953 /* Return true if val is an immediate that can be loaded into a
6954    register by a MOVZ instruction.  */
6955 static bool
6956 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6957 {
6958   if (GET_MODE_SIZE (mode) > 4)
6959     {
6960       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6961           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6962         return 1;
6963     }
6964   else
6965     {
6966       /* Ignore sign extension.  */
6967       val &= (HOST_WIDE_INT) 0xffffffff;
6968     }
6969   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6970           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6971 }
6972
6973 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
6974    64-bit (DImode) integer.  */
6975
6976 static unsigned HOST_WIDE_INT
6977 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6978 {
6979   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6980   while (size < 64)
6981     {
6982       val &= (HOST_WIDE_INT_1U << size) - 1;
6983       val |= val << size;
6984       size *= 2;
6985     }
6986   return val;
6987 }
6988
6989 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
6990
6991 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6992   {
6993     0x0000000100000001ull,
6994     0x0001000100010001ull,
6995     0x0101010101010101ull,
6996     0x1111111111111111ull,
6997     0x5555555555555555ull,
6998   };
6999
7000
7001 /* Return true if val is a valid bitmask immediate.  */
7002
7003 bool
7004 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7005 {
7006   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7007   int bits;
7008
7009   /* Check for a single sequence of one bits and return quickly if so.
7010      The special cases of all ones and all zeroes returns false.  */
7011   val = aarch64_replicate_bitmask_imm (val_in, mode);
7012   tmp = val + (val & -val);
7013
7014   if (tmp == (tmp & -tmp))
7015     return (val + 1) > 1;
7016
7017   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
7018   if (mode == SImode)
7019     val = (val << 32) | (val & 0xffffffff);
7020
7021   /* Invert if the immediate doesn't start with a zero bit - this means we
7022      only need to search for sequences of one bits.  */
7023   if (val & 1)
7024     val = ~val;
7025
7026   /* Find the first set bit and set tmp to val with the first sequence of one
7027      bits removed.  Return success if there is a single sequence of ones.  */
7028   first_one = val & -val;
7029   tmp = val & (val + first_one);
7030
7031   if (tmp == 0)
7032     return true;
7033
7034   /* Find the next set bit and compute the difference in bit position.  */
7035   next_one = tmp & -tmp;
7036   bits = clz_hwi (first_one) - clz_hwi (next_one);
7037   mask = val ^ tmp;
7038
7039   /* Check the bit position difference is a power of 2, and that the first
7040      sequence of one bits fits within 'bits' bits.  */
7041   if ((mask >> bits) != 0 || bits != (bits & -bits))
7042     return false;
7043
7044   /* Check the sequence of one bits is repeated 64/bits times.  */
7045   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7046 }
7047
7048 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7049    Assumed precondition: VAL_IN Is not zero.  */
7050
7051 unsigned HOST_WIDE_INT
7052 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7053 {
7054   int lowest_bit_set = ctz_hwi (val_in);
7055   int highest_bit_set = floor_log2 (val_in);
7056   gcc_assert (val_in != 0);
7057
7058   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7059           (HOST_WIDE_INT_1U << lowest_bit_set));
7060 }
7061
7062 /* Create constant where bits outside of lowest bit set to highest bit set
7063    are set to 1.  */
7064
7065 unsigned HOST_WIDE_INT
7066 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7067 {
7068   return val_in | ~aarch64_and_split_imm1 (val_in);
7069 }
7070
7071 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
7072
7073 bool
7074 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7075 {
7076   scalar_int_mode int_mode;
7077   if (!is_a <scalar_int_mode> (mode, &int_mode))
7078     return false;
7079
7080   if (aarch64_bitmask_imm (val_in, int_mode))
7081     return false;
7082
7083   if (aarch64_move_imm (val_in, int_mode))
7084     return false;
7085
7086   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7087
7088   return aarch64_bitmask_imm (imm2, int_mode);
7089 }
7090
7091 /* Return true if val is an immediate that can be loaded into a
7092    register in a single instruction.  */
7093 bool
7094 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7095 {
7096   scalar_int_mode int_mode;
7097   if (!is_a <scalar_int_mode> (mode, &int_mode))
7098     return false;
7099
7100   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7101     return 1;
7102   return aarch64_bitmask_imm (val, int_mode);
7103 }
7104
7105 static bool
7106 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7107 {
7108   rtx base, offset;
7109
7110   if (GET_CODE (x) == HIGH)
7111     return true;
7112
7113   /* There's no way to calculate VL-based values using relocations.  */
7114   subrtx_iterator::array_type array;
7115   FOR_EACH_SUBRTX (iter, array, x, ALL)
7116     if (GET_CODE (*iter) == CONST_POLY_INT)
7117       return true;
7118
7119   split_const (x, &base, &offset);
7120   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7121     {
7122       if (aarch64_classify_symbol (base, INTVAL (offset))
7123           != SYMBOL_FORCE_TO_MEM)
7124         return true;
7125       else
7126         /* Avoid generating a 64-bit relocation in ILP32; leave
7127            to aarch64_expand_mov_immediate to handle it properly.  */
7128         return mode != ptr_mode;
7129     }
7130
7131   return aarch64_tls_referenced_p (x);
7132 }
7133
7134 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7135    The expansion for a table switch is quite expensive due to the number
7136    of instructions, the table lookup and hard to predict indirect jump.
7137    When optimizing for speed, and -O3 enabled, use the per-core tuning if
7138    set, otherwise use tables for > 16 cases as a tradeoff between size and
7139    performance.  When optimizing for size, use the default setting.  */
7140
7141 static unsigned int
7142 aarch64_case_values_threshold (void)
7143 {
7144   /* Use the specified limit for the number of cases before using jump
7145      tables at higher optimization levels.  */
7146   if (optimize > 2
7147       && selected_cpu->tune->max_case_values != 0)
7148     return selected_cpu->tune->max_case_values;
7149   else
7150     return optimize_size ? default_case_values_threshold () : 17;
7151 }
7152
7153 /* Return true if register REGNO is a valid index register.
7154    STRICT_P is true if REG_OK_STRICT is in effect.  */
7155
7156 bool
7157 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7158 {
7159   if (!HARD_REGISTER_NUM_P (regno))
7160     {
7161       if (!strict_p)
7162         return true;
7163
7164       if (!reg_renumber)
7165         return false;
7166
7167       regno = reg_renumber[regno];
7168     }
7169   return GP_REGNUM_P (regno);
7170 }
7171
7172 /* Return true if register REGNO is a valid base register for mode MODE.
7173    STRICT_P is true if REG_OK_STRICT is in effect.  */
7174
7175 bool
7176 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7177 {
7178   if (!HARD_REGISTER_NUM_P (regno))
7179     {
7180       if (!strict_p)
7181         return true;
7182
7183       if (!reg_renumber)
7184         return false;
7185
7186       regno = reg_renumber[regno];
7187     }
7188
7189   /* The fake registers will be eliminated to either the stack or
7190      hard frame pointer, both of which are usually valid base registers.
7191      Reload deals with the cases where the eliminated form isn't valid.  */
7192   return (GP_REGNUM_P (regno)
7193           || regno == SP_REGNUM
7194           || regno == FRAME_POINTER_REGNUM
7195           || regno == ARG_POINTER_REGNUM);
7196 }
7197
7198 /* Return true if X is a valid base register for mode MODE.
7199    STRICT_P is true if REG_OK_STRICT is in effect.  */
7200
7201 static bool
7202 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7203 {
7204   if (!strict_p
7205       && GET_CODE (x) == SUBREG
7206       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7207     x = SUBREG_REG (x);
7208
7209   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7210 }
7211
7212 /* Return true if address offset is a valid index.  If it is, fill in INFO
7213    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
7214
7215 static bool
7216 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7217                         machine_mode mode, bool strict_p)
7218 {
7219   enum aarch64_address_type type;
7220   rtx index;
7221   int shift;
7222
7223   /* (reg:P) */
7224   if ((REG_P (x) || GET_CODE (x) == SUBREG)
7225       && GET_MODE (x) == Pmode)
7226     {
7227       type = ADDRESS_REG_REG;
7228       index = x;
7229       shift = 0;
7230     }
7231   /* (sign_extend:DI (reg:SI)) */
7232   else if ((GET_CODE (x) == SIGN_EXTEND
7233             || GET_CODE (x) == ZERO_EXTEND)
7234            && GET_MODE (x) == DImode
7235            && GET_MODE (XEXP (x, 0)) == SImode)
7236     {
7237       type = (GET_CODE (x) == SIGN_EXTEND)
7238         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7239       index = XEXP (x, 0);
7240       shift = 0;
7241     }
7242   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7243   else if (GET_CODE (x) == MULT
7244            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7245                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7246            && GET_MODE (XEXP (x, 0)) == DImode
7247            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7248            && CONST_INT_P (XEXP (x, 1)))
7249     {
7250       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7251         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7252       index = XEXP (XEXP (x, 0), 0);
7253       shift = exact_log2 (INTVAL (XEXP (x, 1)));
7254     }
7255   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7256   else if (GET_CODE (x) == ASHIFT
7257            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7258                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7259            && GET_MODE (XEXP (x, 0)) == DImode
7260            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7261            && CONST_INT_P (XEXP (x, 1)))
7262     {
7263       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7264         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7265       index = XEXP (XEXP (x, 0), 0);
7266       shift = INTVAL (XEXP (x, 1));
7267     }
7268   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7269   else if ((GET_CODE (x) == SIGN_EXTRACT
7270             || GET_CODE (x) == ZERO_EXTRACT)
7271            && GET_MODE (x) == DImode
7272            && GET_CODE (XEXP (x, 0)) == MULT
7273            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7274            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7275     {
7276       type = (GET_CODE (x) == SIGN_EXTRACT)
7277         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7278       index = XEXP (XEXP (x, 0), 0);
7279       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7280       if (INTVAL (XEXP (x, 1)) != 32 + shift
7281           || INTVAL (XEXP (x, 2)) != 0)
7282         shift = -1;
7283     }
7284   /* (and:DI (mult:DI (reg:DI) (const_int scale))
7285      (const_int 0xffffffff<<shift)) */
7286   else if (GET_CODE (x) == AND
7287            && GET_MODE (x) == DImode
7288            && GET_CODE (XEXP (x, 0)) == MULT
7289            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7290            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7291            && CONST_INT_P (XEXP (x, 1)))
7292     {
7293       type = ADDRESS_REG_UXTW;
7294       index = XEXP (XEXP (x, 0), 0);
7295       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7296       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7297         shift = -1;
7298     }
7299   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7300   else if ((GET_CODE (x) == SIGN_EXTRACT
7301             || GET_CODE (x) == ZERO_EXTRACT)
7302            && GET_MODE (x) == DImode
7303            && GET_CODE (XEXP (x, 0)) == ASHIFT
7304            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7305            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7306     {
7307       type = (GET_CODE (x) == SIGN_EXTRACT)
7308         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7309       index = XEXP (XEXP (x, 0), 0);
7310       shift = INTVAL (XEXP (XEXP (x, 0), 1));
7311       if (INTVAL (XEXP (x, 1)) != 32 + shift
7312           || INTVAL (XEXP (x, 2)) != 0)
7313         shift = -1;
7314     }
7315   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7316      (const_int 0xffffffff<<shift)) */
7317   else if (GET_CODE (x) == AND
7318            && GET_MODE (x) == DImode
7319            && GET_CODE (XEXP (x, 0)) == ASHIFT
7320            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7321            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7322            && CONST_INT_P (XEXP (x, 1)))
7323     {
7324       type = ADDRESS_REG_UXTW;
7325       index = XEXP (XEXP (x, 0), 0);
7326       shift = INTVAL (XEXP (XEXP (x, 0), 1));
7327       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7328         shift = -1;
7329     }
7330   /* (mult:P (reg:P) (const_int scale)) */
7331   else if (GET_CODE (x) == MULT
7332            && GET_MODE (x) == Pmode
7333            && GET_MODE (XEXP (x, 0)) == Pmode
7334            && CONST_INT_P (XEXP (x, 1)))
7335     {
7336       type = ADDRESS_REG_REG;
7337       index = XEXP (x, 0);
7338       shift = exact_log2 (INTVAL (XEXP (x, 1)));
7339     }
7340   /* (ashift:P (reg:P) (const_int shift)) */
7341   else if (GET_CODE (x) == ASHIFT
7342            && GET_MODE (x) == Pmode
7343            && GET_MODE (XEXP (x, 0)) == Pmode
7344            && CONST_INT_P (XEXP (x, 1)))
7345     {
7346       type = ADDRESS_REG_REG;
7347       index = XEXP (x, 0);
7348       shift = INTVAL (XEXP (x, 1));
7349     }
7350   else
7351     return false;
7352
7353   if (!strict_p
7354       && GET_CODE (index) == SUBREG
7355       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7356     index = SUBREG_REG (index);
7357
7358   if (aarch64_sve_data_mode_p (mode))
7359     {
7360       if (type != ADDRESS_REG_REG
7361           || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7362         return false;
7363     }
7364   else
7365     {
7366       if (shift != 0
7367           && !(IN_RANGE (shift, 1, 3)
7368                && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7369         return false;
7370     }
7371
7372   if (REG_P (index)
7373       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7374     {
7375       info->type = type;
7376       info->offset = index;
7377       info->shift = shift;
7378       return true;
7379     }
7380
7381   return false;
7382 }
7383
7384 /* Return true if MODE is one of the modes for which we
7385    support LDP/STP operations.  */
7386
7387 static bool
7388 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7389 {
7390   return mode == SImode || mode == DImode
7391          || mode == SFmode || mode == DFmode
7392          || (aarch64_vector_mode_supported_p (mode)
7393              && (known_eq (GET_MODE_SIZE (mode), 8)
7394                  || (known_eq (GET_MODE_SIZE (mode), 16)
7395                     && (aarch64_tune_params.extra_tuning_flags
7396                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7397 }
7398
7399 /* Return true if REGNO is a virtual pointer register, or an eliminable
7400    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
7401    include stack_pointer or hard_frame_pointer.  */
7402 static bool
7403 virt_or_elim_regno_p (unsigned regno)
7404 {
7405   return ((regno >= FIRST_VIRTUAL_REGISTER
7406            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7407           || regno == FRAME_POINTER_REGNUM
7408           || regno == ARG_POINTER_REGNUM);
7409 }
7410
7411 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7412    If it is, fill in INFO appropriately.  STRICT_P is true if
7413    REG_OK_STRICT is in effect.  */
7414
7415 bool
7416 aarch64_classify_address (struct aarch64_address_info *info,
7417                           rtx x, machine_mode mode, bool strict_p,
7418                           aarch64_addr_query_type type)
7419 {
7420   enum rtx_code code = GET_CODE (x);
7421   rtx op0, op1;
7422   poly_int64 offset;
7423
7424   HOST_WIDE_INT const_size;
7425
7426   /* On BE, we use load/store pair for all large int mode load/stores.
7427      TI/TFmode may also use a load/store pair.  */
7428   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7429   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7430   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7431                             || type == ADDR_QUERY_LDP_STP_N
7432                             || mode == TImode
7433                             || mode == TFmode
7434                             || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7435
7436   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7437      corresponds to the actual size of the memory being loaded/stored and the
7438      mode of the corresponding addressing mode is half of that.  */
7439   if (type == ADDR_QUERY_LDP_STP_N
7440       && known_eq (GET_MODE_SIZE (mode), 16))
7441     mode = DFmode;
7442
7443   bool allow_reg_index_p = (!load_store_pair_p
7444                             && (known_lt (GET_MODE_SIZE (mode), 16)
7445                                 || vec_flags == VEC_ADVSIMD
7446                                 || vec_flags & VEC_SVE_DATA));
7447
7448   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7449      [Rn, #offset, MUL VL].  */
7450   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7451       && (code != REG && code != PLUS))
7452     return false;
7453
7454   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7455      REG addressing.  */
7456   if (advsimd_struct_p
7457       && !BYTES_BIG_ENDIAN
7458       && (code != POST_INC && code != REG))
7459     return false;
7460
7461   gcc_checking_assert (GET_MODE (x) == VOIDmode
7462                        || SCALAR_INT_MODE_P (GET_MODE (x)));
7463
7464   switch (code)
7465     {
7466     case REG:
7467     case SUBREG:
7468       info->type = ADDRESS_REG_IMM;
7469       info->base = x;
7470       info->offset = const0_rtx;
7471       info->const_offset = 0;
7472       return aarch64_base_register_rtx_p (x, strict_p);
7473
7474     case PLUS:
7475       op0 = XEXP (x, 0);
7476       op1 = XEXP (x, 1);
7477
7478       if (! strict_p
7479           && REG_P (op0)
7480           && virt_or_elim_regno_p (REGNO (op0))
7481           && poly_int_rtx_p (op1, &offset))
7482         {
7483           info->type = ADDRESS_REG_IMM;
7484           info->base = op0;
7485           info->offset = op1;
7486           info->const_offset = offset;
7487
7488           return true;
7489         }
7490
7491       if (maybe_ne (GET_MODE_SIZE (mode), 0)
7492           && aarch64_base_register_rtx_p (op0, strict_p)
7493           && poly_int_rtx_p (op1, &offset))
7494         {
7495           info->type = ADDRESS_REG_IMM;
7496           info->base = op0;
7497           info->offset = op1;
7498           info->const_offset = offset;
7499
7500           /* TImode and TFmode values are allowed in both pairs of X
7501              registers and individual Q registers.  The available
7502              address modes are:
7503              X,X: 7-bit signed scaled offset
7504              Q:   9-bit signed offset
7505              We conservatively require an offset representable in either mode.
7506              When performing the check for pairs of X registers i.e.  LDP/STP
7507              pass down DImode since that is the natural size of the LDP/STP
7508              instruction memory accesses.  */
7509           if (mode == TImode || mode == TFmode)
7510             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7511                     && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7512                         || offset_12bit_unsigned_scaled_p (mode, offset)));
7513
7514           /* A 7bit offset check because OImode will emit a ldp/stp
7515              instruction (only big endian will get here).
7516              For ldp/stp instructions, the offset is scaled for the size of a
7517              single element of the pair.  */
7518           if (mode == OImode)
7519             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7520
7521           /* Three 9/12 bit offsets checks because CImode will emit three
7522              ldr/str instructions (only big endian will get here).  */
7523           if (mode == CImode)
7524             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7525                     && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7526                                                                offset + 32)
7527                         || offset_12bit_unsigned_scaled_p (V16QImode,
7528                                                            offset + 32)));
7529
7530           /* Two 7bit offsets checks because XImode will emit two ldp/stp
7531              instructions (only big endian will get here).  */
7532           if (mode == XImode)
7533             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7534                     && aarch64_offset_7bit_signed_scaled_p (TImode,
7535                                                             offset + 32));
7536
7537           /* Make "m" use the LD1 offset range for SVE data modes, so
7538              that pre-RTL optimizers like ivopts will work to that
7539              instead of the wider LDR/STR range.  */
7540           if (vec_flags == VEC_SVE_DATA)
7541             return (type == ADDR_QUERY_M
7542                     ? offset_4bit_signed_scaled_p (mode, offset)
7543                     : offset_9bit_signed_scaled_p (mode, offset));
7544
7545           if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7546             {
7547               poly_int64 end_offset = (offset
7548                                        + GET_MODE_SIZE (mode)
7549                                        - BYTES_PER_SVE_VECTOR);
7550               return (type == ADDR_QUERY_M
7551                       ? offset_4bit_signed_scaled_p (mode, offset)
7552                       : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7553                          && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7554                                                          end_offset)));
7555             }
7556
7557           if (vec_flags == VEC_SVE_PRED)
7558             return offset_9bit_signed_scaled_p (mode, offset);
7559
7560           if (load_store_pair_p)
7561             return ((known_eq (GET_MODE_SIZE (mode), 4)
7562                      || known_eq (GET_MODE_SIZE (mode), 8)
7563                      || known_eq (GET_MODE_SIZE (mode), 16))
7564                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7565           else
7566             return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7567                     || offset_12bit_unsigned_scaled_p (mode, offset));
7568         }
7569
7570       if (allow_reg_index_p)
7571         {
7572           /* Look for base + (scaled/extended) index register.  */
7573           if (aarch64_base_register_rtx_p (op0, strict_p)
7574               && aarch64_classify_index (info, op1, mode, strict_p))
7575             {
7576               info->base = op0;
7577               return true;
7578             }
7579           if (aarch64_base_register_rtx_p (op1, strict_p)
7580               && aarch64_classify_index (info, op0, mode, strict_p))
7581             {
7582               info->base = op1;
7583               return true;
7584             }
7585         }
7586
7587       return false;
7588
7589     case POST_INC:
7590     case POST_DEC:
7591     case PRE_INC:
7592     case PRE_DEC:
7593       info->type = ADDRESS_REG_WB;
7594       info->base = XEXP (x, 0);
7595       info->offset = NULL_RTX;
7596       return aarch64_base_register_rtx_p (info->base, strict_p);
7597
7598     case POST_MODIFY:
7599     case PRE_MODIFY:
7600       info->type = ADDRESS_REG_WB;
7601       info->base = XEXP (x, 0);
7602       if (GET_CODE (XEXP (x, 1)) == PLUS
7603           && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7604           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7605           && aarch64_base_register_rtx_p (info->base, strict_p))
7606         {
7607           info->offset = XEXP (XEXP (x, 1), 1);
7608           info->const_offset = offset;
7609
7610           /* TImode and TFmode values are allowed in both pairs of X
7611              registers and individual Q registers.  The available
7612              address modes are:
7613              X,X: 7-bit signed scaled offset
7614              Q:   9-bit signed offset
7615              We conservatively require an offset representable in either mode.
7616            */
7617           if (mode == TImode || mode == TFmode)
7618             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7619                     && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7620
7621           if (load_store_pair_p)
7622             return ((known_eq (GET_MODE_SIZE (mode), 4)
7623                      || known_eq (GET_MODE_SIZE (mode), 8)
7624                      || known_eq (GET_MODE_SIZE (mode), 16))
7625                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7626           else
7627             return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7628         }
7629       return false;
7630
7631     case CONST:
7632     case SYMBOL_REF:
7633     case LABEL_REF:
7634       /* load literal: pc-relative constant pool entry.  Only supported
7635          for SI mode or larger.  */
7636       info->type = ADDRESS_SYMBOLIC;
7637
7638       if (!load_store_pair_p
7639           && GET_MODE_SIZE (mode).is_constant (&const_size)
7640           && const_size >= 4)
7641         {
7642           rtx sym, addend;
7643
7644           split_const (x, &sym, &addend);
7645           return ((GET_CODE (sym) == LABEL_REF
7646                    || (GET_CODE (sym) == SYMBOL_REF
7647                        && CONSTANT_POOL_ADDRESS_P (sym)
7648                        && aarch64_pcrelative_literal_loads)));
7649         }
7650       return false;
7651
7652     case LO_SUM:
7653       info->type = ADDRESS_LO_SUM;
7654       info->base = XEXP (x, 0);
7655       info->offset = XEXP (x, 1);
7656       if (allow_reg_index_p
7657           && aarch64_base_register_rtx_p (info->base, strict_p))
7658         {
7659           rtx sym, offs;
7660           split_const (info->offset, &sym, &offs);
7661           if (GET_CODE (sym) == SYMBOL_REF
7662               && (aarch64_classify_symbol (sym, INTVAL (offs))
7663                   == SYMBOL_SMALL_ABSOLUTE))
7664             {
7665               /* The symbol and offset must be aligned to the access size.  */
7666               unsigned int align;
7667
7668               if (CONSTANT_POOL_ADDRESS_P (sym))
7669                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7670               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7671                 {
7672                   tree exp = SYMBOL_REF_DECL (sym);
7673                   align = TYPE_ALIGN (TREE_TYPE (exp));
7674                   align = aarch64_constant_alignment (exp, align);
7675                 }
7676               else if (SYMBOL_REF_DECL (sym))
7677                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7678               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7679                        && SYMBOL_REF_BLOCK (sym) != NULL)
7680                 align = SYMBOL_REF_BLOCK (sym)->alignment;
7681               else
7682                 align = BITS_PER_UNIT;
7683
7684               poly_int64 ref_size = GET_MODE_SIZE (mode);
7685               if (known_eq (ref_size, 0))
7686                 ref_size = GET_MODE_SIZE (DImode);
7687
7688               return (multiple_p (INTVAL (offs), ref_size)
7689                       && multiple_p (align / BITS_PER_UNIT, ref_size));
7690             }
7691         }
7692       return false;
7693
7694     default:
7695       return false;
7696     }
7697 }
7698
7699 /* Return true if the address X is valid for a PRFM instruction.
7700    STRICT_P is true if we should do strict checking with
7701    aarch64_classify_address.  */
7702
7703 bool
7704 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7705 {
7706   struct aarch64_address_info addr;
7707
7708   /* PRFM accepts the same addresses as DImode...  */
7709   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7710   if (!res)
7711     return false;
7712
7713   /* ... except writeback forms.  */
7714   return addr.type != ADDRESS_REG_WB;
7715 }
7716
7717 bool
7718 aarch64_symbolic_address_p (rtx x)
7719 {
7720   rtx offset;
7721
7722   split_const (x, &x, &offset);
7723   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7724 }
7725
7726 /* Classify the base of symbolic expression X.  */
7727
7728 enum aarch64_symbol_type
7729 aarch64_classify_symbolic_expression (rtx x)
7730 {
7731   rtx offset;
7732
7733   split_const (x, &x, &offset);
7734   return aarch64_classify_symbol (x, INTVAL (offset));
7735 }
7736
7737
7738 /* Return TRUE if X is a legitimate address for accessing memory in
7739    mode MODE.  */
7740 static bool
7741 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7742 {
7743   struct aarch64_address_info addr;
7744
7745   return aarch64_classify_address (&addr, x, mode, strict_p);
7746 }
7747
7748 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7749    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
7750 bool
7751 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7752                               aarch64_addr_query_type type)
7753 {
7754   struct aarch64_address_info addr;
7755
7756   return aarch64_classify_address (&addr, x, mode, strict_p, type);
7757 }
7758
7759 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
7760
7761 static bool
7762 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7763                                          poly_int64 orig_offset,
7764                                          machine_mode mode)
7765 {
7766   HOST_WIDE_INT size;
7767   if (GET_MODE_SIZE (mode).is_constant (&size))
7768     {
7769       HOST_WIDE_INT const_offset, second_offset;
7770
7771       /* A general SVE offset is A * VQ + B.  Remove the A component from
7772          coefficient 0 in order to get the constant B.  */
7773       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7774
7775       /* Split an out-of-range address displacement into a base and
7776          offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
7777          range otherwise to increase opportunities for sharing the base
7778          address of different sizes.  Unaligned accesses use the signed
7779          9-bit range, TImode/TFmode use the intersection of signed
7780          scaled 7-bit and signed 9-bit offset.  */
7781       if (mode == TImode || mode == TFmode)
7782         second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7783       else if ((const_offset & (size - 1)) != 0)
7784         second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7785       else
7786         second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7787
7788       if (second_offset == 0 || known_eq (orig_offset, second_offset))
7789         return false;
7790
7791       /* Split the offset into second_offset and the rest.  */
7792       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7793       *offset2 = gen_int_mode (second_offset, Pmode);
7794       return true;
7795     }
7796   else
7797     {
7798       /* Get the mode we should use as the basis of the range.  For structure
7799          modes this is the mode of one vector.  */
7800       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7801       machine_mode step_mode
7802         = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7803
7804       /* Get the "mul vl" multiplier we'd like to use.  */
7805       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7806       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7807       if (vec_flags & VEC_SVE_DATA)
7808         /* LDR supports a 9-bit range, but the move patterns for
7809            structure modes require all vectors to be in range of the
7810            same base.  The simplest way of accomodating that while still
7811            promoting reuse of anchor points between different modes is
7812            to use an 8-bit range unconditionally.  */
7813         vnum = ((vnum + 128) & 255) - 128;
7814       else
7815         /* Predicates are only handled singly, so we might as well use
7816            the full range.  */
7817         vnum = ((vnum + 256) & 511) - 256;
7818       if (vnum == 0)
7819         return false;
7820
7821       /* Convert the "mul vl" multiplier into a byte offset.  */
7822       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7823       if (known_eq (second_offset, orig_offset))
7824         return false;
7825
7826       /* Split the offset into second_offset and the rest.  */
7827       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7828       *offset2 = gen_int_mode (second_offset, Pmode);
7829       return true;
7830     }
7831 }
7832
7833 /* Return the binary representation of floating point constant VALUE in INTVAL.
7834    If the value cannot be converted, return false without setting INTVAL.
7835    The conversion is done in the given MODE.  */
7836 bool
7837 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7838 {
7839
7840   /* We make a general exception for 0.  */
7841   if (aarch64_float_const_zero_rtx_p (value))
7842     {
7843       *intval = 0;
7844       return true;
7845     }
7846
7847   scalar_float_mode mode;
7848   if (GET_CODE (value) != CONST_DOUBLE
7849       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7850       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7851       /* Only support up to DF mode.  */
7852       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7853     return false;
7854
7855   unsigned HOST_WIDE_INT ival = 0;
7856
7857   long res[2];
7858   real_to_target (res,
7859                   CONST_DOUBLE_REAL_VALUE (value),
7860                   REAL_MODE_FORMAT (mode));
7861
7862   if (mode == DFmode)
7863     {
7864       int order = BYTES_BIG_ENDIAN ? 1 : 0;
7865       ival = zext_hwi (res[order], 32);
7866       ival |= (zext_hwi (res[1 - order], 32) << 32);
7867     }
7868   else
7869       ival = zext_hwi (res[0], 32);
7870
7871   *intval = ival;
7872   return true;
7873 }
7874
7875 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7876    single MOV(+MOVK) followed by an FMOV.  */
7877 bool
7878 aarch64_float_const_rtx_p (rtx x)
7879 {
7880   machine_mode mode = GET_MODE (x);
7881   if (mode == VOIDmode)
7882     return false;
7883
7884   /* Determine whether it's cheaper to write float constants as
7885      mov/movk pairs over ldr/adrp pairs.  */
7886   unsigned HOST_WIDE_INT ival;
7887
7888   if (GET_CODE (x) == CONST_DOUBLE
7889       && SCALAR_FLOAT_MODE_P (mode)
7890       && aarch64_reinterpret_float_as_int (x, &ival))
7891     {
7892       scalar_int_mode imode = (mode == HFmode
7893                                ? SImode
7894                                : int_mode_for_mode (mode).require ());
7895       int num_instr = aarch64_internal_mov_immediate
7896                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7897       return num_instr < 3;
7898     }
7899
7900   return false;
7901 }
7902
7903 /* Return TRUE if rtx X is immediate constant 0.0 */
7904 bool
7905 aarch64_float_const_zero_rtx_p (rtx x)
7906 {
7907   if (GET_MODE (x) == VOIDmode)
7908     return false;
7909
7910   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7911     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7912   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7913 }
7914
7915 /* Return TRUE if rtx X is immediate constant that fits in a single
7916    MOVI immediate operation.  */
7917 bool
7918 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7919 {
7920   if (!TARGET_SIMD)
7921      return false;
7922
7923   machine_mode vmode;
7924   scalar_int_mode imode;
7925   unsigned HOST_WIDE_INT ival;
7926
7927   if (GET_CODE (x) == CONST_DOUBLE
7928       && SCALAR_FLOAT_MODE_P (mode))
7929     {
7930       if (!aarch64_reinterpret_float_as_int (x, &ival))
7931         return false;
7932
7933       /* We make a general exception for 0.  */
7934       if (aarch64_float_const_zero_rtx_p (x))
7935         return true;
7936
7937       imode = int_mode_for_mode (mode).require ();
7938     }
7939   else if (GET_CODE (x) == CONST_INT
7940            && is_a <scalar_int_mode> (mode, &imode))
7941     ival = INTVAL (x);
7942   else
7943     return false;
7944
7945    /* use a 64 bit mode for everything except for DI/DF mode, where we use
7946      a 128 bit vector mode.  */
7947   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7948
7949   vmode = aarch64_simd_container_mode (imode, width);
7950   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7951
7952   return aarch64_simd_valid_immediate (v_op, NULL);
7953 }
7954
7955
7956 /* Return the fixed registers used for condition codes.  */
7957
7958 static bool
7959 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7960 {
7961   *p1 = CC_REGNUM;
7962   *p2 = INVALID_REGNUM;
7963   return true;
7964 }
7965
7966 /* This function is used by the call expanders of the machine description.
7967    RESULT is the register in which the result is returned.  It's NULL for
7968    "call" and "sibcall".
7969    MEM is the location of the function call.
7970    SIBCALL indicates whether this function call is normal call or sibling call.
7971    It will generate different pattern accordingly.  */
7972
7973 void
7974 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7975 {
7976   rtx call, callee, tmp;
7977   rtvec vec;
7978   machine_mode mode;
7979
7980   gcc_assert (MEM_P (mem));
7981   callee = XEXP (mem, 0);
7982   mode = GET_MODE (callee);
7983   gcc_assert (mode == Pmode);
7984
7985   /* Decide if we should generate indirect calls by loading the
7986      address of the callee into a register before performing
7987      the branch-and-link.  */
7988   if (SYMBOL_REF_P (callee)
7989       ? (aarch64_is_long_call_p (callee)
7990          || aarch64_is_noplt_call_p (callee))
7991       : !REG_P (callee))
7992     XEXP (mem, 0) = force_reg (mode, callee);
7993
7994   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7995
7996   if (result != NULL_RTX)
7997     call = gen_rtx_SET (result, call);
7998
7999   if (sibcall)
8000     tmp = ret_rtx;
8001   else
8002     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8003
8004   vec = gen_rtvec (2, call, tmp);
8005   call = gen_rtx_PARALLEL (VOIDmode, vec);
8006
8007   aarch64_emit_call_insn (call);
8008 }
8009
8010 /* Emit call insn with PAT and do aarch64-specific handling.  */
8011
8012 void
8013 aarch64_emit_call_insn (rtx pat)
8014 {
8015   rtx insn = emit_call_insn (pat);
8016
8017   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8018   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8019   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8020 }
8021
8022 machine_mode
8023 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8024 {
8025   machine_mode mode_x = GET_MODE (x);
8026   rtx_code code_x = GET_CODE (x);
8027
8028   /* All floating point compares return CCFP if it is an equality
8029      comparison, and CCFPE otherwise.  */
8030   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8031     {
8032       switch (code)
8033         {
8034         case EQ:
8035         case NE:
8036         case UNORDERED:
8037         case ORDERED:
8038         case UNLT:
8039         case UNLE:
8040         case UNGT:
8041         case UNGE:
8042         case UNEQ:
8043           return CCFPmode;
8044
8045         case LT:
8046         case LE:
8047         case GT:
8048         case GE:
8049         case LTGT:
8050           return CCFPEmode;
8051
8052         default:
8053           gcc_unreachable ();
8054         }
8055     }
8056
8057   /* Equality comparisons of short modes against zero can be performed
8058      using the TST instruction with the appropriate bitmask.  */
8059   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8060       && (code == EQ || code == NE)
8061       && (mode_x == HImode || mode_x == QImode))
8062     return CC_NZmode;
8063
8064   /* Similarly, comparisons of zero_extends from shorter modes can
8065      be performed using an ANDS with an immediate mask.  */
8066   if (y == const0_rtx && code_x == ZERO_EXTEND
8067       && (mode_x == SImode || mode_x == DImode)
8068       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8069       && (code == EQ || code == NE))
8070     return CC_NZmode;
8071
8072   if ((mode_x == SImode || mode_x == DImode)
8073       && y == const0_rtx
8074       && (code == EQ || code == NE || code == LT || code == GE)
8075       && (code_x == PLUS || code_x == MINUS || code_x == AND
8076           || code_x == NEG
8077           || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8078               && CONST_INT_P (XEXP (x, 2)))))
8079     return CC_NZmode;
8080
8081   /* A compare with a shifted operand.  Because of canonicalization,
8082      the comparison will have to be swapped when we emit the assembly
8083      code.  */
8084   if ((mode_x == SImode || mode_x == DImode)
8085       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8086       && (code_x == ASHIFT || code_x == ASHIFTRT
8087           || code_x == LSHIFTRT
8088           || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8089     return CC_SWPmode;
8090
8091   /* Similarly for a negated operand, but we can only do this for
8092      equalities.  */
8093   if ((mode_x == SImode || mode_x == DImode)
8094       && (REG_P (y) || GET_CODE (y) == SUBREG)
8095       && (code == EQ || code == NE)
8096       && code_x == NEG)
8097     return CC_Zmode;
8098
8099   /* A test for unsigned overflow from an addition.  */
8100   if ((mode_x == DImode || mode_x == TImode)
8101       && (code == LTU || code == GEU)
8102       && code_x == PLUS
8103       && rtx_equal_p (XEXP (x, 0), y))
8104     return CC_Cmode;
8105
8106   /* A test for unsigned overflow from an add with carry.  */
8107   if ((mode_x == DImode || mode_x == TImode)
8108       && (code == LTU || code == GEU)
8109       && code_x == PLUS
8110       && CONST_SCALAR_INT_P (y)
8111       && (rtx_mode_t (y, mode_x)
8112           == (wi::shwi (1, mode_x)
8113               << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8114     return CC_ADCmode;
8115
8116   /* A test for signed overflow.  */
8117   if ((mode_x == DImode || mode_x == TImode)
8118       && code == NE
8119       && code_x == PLUS
8120       && GET_CODE (y) == SIGN_EXTEND)
8121     return CC_Vmode;
8122
8123   /* For everything else, return CCmode.  */
8124   return CCmode;
8125 }
8126
8127 static int
8128 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8129
8130 int
8131 aarch64_get_condition_code (rtx x)
8132 {
8133   machine_mode mode = GET_MODE (XEXP (x, 0));
8134   enum rtx_code comp_code = GET_CODE (x);
8135
8136   if (GET_MODE_CLASS (mode) != MODE_CC)
8137     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8138   return aarch64_get_condition_code_1 (mode, comp_code);
8139 }
8140
8141 static int
8142 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8143 {
8144   switch (mode)
8145     {
8146     case E_CCFPmode:
8147     case E_CCFPEmode:
8148       switch (comp_code)
8149         {
8150         case GE: return AARCH64_GE;
8151         case GT: return AARCH64_GT;
8152         case LE: return AARCH64_LS;
8153         case LT: return AARCH64_MI;
8154         case NE: return AARCH64_NE;
8155         case EQ: return AARCH64_EQ;
8156         case ORDERED: return AARCH64_VC;
8157         case UNORDERED: return AARCH64_VS;
8158         case UNLT: return AARCH64_LT;
8159         case UNLE: return AARCH64_LE;
8160         case UNGT: return AARCH64_HI;
8161         case UNGE: return AARCH64_PL;
8162         default: return -1;
8163         }
8164       break;
8165
8166     case E_CCmode:
8167       switch (comp_code)
8168         {
8169         case NE: return AARCH64_NE;
8170         case EQ: return AARCH64_EQ;
8171         case GE: return AARCH64_GE;
8172         case GT: return AARCH64_GT;
8173         case LE: return AARCH64_LE;
8174         case LT: return AARCH64_LT;
8175         case GEU: return AARCH64_CS;
8176         case GTU: return AARCH64_HI;
8177         case LEU: return AARCH64_LS;
8178         case LTU: return AARCH64_CC;
8179         default: return -1;
8180         }
8181       break;
8182
8183     case E_CC_SWPmode:
8184       switch (comp_code)
8185         {
8186         case NE: return AARCH64_NE;
8187         case EQ: return AARCH64_EQ;
8188         case GE: return AARCH64_LE;
8189         case GT: return AARCH64_LT;
8190         case LE: return AARCH64_GE;
8191         case LT: return AARCH64_GT;
8192         case GEU: return AARCH64_LS;
8193         case GTU: return AARCH64_CC;
8194         case LEU: return AARCH64_CS;
8195         case LTU: return AARCH64_HI;
8196         default: return -1;
8197         }
8198       break;
8199
8200     case E_CC_NZCmode:
8201       switch (comp_code)
8202         {
8203         case NE: return AARCH64_NE; /* = any */
8204         case EQ: return AARCH64_EQ; /* = none */
8205         case GE: return AARCH64_PL; /* = nfrst */
8206         case LT: return AARCH64_MI; /* = first */
8207         case GEU: return AARCH64_CS; /* = nlast */
8208         case GTU: return AARCH64_HI; /* = pmore */
8209         case LEU: return AARCH64_LS; /* = plast */
8210         case LTU: return AARCH64_CC; /* = last */
8211         default: return -1;
8212         }
8213       break;
8214
8215     case E_CC_NZmode:
8216       switch (comp_code)
8217         {
8218         case NE: return AARCH64_NE;
8219         case EQ: return AARCH64_EQ;
8220         case GE: return AARCH64_PL;
8221         case LT: return AARCH64_MI;
8222         default: return -1;
8223         }
8224       break;
8225
8226     case E_CC_Zmode:
8227       switch (comp_code)
8228         {
8229         case NE: return AARCH64_NE;
8230         case EQ: return AARCH64_EQ;
8231         default: return -1;
8232         }
8233       break;
8234
8235     case E_CC_Cmode:
8236       switch (comp_code)
8237         {
8238         case LTU: return AARCH64_CS;
8239         case GEU: return AARCH64_CC;
8240         default: return -1;
8241         }
8242       break;
8243
8244     case E_CC_ADCmode:
8245       switch (comp_code)
8246         {
8247         case GEU: return AARCH64_CS;
8248         case LTU: return AARCH64_CC;
8249         default: return -1;
8250         }
8251       break;
8252
8253     case E_CC_Vmode:
8254       switch (comp_code)
8255         {
8256         case NE: return AARCH64_VS;
8257         case EQ: return AARCH64_VC;
8258         default: return -1;
8259         }
8260       break;
8261
8262     default:
8263       return -1;
8264     }
8265
8266   return -1;
8267 }
8268
8269 bool
8270 aarch64_const_vec_all_same_in_range_p (rtx x,
8271                                        HOST_WIDE_INT minval,
8272                                        HOST_WIDE_INT maxval)
8273 {
8274   rtx elt;
8275   return (const_vec_duplicate_p (x, &elt)
8276           && CONST_INT_P (elt)
8277           && IN_RANGE (INTVAL (elt), minval, maxval));
8278 }
8279
8280 bool
8281 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8282 {
8283   return aarch64_const_vec_all_same_in_range_p (x, val, val);
8284 }
8285
8286 /* Return true if VEC is a constant in which every element is in the range
8287    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
8288
8289 static bool
8290 aarch64_const_vec_all_in_range_p (rtx vec,
8291                                   HOST_WIDE_INT minval,
8292                                   HOST_WIDE_INT maxval)
8293 {
8294   if (GET_CODE (vec) != CONST_VECTOR
8295       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8296     return false;
8297
8298   int nunits;
8299   if (!CONST_VECTOR_STEPPED_P (vec))
8300     nunits = const_vector_encoded_nelts (vec);
8301   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8302     return false;
8303
8304   for (int i = 0; i < nunits; i++)
8305     {
8306       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8307       if (!CONST_INT_P (vec_elem)
8308           || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8309         return false;
8310     }
8311   return true;
8312 }
8313
8314 /* N Z C V.  */
8315 #define AARCH64_CC_V 1
8316 #define AARCH64_CC_C (1 << 1)
8317 #define AARCH64_CC_Z (1 << 2)
8318 #define AARCH64_CC_N (1 << 3)
8319
8320 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
8321 static const int aarch64_nzcv_codes[] =
8322 {
8323   0,            /* EQ, Z == 1.  */
8324   AARCH64_CC_Z, /* NE, Z == 0.  */
8325   0,            /* CS, C == 1.  */
8326   AARCH64_CC_C, /* CC, C == 0.  */
8327   0,            /* MI, N == 1.  */
8328   AARCH64_CC_N, /* PL, N == 0.  */
8329   0,            /* VS, V == 1.  */
8330   AARCH64_CC_V, /* VC, V == 0.  */
8331   0,            /* HI, C ==1 && Z == 0.  */
8332   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
8333   AARCH64_CC_V, /* GE, N == V.  */
8334   0,            /* LT, N != V.  */
8335   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
8336   0,            /* LE, !(Z == 0 && N == V).  */
8337   0,            /* AL, Any.  */
8338   0             /* NV, Any.  */
8339 };
8340
8341 /* Print floating-point vector immediate operand X to F, negating it
8342    first if NEGATE is true.  Return true on success, false if it isn't
8343    a constant we can handle.  */
8344
8345 static bool
8346 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8347 {
8348   rtx elt;
8349
8350   if (!const_vec_duplicate_p (x, &elt))
8351     return false;
8352
8353   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8354   if (negate)
8355     r = real_value_negate (&r);
8356
8357   /* Handle the SVE single-bit immediates specially, since they have a
8358      fixed form in the assembly syntax.  */
8359   if (real_equal (&r, &dconst0))
8360     asm_fprintf (f, "0.0");
8361   else if (real_equal (&r, &dconst2))
8362     asm_fprintf (f, "2.0");
8363   else if (real_equal (&r, &dconst1))
8364     asm_fprintf (f, "1.0");
8365   else if (real_equal (&r, &dconsthalf))
8366     asm_fprintf (f, "0.5");
8367   else
8368     {
8369       const int buf_size = 20;
8370       char float_buf[buf_size] = {'\0'};
8371       real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8372                                 1, GET_MODE (elt));
8373       asm_fprintf (f, "%s", float_buf);
8374     }
8375
8376   return true;
8377 }
8378
8379 /* Return the equivalent letter for size.  */
8380 static char
8381 sizetochar (int size)
8382 {
8383   switch (size)
8384     {
8385     case 64: return 'd';
8386     case 32: return 's';
8387     case 16: return 'h';
8388     case 8 : return 'b';
8389     default: gcc_unreachable ();
8390     }
8391 }
8392
8393 /* Print operand X to file F in a target specific manner according to CODE.
8394    The acceptable formatting commands given by CODE are:
8395      'c':               An integer or symbol address without a preceding #
8396                         sign.
8397      'C':               Take the duplicated element in a vector constant
8398                         and print it in hex.
8399      'D':               Take the duplicated element in a vector constant
8400                         and print it as an unsigned integer, in decimal.
8401      'e':               Print the sign/zero-extend size as a character 8->b,
8402                         16->h, 32->w.  Can also be used for masks:
8403                         0xff->b, 0xffff->h, 0xffffffff->w.
8404      'I':               If the operand is a duplicated vector constant,
8405                         replace it with the duplicated scalar.  If the
8406                         operand is then a floating-point constant, replace
8407                         it with the integer bit representation.  Print the
8408                         transformed constant as a signed decimal number.
8409      'p':               Prints N such that 2^N == X (X must be power of 2 and
8410                         const int).
8411      'P':               Print the number of non-zero bits in X (a const_int).
8412      'H':               Print the higher numbered register of a pair (TImode)
8413                         of regs.
8414      'm':               Print a condition (eq, ne, etc).
8415      'M':               Same as 'm', but invert condition.
8416      'N':               Take the duplicated element in a vector constant
8417                         and print the negative of it in decimal.
8418      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
8419      'S/T/U/V':         Print a FP/SIMD register name for a register list.
8420                         The register printed is the FP/SIMD register name
8421                         of X + 0/1/2/3 for S/T/U/V.
8422      'R':               Print a scalar FP/SIMD register name + 1.
8423      'X':               Print bottom 16 bits of integer constant in hex.
8424      'w/x':             Print a general register name or the zero register
8425                         (32-bit or 64-bit).
8426      '0':               Print a normal operand, if it's a general register,
8427                         then we assume DImode.
8428      'k':               Print NZCV for conditional compare instructions.
8429      'A':               Output address constant representing the first
8430                         argument of X, specifying a relocation offset
8431                         if appropriate.
8432      'L':               Output constant address specified by X
8433                         with a relocation offset if appropriate.
8434      'G':               Prints address of X, specifying a PC relative
8435                         relocation mode if appropriate.
8436      'y':               Output address of LDP or STP - this is used for
8437                         some LDP/STPs which don't use a PARALLEL in their
8438                         pattern (so the mode needs to be adjusted).
8439      'z':               Output address of a typical LDP or STP.  */
8440
8441 static void
8442 aarch64_print_operand (FILE *f, rtx x, int code)
8443 {
8444   rtx elt;
8445   switch (code)
8446     {
8447     case 'c':
8448       switch (GET_CODE (x))
8449         {
8450         case CONST_INT:
8451           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8452           break;
8453
8454         case SYMBOL_REF:
8455           output_addr_const (f, x);
8456           break;
8457
8458         case CONST:
8459           if (GET_CODE (XEXP (x, 0)) == PLUS
8460               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8461             {
8462               output_addr_const (f, x);
8463               break;
8464             }
8465           /* Fall through.  */
8466
8467         default:
8468           output_operand_lossage ("unsupported operand for code '%c'", code);
8469         }
8470       break;
8471
8472     case 'e':
8473       {
8474         x = unwrap_const_vec_duplicate (x);
8475         if (!CONST_INT_P (x))
8476           {
8477             output_operand_lossage ("invalid operand for '%%%c'", code);
8478             return;
8479           }
8480
8481         HOST_WIDE_INT val = INTVAL (x);
8482         if ((val & ~7) == 8 || val == 0xff)
8483           fputc ('b', f);
8484         else if ((val & ~7) == 16 || val == 0xffff)
8485           fputc ('h', f);
8486         else if ((val & ~7) == 32 || val == 0xffffffff)
8487           fputc ('w', f);
8488         else
8489           {
8490             output_operand_lossage ("invalid operand for '%%%c'", code);
8491             return;
8492           }
8493       }
8494       break;
8495
8496     case 'p':
8497       {
8498         int n;
8499
8500         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8501           {
8502             output_operand_lossage ("invalid operand for '%%%c'", code);
8503             return;
8504           }
8505
8506         asm_fprintf (f, "%d", n);
8507       }
8508       break;
8509
8510     case 'P':
8511       if (!CONST_INT_P (x))
8512         {
8513           output_operand_lossage ("invalid operand for '%%%c'", code);
8514           return;
8515         }
8516
8517       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8518       break;
8519
8520     case 'H':
8521       if (x == const0_rtx)
8522         {
8523           asm_fprintf (f, "xzr");
8524           break;
8525         }
8526
8527       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8528         {
8529           output_operand_lossage ("invalid operand for '%%%c'", code);
8530           return;
8531         }
8532
8533       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8534       break;
8535
8536     case 'I':
8537       {
8538         x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8539         if (CONST_INT_P (x))
8540           asm_fprintf (f, "%wd", INTVAL (x));
8541         else
8542           {
8543             output_operand_lossage ("invalid operand for '%%%c'", code);
8544             return;
8545           }
8546         break;
8547       }
8548
8549     case 'M':
8550     case 'm':
8551       {
8552         int cond_code;
8553         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
8554         if (x == const_true_rtx)
8555           {
8556             if (code == 'M')
8557               fputs ("nv", f);
8558             return;
8559           }
8560
8561         if (!COMPARISON_P (x))
8562           {
8563             output_operand_lossage ("invalid operand for '%%%c'", code);
8564             return;
8565           }
8566
8567         cond_code = aarch64_get_condition_code (x);
8568         gcc_assert (cond_code >= 0);
8569         if (code == 'M')
8570           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8571         if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8572           fputs (aarch64_sve_condition_codes[cond_code], f);
8573         else
8574           fputs (aarch64_condition_codes[cond_code], f);
8575       }
8576       break;
8577
8578     case 'N':
8579       if (!const_vec_duplicate_p (x, &elt))
8580         {
8581           output_operand_lossage ("invalid vector constant");
8582           return;
8583         }
8584
8585       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8586         asm_fprintf (f, "%wd", -INTVAL (elt));
8587       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8588                && aarch64_print_vector_float_operand (f, x, true))
8589         ;
8590       else
8591         {
8592           output_operand_lossage ("invalid vector constant");
8593           return;
8594         }
8595       break;
8596
8597     case 'b':
8598     case 'h':
8599     case 's':
8600     case 'd':
8601     case 'q':
8602       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8603         {
8604           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8605           return;
8606         }
8607       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8608       break;
8609
8610     case 'S':
8611     case 'T':
8612     case 'U':
8613     case 'V':
8614       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8615         {
8616           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8617           return;
8618         }
8619       asm_fprintf (f, "%c%d",
8620                    aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8621                    REGNO (x) - V0_REGNUM + (code - 'S'));
8622       break;
8623
8624     case 'R':
8625       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8626         {
8627           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8628           return;
8629         }
8630       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8631       break;
8632
8633     case 'X':
8634       if (!CONST_INT_P (x))
8635         {
8636           output_operand_lossage ("invalid operand for '%%%c'", code);
8637           return;
8638         }
8639       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8640       break;
8641
8642     case 'C':
8643       {
8644         /* Print a replicated constant in hex.  */
8645         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8646           {
8647             output_operand_lossage ("invalid operand for '%%%c'", code);
8648             return;
8649           }
8650         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8651         asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8652       }
8653       break;
8654
8655     case 'D':
8656       {
8657         /* Print a replicated constant in decimal, treating it as
8658            unsigned.  */
8659         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8660           {
8661             output_operand_lossage ("invalid operand for '%%%c'", code);
8662             return;
8663           }
8664         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8665         asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8666       }
8667       break;
8668
8669     case 'w':
8670     case 'x':
8671       if (x == const0_rtx
8672           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8673         {
8674           asm_fprintf (f, "%czr", code);
8675           break;
8676         }
8677
8678       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8679         {
8680           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8681           break;
8682         }
8683
8684       if (REG_P (x) && REGNO (x) == SP_REGNUM)
8685         {
8686           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8687           break;
8688         }
8689
8690       /* Fall through */
8691
8692     case 0:
8693       if (x == NULL)
8694         {
8695           output_operand_lossage ("missing operand");
8696           return;
8697         }
8698
8699       switch (GET_CODE (x))
8700         {
8701         case REG:
8702           if (aarch64_sve_data_mode_p (GET_MODE (x)))
8703             {
8704               if (REG_NREGS (x) == 1)
8705                 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8706               else
8707                 {
8708                   char suffix
8709                     = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8710                   asm_fprintf (f, "{z%d.%c - z%d.%c}",
8711                                REGNO (x) - V0_REGNUM, suffix,
8712                                END_REGNO (x) - V0_REGNUM - 1, suffix);
8713                 }
8714             }
8715           else
8716             asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8717           break;
8718
8719         case MEM:
8720           output_address (GET_MODE (x), XEXP (x, 0));
8721           break;
8722
8723         case LABEL_REF:
8724         case SYMBOL_REF:
8725           output_addr_const (asm_out_file, x);
8726           break;
8727
8728         case CONST_INT:
8729           asm_fprintf (f, "%wd", INTVAL (x));
8730           break;
8731
8732         case CONST:
8733           if (!VECTOR_MODE_P (GET_MODE (x)))
8734             {
8735               output_addr_const (asm_out_file, x);
8736               break;
8737             }
8738           /* fall through */
8739
8740         case CONST_VECTOR:
8741           if (!const_vec_duplicate_p (x, &elt))
8742             {
8743               output_operand_lossage ("invalid vector constant");
8744               return;
8745             }
8746
8747           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8748             asm_fprintf (f, "%wd", INTVAL (elt));
8749           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8750                    && aarch64_print_vector_float_operand (f, x, false))
8751             ;
8752           else
8753             {
8754               output_operand_lossage ("invalid vector constant");
8755               return;
8756             }
8757           break;
8758
8759         case CONST_DOUBLE:
8760           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8761              be getting CONST_DOUBLEs holding integers.  */
8762           gcc_assert (GET_MODE (x) != VOIDmode);
8763           if (aarch64_float_const_zero_rtx_p (x))
8764             {
8765               fputc ('0', f);
8766               break;
8767             }
8768           else if (aarch64_float_const_representable_p (x))
8769             {
8770 #define buf_size 20
8771               char float_buf[buf_size] = {'\0'};
8772               real_to_decimal_for_mode (float_buf,
8773                                         CONST_DOUBLE_REAL_VALUE (x),
8774                                         buf_size, buf_size,
8775                                         1, GET_MODE (x));
8776               asm_fprintf (asm_out_file, "%s", float_buf);
8777               break;
8778 #undef buf_size
8779             }
8780           output_operand_lossage ("invalid constant");
8781           return;
8782         default:
8783           output_operand_lossage ("invalid operand");
8784           return;
8785         }
8786       break;
8787
8788     case 'A':
8789       if (GET_CODE (x) == HIGH)
8790         x = XEXP (x, 0);
8791
8792       switch (aarch64_classify_symbolic_expression (x))
8793         {
8794         case SYMBOL_SMALL_GOT_4G:
8795           asm_fprintf (asm_out_file, ":got:");
8796           break;
8797
8798         case SYMBOL_SMALL_TLSGD:
8799           asm_fprintf (asm_out_file, ":tlsgd:");
8800           break;
8801
8802         case SYMBOL_SMALL_TLSDESC:
8803           asm_fprintf (asm_out_file, ":tlsdesc:");
8804           break;
8805
8806         case SYMBOL_SMALL_TLSIE:
8807           asm_fprintf (asm_out_file, ":gottprel:");
8808           break;
8809
8810         case SYMBOL_TLSLE24:
8811           asm_fprintf (asm_out_file, ":tprel:");
8812           break;
8813
8814         case SYMBOL_TINY_GOT:
8815           gcc_unreachable ();
8816           break;
8817
8818         default:
8819           break;
8820         }
8821       output_addr_const (asm_out_file, x);
8822       break;
8823
8824     case 'L':
8825       switch (aarch64_classify_symbolic_expression (x))
8826         {
8827         case SYMBOL_SMALL_GOT_4G:
8828           asm_fprintf (asm_out_file, ":lo12:");
8829           break;
8830
8831         case SYMBOL_SMALL_TLSGD:
8832           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8833           break;
8834
8835         case SYMBOL_SMALL_TLSDESC:
8836           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8837           break;
8838
8839         case SYMBOL_SMALL_TLSIE:
8840           asm_fprintf (asm_out_file, ":gottprel_lo12:");
8841           break;
8842
8843         case SYMBOL_TLSLE12:
8844           asm_fprintf (asm_out_file, ":tprel_lo12:");
8845           break;
8846
8847         case SYMBOL_TLSLE24:
8848           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8849           break;
8850
8851         case SYMBOL_TINY_GOT:
8852           asm_fprintf (asm_out_file, ":got:");
8853           break;
8854
8855         case SYMBOL_TINY_TLSIE:
8856           asm_fprintf (asm_out_file, ":gottprel:");
8857           break;
8858
8859         default:
8860           break;
8861         }
8862       output_addr_const (asm_out_file, x);
8863       break;
8864
8865     case 'G':
8866       switch (aarch64_classify_symbolic_expression (x))
8867         {
8868         case SYMBOL_TLSLE24:
8869           asm_fprintf (asm_out_file, ":tprel_hi12:");
8870           break;
8871         default:
8872           break;
8873         }
8874       output_addr_const (asm_out_file, x);
8875       break;
8876
8877     case 'k':
8878       {
8879         HOST_WIDE_INT cond_code;
8880
8881         if (!CONST_INT_P (x))
8882           {
8883             output_operand_lossage ("invalid operand for '%%%c'", code);
8884             return;
8885           }
8886
8887         cond_code = INTVAL (x);
8888         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8889         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8890       }
8891       break;
8892
8893     case 'y':
8894     case 'z':
8895       {
8896         machine_mode mode = GET_MODE (x);
8897
8898         if (GET_CODE (x) != MEM
8899             || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8900           {
8901             output_operand_lossage ("invalid operand for '%%%c'", code);
8902             return;
8903           }
8904
8905         if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8906                                             code == 'y'
8907                                             ? ADDR_QUERY_LDP_STP_N
8908                                             : ADDR_QUERY_LDP_STP))
8909           output_operand_lossage ("invalid operand prefix '%%%c'", code);
8910       }
8911       break;
8912
8913     default:
8914       output_operand_lossage ("invalid operand prefix '%%%c'", code);
8915       return;
8916     }
8917 }
8918
8919 /* Print address 'x' of a memory access with mode 'mode'.
8920    'op' is the context required by aarch64_classify_address.  It can either be
8921    MEM for a normal memory access or PARALLEL for LDP/STP.  */
8922 static bool
8923 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8924                                 aarch64_addr_query_type type)
8925 {
8926   struct aarch64_address_info addr;
8927   unsigned int size;
8928
8929   /* Check all addresses are Pmode - including ILP32.  */
8930   if (GET_MODE (x) != Pmode
8931       && (!CONST_INT_P (x)
8932           || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8933     {
8934       output_operand_lossage ("invalid address mode");
8935       return false;
8936     }
8937
8938   if (aarch64_classify_address (&addr, x, mode, true, type))
8939     switch (addr.type)
8940       {
8941       case ADDRESS_REG_IMM:
8942         if (known_eq (addr.const_offset, 0))
8943           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8944         else if (aarch64_sve_data_mode_p (mode))
8945           {
8946             HOST_WIDE_INT vnum
8947               = exact_div (addr.const_offset,
8948                            BYTES_PER_SVE_VECTOR).to_constant ();
8949             asm_fprintf (f, "[%s, #%wd, mul vl]",
8950                          reg_names[REGNO (addr.base)], vnum);
8951           }
8952         else if (aarch64_sve_pred_mode_p (mode))
8953           {
8954             HOST_WIDE_INT vnum
8955               = exact_div (addr.const_offset,
8956                            BYTES_PER_SVE_PRED).to_constant ();
8957             asm_fprintf (f, "[%s, #%wd, mul vl]",
8958                          reg_names[REGNO (addr.base)], vnum);
8959           }
8960         else
8961           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8962                        INTVAL (addr.offset));
8963         return true;
8964
8965       case ADDRESS_REG_REG:
8966         if (addr.shift == 0)
8967           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8968                        reg_names [REGNO (addr.offset)]);
8969         else
8970           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8971                        reg_names [REGNO (addr.offset)], addr.shift);
8972         return true;
8973
8974       case ADDRESS_REG_UXTW:
8975         if (addr.shift == 0)
8976           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8977                        REGNO (addr.offset) - R0_REGNUM);
8978         else
8979           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8980                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
8981         return true;
8982
8983       case ADDRESS_REG_SXTW:
8984         if (addr.shift == 0)
8985           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8986                        REGNO (addr.offset) - R0_REGNUM);
8987         else
8988           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8989                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
8990         return true;
8991
8992       case ADDRESS_REG_WB:
8993         /* Writeback is only supported for fixed-width modes.  */
8994         size = GET_MODE_SIZE (mode).to_constant ();
8995         switch (GET_CODE (x))
8996           {
8997           case PRE_INC:
8998             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8999             return true;
9000           case POST_INC:
9001             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9002             return true;
9003           case PRE_DEC:
9004             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9005             return true;
9006           case POST_DEC:
9007             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9008             return true;
9009           case PRE_MODIFY:
9010             asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9011                          INTVAL (addr.offset));
9012             return true;
9013           case POST_MODIFY:
9014             asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9015                          INTVAL (addr.offset));
9016             return true;
9017           default:
9018             break;
9019           }
9020         break;
9021
9022       case ADDRESS_LO_SUM:
9023         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9024         output_addr_const (f, addr.offset);
9025         asm_fprintf (f, "]");
9026         return true;
9027
9028       case ADDRESS_SYMBOLIC:
9029         output_addr_const (f, x);
9030         return true;
9031       }
9032
9033   return false;
9034 }
9035
9036 /* Print address 'x' of a memory access with mode 'mode'.  */
9037 static void
9038 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9039 {
9040   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9041     output_addr_const (f, x);
9042 }
9043
9044 bool
9045 aarch64_label_mentioned_p (rtx x)
9046 {
9047   const char *fmt;
9048   int i;
9049
9050   if (GET_CODE (x) == LABEL_REF)
9051     return true;
9052
9053   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9054      referencing instruction, but they are constant offsets, not
9055      symbols.  */
9056   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9057     return false;
9058
9059   fmt = GET_RTX_FORMAT (GET_CODE (x));
9060   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9061     {
9062       if (fmt[i] == 'E')
9063         {
9064           int j;
9065
9066           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9067             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9068               return 1;
9069         }
9070       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9071         return 1;
9072     }
9073
9074   return 0;
9075 }
9076
9077 /* Implement REGNO_REG_CLASS.  */
9078
9079 enum reg_class
9080 aarch64_regno_regclass (unsigned regno)
9081 {
9082   if (GP_REGNUM_P (regno))
9083     return GENERAL_REGS;
9084
9085   if (regno == SP_REGNUM)
9086     return STACK_REG;
9087
9088   if (regno == FRAME_POINTER_REGNUM
9089       || regno == ARG_POINTER_REGNUM)
9090     return POINTER_REGS;
9091
9092   if (FP_REGNUM_P (regno))
9093     return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9094             : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9095
9096   if (PR_REGNUM_P (regno))
9097     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9098
9099   return NO_REGS;
9100 }
9101
9102 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9103    If OFFSET is out of range, return an offset of an anchor point
9104    that is in range.  Return 0 otherwise.  */
9105
9106 static HOST_WIDE_INT
9107 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9108                        machine_mode mode)
9109 {
9110   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
9111   if (size > 16)
9112     return (offset + 0x400) & ~0x7f0;
9113
9114   /* For offsets that aren't a multiple of the access size, the limit is
9115      -256...255.  */
9116   if (offset & (size - 1))
9117     {
9118       /* BLKmode typically uses LDP of X-registers.  */
9119       if (mode == BLKmode)
9120         return (offset + 512) & ~0x3ff;
9121       return (offset + 0x100) & ~0x1ff;
9122     }
9123
9124   /* Small negative offsets are supported.  */
9125   if (IN_RANGE (offset, -256, 0))
9126     return 0;
9127
9128   if (mode == TImode || mode == TFmode)
9129     return (offset + 0x100) & ~0x1ff;
9130
9131   /* Use 12-bit offset by access size.  */
9132   return offset & (~0xfff * size);
9133 }
9134
9135 static rtx
9136 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
9137 {
9138   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9139      where mask is selected by alignment and size of the offset.
9140      We try to pick as large a range for the offset as possible to
9141      maximize the chance of a CSE.  However, for aligned addresses
9142      we limit the range to 4k so that structures with different sized
9143      elements are likely to use the same base.  We need to be careful
9144      not to split a CONST for some forms of address expression, otherwise
9145      it will generate sub-optimal code.  */
9146
9147   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9148     {
9149       rtx base = XEXP (x, 0);
9150       rtx offset_rtx = XEXP (x, 1);
9151       HOST_WIDE_INT offset = INTVAL (offset_rtx);
9152
9153       if (GET_CODE (base) == PLUS)
9154         {
9155           rtx op0 = XEXP (base, 0);
9156           rtx op1 = XEXP (base, 1);
9157
9158           /* Force any scaling into a temp for CSE.  */
9159           op0 = force_reg (Pmode, op0);
9160           op1 = force_reg (Pmode, op1);
9161
9162           /* Let the pointer register be in op0.  */
9163           if (REG_POINTER (op1))
9164             std::swap (op0, op1);
9165
9166           /* If the pointer is virtual or frame related, then we know that
9167              virtual register instantiation or register elimination is going
9168              to apply a second constant.  We want the two constants folded
9169              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
9170           if (virt_or_elim_regno_p (REGNO (op0)))
9171             {
9172               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9173                                    NULL_RTX, true, OPTAB_DIRECT);
9174               return gen_rtx_PLUS (Pmode, base, op1);
9175             }
9176
9177           /* Otherwise, in order to encourage CSE (and thence loop strength
9178              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
9179           base = expand_binop (Pmode, add_optab, op0, op1,
9180                                NULL_RTX, true, OPTAB_DIRECT);
9181           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9182         }
9183
9184       HOST_WIDE_INT size;
9185       if (GET_MODE_SIZE (mode).is_constant (&size))
9186         {
9187           HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9188                                                              mode);
9189           if (base_offset != 0)
9190             {
9191               base = plus_constant (Pmode, base, base_offset);
9192               base = force_operand (base, NULL_RTX);
9193               return plus_constant (Pmode, base, offset - base_offset);
9194             }
9195         }
9196     }
9197
9198   return x;
9199 }
9200
9201 static reg_class_t
9202 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9203                           reg_class_t rclass,
9204                           machine_mode mode,
9205                           secondary_reload_info *sri)
9206 {
9207   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9208      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
9209      comment at the head of aarch64-sve.md for more details about the
9210      big-endian handling.  */
9211   if (BYTES_BIG_ENDIAN
9212       && reg_class_subset_p (rclass, FP_REGS)
9213       && !((REG_P (x) && HARD_REGISTER_P (x))
9214            || aarch64_simd_valid_immediate (x, NULL))
9215       && aarch64_sve_data_mode_p (mode))
9216     {
9217       sri->icode = CODE_FOR_aarch64_sve_reload_be;
9218       return NO_REGS;
9219     }
9220
9221   /* If we have to disable direct literal pool loads and stores because the
9222      function is too big, then we need a scratch register.  */
9223   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9224       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9225           || targetm.vector_mode_supported_p (GET_MODE (x)))
9226       && !aarch64_pcrelative_literal_loads)
9227     {
9228       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9229       return NO_REGS;
9230     }
9231
9232   /* Without the TARGET_SIMD instructions we cannot move a Q register
9233      to a Q register directly.  We need a scratch.  */
9234   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9235       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9236       && reg_class_subset_p (rclass, FP_REGS))
9237     {
9238       sri->icode = code_for_aarch64_reload_mov (mode);
9239       return NO_REGS;
9240     }
9241
9242   /* A TFmode or TImode memory access should be handled via an FP_REGS
9243      because AArch64 has richer addressing modes for LDR/STR instructions
9244      than LDP/STP instructions.  */
9245   if (TARGET_FLOAT && rclass == GENERAL_REGS
9246       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9247     return FP_REGS;
9248
9249   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9250       return GENERAL_REGS;
9251
9252   return NO_REGS;
9253 }
9254
9255 static bool
9256 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9257 {
9258   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9259
9260   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9261      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
9262   if (frame_pointer_needed)
9263     return to == HARD_FRAME_POINTER_REGNUM;
9264   return true;
9265 }
9266
9267 poly_int64
9268 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9269 {
9270   if (to == HARD_FRAME_POINTER_REGNUM)
9271     {
9272       if (from == ARG_POINTER_REGNUM)
9273         return cfun->machine->frame.hard_fp_offset;
9274
9275       if (from == FRAME_POINTER_REGNUM)
9276         return cfun->machine->frame.hard_fp_offset
9277                - cfun->machine->frame.locals_offset;
9278     }
9279
9280   if (to == STACK_POINTER_REGNUM)
9281     {
9282       if (from == FRAME_POINTER_REGNUM)
9283           return cfun->machine->frame.frame_size
9284                  - cfun->machine->frame.locals_offset;
9285     }
9286
9287   return cfun->machine->frame.frame_size;
9288 }
9289
9290 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
9291    previous frame.  */
9292
9293 rtx
9294 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9295 {
9296   if (count != 0)
9297     return const0_rtx;
9298   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9299 }
9300
9301
9302 static void
9303 aarch64_asm_trampoline_template (FILE *f)
9304 {
9305   int offset1 = 16;
9306   int offset2 = 20;
9307
9308   if (aarch64_bti_enabled ())
9309     {
9310       asm_fprintf (f, "\thint\t34 // bti c\n");
9311       offset1 -= 4;
9312       offset2 -= 4;
9313     }
9314
9315   if (TARGET_ILP32)
9316     {
9317       asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9318       asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9319                    offset1);
9320     }
9321   else
9322     {
9323       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9324       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9325                    offset2);
9326     }
9327   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9328
9329   /* The trampoline needs an extra padding instruction.  In case if BTI is
9330      enabled the padding instruction is replaced by the BTI instruction at
9331      the beginning.  */
9332   if (!aarch64_bti_enabled ())
9333     assemble_aligned_integer (4, const0_rtx);
9334
9335   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9336   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9337 }
9338
9339 static void
9340 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9341 {
9342   rtx fnaddr, mem, a_tramp;
9343   const int tramp_code_sz = 16;
9344
9345   /* Don't need to copy the trailing D-words, we fill those in below.  */
9346   emit_block_move (m_tramp, assemble_trampoline_template (),
9347                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9348   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9349   fnaddr = XEXP (DECL_RTL (fndecl), 0);
9350   if (GET_MODE (fnaddr) != ptr_mode)
9351     fnaddr = convert_memory_address (ptr_mode, fnaddr);
9352   emit_move_insn (mem, fnaddr);
9353
9354   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9355   emit_move_insn (mem, chain_value);
9356
9357   /* XXX We should really define a "clear_cache" pattern and use
9358      gen_clear_cache().  */
9359   a_tramp = XEXP (m_tramp, 0);
9360   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9361                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9362                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9363                      ptr_mode);
9364 }
9365
9366 static unsigned char
9367 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9368 {
9369   /* ??? Logically we should only need to provide a value when
9370      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9371      can hold MODE, but at the moment we need to handle all modes.
9372      Just ignore any runtime parts for registers that can't store them.  */
9373   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9374   unsigned int nregs;
9375   switch (regclass)
9376     {
9377     case TAILCALL_ADDR_REGS:
9378     case POINTER_REGS:
9379     case GENERAL_REGS:
9380     case ALL_REGS:
9381     case POINTER_AND_FP_REGS:
9382     case FP_REGS:
9383     case FP_LO_REGS:
9384     case FP_LO8_REGS:
9385       if (aarch64_sve_data_mode_p (mode)
9386           && constant_multiple_p (GET_MODE_SIZE (mode),
9387                                   BYTES_PER_SVE_VECTOR, &nregs))
9388         return nregs;
9389       return (aarch64_vector_data_mode_p (mode)
9390               ? CEIL (lowest_size, UNITS_PER_VREG)
9391               : CEIL (lowest_size, UNITS_PER_WORD));
9392     case STACK_REG:
9393     case PR_REGS:
9394     case PR_LO_REGS:
9395     case PR_HI_REGS:
9396       return 1;
9397
9398     case NO_REGS:
9399       return 0;
9400
9401     default:
9402       break;
9403     }
9404   gcc_unreachable ();
9405 }
9406
9407 static reg_class_t
9408 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9409 {
9410   if (regclass == POINTER_REGS)
9411     return GENERAL_REGS;
9412
9413   if (regclass == STACK_REG)
9414     {
9415       if (REG_P(x)
9416           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9417           return regclass;
9418
9419       return NO_REGS;
9420     }
9421
9422   /* Register eliminiation can result in a request for
9423      SP+constant->FP_REGS.  We cannot support such operations which
9424      use SP as source and an FP_REG as destination, so reject out
9425      right now.  */
9426   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9427     {
9428       rtx lhs = XEXP (x, 0);
9429
9430       /* Look through a possible SUBREG introduced by ILP32.  */
9431       if (GET_CODE (lhs) == SUBREG)
9432         lhs = SUBREG_REG (lhs);
9433
9434       gcc_assert (REG_P (lhs));
9435       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9436                                       POINTER_REGS));
9437       return NO_REGS;
9438     }
9439
9440   return regclass;
9441 }
9442
9443 void
9444 aarch64_asm_output_labelref (FILE* f, const char *name)
9445 {
9446   asm_fprintf (f, "%U%s", name);
9447 }
9448
9449 static void
9450 aarch64_elf_asm_constructor (rtx symbol, int priority)
9451 {
9452   if (priority == DEFAULT_INIT_PRIORITY)
9453     default_ctor_section_asm_out_constructor (symbol, priority);
9454   else
9455     {
9456       section *s;
9457       /* While priority is known to be in range [0, 65535], so 18 bytes
9458          would be enough, the compiler might not know that.  To avoid
9459          -Wformat-truncation false positive, use a larger size.  */
9460       char buf[23];
9461       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9462       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9463       switch_to_section (s);
9464       assemble_align (POINTER_SIZE);
9465       assemble_aligned_integer (POINTER_BYTES, symbol);
9466     }
9467 }
9468
9469 static void
9470 aarch64_elf_asm_destructor (rtx symbol, int priority)
9471 {
9472   if (priority == DEFAULT_INIT_PRIORITY)
9473     default_dtor_section_asm_out_destructor (symbol, priority);
9474   else
9475     {
9476       section *s;
9477       /* While priority is known to be in range [0, 65535], so 18 bytes
9478          would be enough, the compiler might not know that.  To avoid
9479          -Wformat-truncation false positive, use a larger size.  */
9480       char buf[23];
9481       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9482       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9483       switch_to_section (s);
9484       assemble_align (POINTER_SIZE);
9485       assemble_aligned_integer (POINTER_BYTES, symbol);
9486     }
9487 }
9488
9489 const char*
9490 aarch64_output_casesi (rtx *operands)
9491 {
9492   char buf[100];
9493   char label[100];
9494   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9495   int index;
9496   static const char *const patterns[4][2] =
9497   {
9498     {
9499       "ldrb\t%w3, [%0,%w1,uxtw]",
9500       "add\t%3, %4, %w3, sxtb #2"
9501     },
9502     {
9503       "ldrh\t%w3, [%0,%w1,uxtw #1]",
9504       "add\t%3, %4, %w3, sxth #2"
9505     },
9506     {
9507       "ldr\t%w3, [%0,%w1,uxtw #2]",
9508       "add\t%3, %4, %w3, sxtw #2"
9509     },
9510     /* We assume that DImode is only generated when not optimizing and
9511        that we don't really need 64-bit address offsets.  That would
9512        imply an object file with 8GB of code in a single function!  */
9513     {
9514       "ldr\t%w3, [%0,%w1,uxtw #2]",
9515       "add\t%3, %4, %w3, sxtw #2"
9516     }
9517   };
9518
9519   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9520
9521   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9522   index = exact_log2 (GET_MODE_SIZE (mode));
9523
9524   gcc_assert (index >= 0 && index <= 3);
9525
9526   /* Need to implement table size reduction, by chaning the code below.  */
9527   output_asm_insn (patterns[index][0], operands);
9528   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9529   snprintf (buf, sizeof (buf),
9530             "adr\t%%4, %s", targetm.strip_name_encoding (label));
9531   output_asm_insn (buf, operands);
9532   output_asm_insn (patterns[index][1], operands);
9533   output_asm_insn ("br\t%3", operands);
9534   assemble_label (asm_out_file, label);
9535   return "";
9536 }
9537
9538
9539 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9540    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9541    operator.  */
9542
9543 int
9544 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9545 {
9546   if (shift >= 0 && shift <= 3)
9547     {
9548       int size;
9549       for (size = 8; size <= 32; size *= 2)
9550         {
9551           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9552           if (mask == bits << shift)
9553             return size;
9554         }
9555     }
9556   return 0;
9557 }
9558
9559 /* Constant pools are per function only when PC relative
9560    literal loads are true or we are in the large memory
9561    model.  */
9562
9563 static inline bool
9564 aarch64_can_use_per_function_literal_pools_p (void)
9565 {
9566   return (aarch64_pcrelative_literal_loads
9567           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9568 }
9569
9570 static bool
9571 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9572 {
9573   /* We can't use blocks for constants when we're using a per-function
9574      constant pool.  */
9575   return !aarch64_can_use_per_function_literal_pools_p ();
9576 }
9577
9578 /* Select appropriate section for constants depending
9579    on where we place literal pools.  */
9580
9581 static section *
9582 aarch64_select_rtx_section (machine_mode mode,
9583                             rtx x,
9584                             unsigned HOST_WIDE_INT align)
9585 {
9586   if (aarch64_can_use_per_function_literal_pools_p ())
9587     return function_section (current_function_decl);
9588
9589   return default_elf_select_rtx_section (mode, x, align);
9590 }
9591
9592 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
9593 void
9594 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9595                                   HOST_WIDE_INT offset)
9596 {
9597   /* When using per-function literal pools, we must ensure that any code
9598      section is aligned to the minimal instruction length, lest we get
9599      errors from the assembler re "unaligned instructions".  */
9600   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9601     ASM_OUTPUT_ALIGN (f, 2);
9602 }
9603
9604 /* Costs.  */
9605
9606 /* Helper function for rtx cost calculation.  Strip a shift expression
9607    from X.  Returns the inner operand if successful, or the original
9608    expression on failure.  */
9609 static rtx
9610 aarch64_strip_shift (rtx x)
9611 {
9612   rtx op = x;
9613
9614   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9615      we can convert both to ROR during final output.  */
9616   if ((GET_CODE (op) == ASHIFT
9617        || GET_CODE (op) == ASHIFTRT
9618        || GET_CODE (op) == LSHIFTRT
9619        || GET_CODE (op) == ROTATERT
9620        || GET_CODE (op) == ROTATE)
9621       && CONST_INT_P (XEXP (op, 1)))
9622     return XEXP (op, 0);
9623
9624   if (GET_CODE (op) == MULT
9625       && CONST_INT_P (XEXP (op, 1))
9626       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9627     return XEXP (op, 0);
9628
9629   return x;
9630 }
9631
9632 /* Helper function for rtx cost calculation.  Strip an extend
9633    expression from X.  Returns the inner operand if successful, or the
9634    original expression on failure.  We deal with a number of possible
9635    canonicalization variations here. If STRIP_SHIFT is true, then
9636    we can strip off a shift also.  */
9637 static rtx
9638 aarch64_strip_extend (rtx x, bool strip_shift)
9639 {
9640   scalar_int_mode mode;
9641   rtx op = x;
9642
9643   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9644     return op;
9645
9646   /* Zero and sign extraction of a widened value.  */
9647   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9648       && XEXP (op, 2) == const0_rtx
9649       && GET_CODE (XEXP (op, 0)) == MULT
9650       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9651                                          XEXP (op, 1)))
9652     return XEXP (XEXP (op, 0), 0);
9653
9654   /* It can also be represented (for zero-extend) as an AND with an
9655      immediate.  */
9656   if (GET_CODE (op) == AND
9657       && GET_CODE (XEXP (op, 0)) == MULT
9658       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9659       && CONST_INT_P (XEXP (op, 1))
9660       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9661                            INTVAL (XEXP (op, 1))) != 0)
9662     return XEXP (XEXP (op, 0), 0);
9663
9664   /* Now handle extended register, as this may also have an optional
9665      left shift by 1..4.  */
9666   if (strip_shift
9667       && GET_CODE (op) == ASHIFT
9668       && CONST_INT_P (XEXP (op, 1))
9669       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9670     op = XEXP (op, 0);
9671
9672   if (GET_CODE (op) == ZERO_EXTEND
9673       || GET_CODE (op) == SIGN_EXTEND)
9674     op = XEXP (op, 0);
9675
9676   if (op != x)
9677     return op;
9678
9679   return x;
9680 }
9681
9682 /* Return true iff CODE is a shift supported in combination
9683    with arithmetic instructions.  */
9684
9685 static bool
9686 aarch64_shift_p (enum rtx_code code)
9687 {
9688   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9689 }
9690
9691
9692 /* Return true iff X is a cheap shift without a sign extend. */
9693
9694 static bool
9695 aarch64_cheap_mult_shift_p (rtx x)
9696 {
9697   rtx op0, op1;
9698
9699   op0 = XEXP (x, 0);
9700   op1 = XEXP (x, 1);
9701
9702   if (!(aarch64_tune_params.extra_tuning_flags
9703                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9704     return false;
9705
9706   if (GET_CODE (op0) == SIGN_EXTEND)
9707     return false;
9708
9709   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9710       && UINTVAL (op1) <= 4)
9711     return true;
9712
9713   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9714     return false;
9715
9716   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9717
9718   if (l2 > 0 && l2 <= 4)
9719     return true;
9720
9721   return false;
9722 }
9723
9724 /* Helper function for rtx cost calculation.  Calculate the cost of
9725    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9726    Return the calculated cost of the expression, recursing manually in to
9727    operands where needed.  */
9728
9729 static int
9730 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9731 {
9732   rtx op0, op1;
9733   const struct cpu_cost_table *extra_cost
9734     = aarch64_tune_params.insn_extra_cost;
9735   int cost = 0;
9736   bool compound_p = (outer == PLUS || outer == MINUS);
9737   machine_mode mode = GET_MODE (x);
9738
9739   gcc_checking_assert (code == MULT);
9740
9741   op0 = XEXP (x, 0);
9742   op1 = XEXP (x, 1);
9743
9744   if (VECTOR_MODE_P (mode))
9745     mode = GET_MODE_INNER (mode);
9746
9747   /* Integer multiply/fma.  */
9748   if (GET_MODE_CLASS (mode) == MODE_INT)
9749     {
9750       /* The multiply will be canonicalized as a shift, cost it as such.  */
9751       if (aarch64_shift_p (GET_CODE (x))
9752           || (CONST_INT_P (op1)
9753               && exact_log2 (INTVAL (op1)) > 0))
9754         {
9755           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9756                            || GET_CODE (op0) == SIGN_EXTEND;
9757           if (speed)
9758             {
9759               if (compound_p)
9760                 {
9761                   /* If the shift is considered cheap,
9762                      then don't add any cost. */
9763                   if (aarch64_cheap_mult_shift_p (x))
9764                     ;
9765                   else if (REG_P (op1))
9766                     /* ARITH + shift-by-register.  */
9767                     cost += extra_cost->alu.arith_shift_reg;
9768                   else if (is_extend)
9769                     /* ARITH + extended register.  We don't have a cost field
9770                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
9771                     cost += extra_cost->alu.extend_arith;
9772                   else
9773                     /* ARITH + shift-by-immediate.  */
9774                     cost += extra_cost->alu.arith_shift;
9775                 }
9776               else
9777                 /* LSL (immediate).  */
9778                 cost += extra_cost->alu.shift;
9779
9780             }
9781           /* Strip extends as we will have costed them in the case above.  */
9782           if (is_extend)
9783             op0 = aarch64_strip_extend (op0, true);
9784
9785           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9786
9787           return cost;
9788         }
9789
9790       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
9791          compound and let the below cases handle it.  After all, MNEG is a
9792          special-case alias of MSUB.  */
9793       if (GET_CODE (op0) == NEG)
9794         {
9795           op0 = XEXP (op0, 0);
9796           compound_p = true;
9797         }
9798
9799       /* Integer multiplies or FMAs have zero/sign extending variants.  */
9800       if ((GET_CODE (op0) == ZERO_EXTEND
9801            && GET_CODE (op1) == ZERO_EXTEND)
9802           || (GET_CODE (op0) == SIGN_EXTEND
9803               && GET_CODE (op1) == SIGN_EXTEND))
9804         {
9805           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9806           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9807
9808           if (speed)
9809             {
9810               if (compound_p)
9811                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
9812                 cost += extra_cost->mult[0].extend_add;
9813               else
9814                 /* MUL/SMULL/UMULL.  */
9815                 cost += extra_cost->mult[0].extend;
9816             }
9817
9818           return cost;
9819         }
9820
9821       /* This is either an integer multiply or a MADD.  In both cases
9822          we want to recurse and cost the operands.  */
9823       cost += rtx_cost (op0, mode, MULT, 0, speed);
9824       cost += rtx_cost (op1, mode, MULT, 1, speed);
9825
9826       if (speed)
9827         {
9828           if (compound_p)
9829             /* MADD/MSUB.  */
9830             cost += extra_cost->mult[mode == DImode].add;
9831           else
9832             /* MUL.  */
9833             cost += extra_cost->mult[mode == DImode].simple;
9834         }
9835
9836       return cost;
9837     }
9838   else
9839     {
9840       if (speed)
9841         {
9842           /* Floating-point FMA/FMUL can also support negations of the
9843              operands, unless the rounding mode is upward or downward in
9844              which case FNMUL is different than FMUL with operand negation.  */
9845           bool neg0 = GET_CODE (op0) == NEG;
9846           bool neg1 = GET_CODE (op1) == NEG;
9847           if (compound_p || !flag_rounding_math || (neg0 && neg1))
9848             {
9849               if (neg0)
9850                 op0 = XEXP (op0, 0);
9851               if (neg1)
9852                 op1 = XEXP (op1, 0);
9853             }
9854
9855           if (compound_p)
9856             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
9857             cost += extra_cost->fp[mode == DFmode].fma;
9858           else
9859             /* FMUL/FNMUL.  */
9860             cost += extra_cost->fp[mode == DFmode].mult;
9861         }
9862
9863       cost += rtx_cost (op0, mode, MULT, 0, speed);
9864       cost += rtx_cost (op1, mode, MULT, 1, speed);
9865       return cost;
9866     }
9867 }
9868
9869 static int
9870 aarch64_address_cost (rtx x,
9871                       machine_mode mode,
9872                       addr_space_t as ATTRIBUTE_UNUSED,
9873                       bool speed)
9874 {
9875   enum rtx_code c = GET_CODE (x);
9876   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9877   struct aarch64_address_info info;
9878   int cost = 0;
9879   info.shift = 0;
9880
9881   if (!aarch64_classify_address (&info, x, mode, false))
9882     {
9883       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9884         {
9885           /* This is a CONST or SYMBOL ref which will be split
9886              in a different way depending on the code model in use.
9887              Cost it through the generic infrastructure.  */
9888           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9889           /* Divide through by the cost of one instruction to
9890              bring it to the same units as the address costs.  */
9891           cost_symbol_ref /= COSTS_N_INSNS (1);
9892           /* The cost is then the cost of preparing the address,
9893              followed by an immediate (possibly 0) offset.  */
9894           return cost_symbol_ref + addr_cost->imm_offset;
9895         }
9896       else
9897         {
9898           /* This is most likely a jump table from a case
9899              statement.  */
9900           return addr_cost->register_offset;
9901         }
9902     }
9903
9904   switch (info.type)
9905     {
9906       case ADDRESS_LO_SUM:
9907       case ADDRESS_SYMBOLIC:
9908       case ADDRESS_REG_IMM:
9909         cost += addr_cost->imm_offset;
9910         break;
9911
9912       case ADDRESS_REG_WB:
9913         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9914           cost += addr_cost->pre_modify;
9915         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9916           cost += addr_cost->post_modify;
9917         else
9918           gcc_unreachable ();
9919
9920         break;
9921
9922       case ADDRESS_REG_REG:
9923         cost += addr_cost->register_offset;
9924         break;
9925
9926       case ADDRESS_REG_SXTW:
9927         cost += addr_cost->register_sextend;
9928         break;
9929
9930       case ADDRESS_REG_UXTW:
9931         cost += addr_cost->register_zextend;
9932         break;
9933
9934       default:
9935         gcc_unreachable ();
9936     }
9937
9938
9939   if (info.shift > 0)
9940     {
9941       /* For the sake of calculating the cost of the shifted register
9942          component, we can treat same sized modes in the same way.  */
9943       if (known_eq (GET_MODE_BITSIZE (mode), 16))
9944         cost += addr_cost->addr_scale_costs.hi;
9945       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9946         cost += addr_cost->addr_scale_costs.si;
9947       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9948         cost += addr_cost->addr_scale_costs.di;
9949       else
9950         /* We can't tell, or this is a 128-bit vector.  */
9951         cost += addr_cost->addr_scale_costs.ti;
9952     }
9953
9954   return cost;
9955 }
9956
9957 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
9958    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
9959    to be taken.  */
9960
9961 int
9962 aarch64_branch_cost (bool speed_p, bool predictable_p)
9963 {
9964   /* When optimizing for speed, use the cost of unpredictable branches.  */
9965   const struct cpu_branch_cost *branch_costs =
9966     aarch64_tune_params.branch_costs;
9967
9968   if (!speed_p || predictable_p)
9969     return branch_costs->predictable;
9970   else
9971     return branch_costs->unpredictable;
9972 }
9973
9974 /* Return true if the RTX X in mode MODE is a zero or sign extract
9975    usable in an ADD or SUB (extended register) instruction.  */
9976 static bool
9977 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9978 {
9979   /* Catch add with a sign extract.
9980      This is add_<optab><mode>_multp2.  */
9981   if (GET_CODE (x) == SIGN_EXTRACT
9982       || GET_CODE (x) == ZERO_EXTRACT)
9983     {
9984       rtx op0 = XEXP (x, 0);
9985       rtx op1 = XEXP (x, 1);
9986       rtx op2 = XEXP (x, 2);
9987
9988       if (GET_CODE (op0) == MULT
9989           && CONST_INT_P (op1)
9990           && op2 == const0_rtx
9991           && CONST_INT_P (XEXP (op0, 1))
9992           && aarch64_is_extend_from_extract (mode,
9993                                              XEXP (op0, 1),
9994                                              op1))
9995         {
9996           return true;
9997         }
9998     }
9999   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10000      No shift.  */
10001   else if (GET_CODE (x) == SIGN_EXTEND
10002            || GET_CODE (x) == ZERO_EXTEND)
10003     return REG_P (XEXP (x, 0));
10004
10005   return false;
10006 }
10007
10008 static bool
10009 aarch64_frint_unspec_p (unsigned int u)
10010 {
10011   switch (u)
10012     {
10013       case UNSPEC_FRINTZ:
10014       case UNSPEC_FRINTP:
10015       case UNSPEC_FRINTM:
10016       case UNSPEC_FRINTA:
10017       case UNSPEC_FRINTN:
10018       case UNSPEC_FRINTX:
10019       case UNSPEC_FRINTI:
10020         return true;
10021
10022       default:
10023         return false;
10024     }
10025 }
10026
10027 /* Return true iff X is an rtx that will match an extr instruction
10028    i.e. as described in the *extr<mode>5_insn family of patterns.
10029    OP0 and OP1 will be set to the operands of the shifts involved
10030    on success and will be NULL_RTX otherwise.  */
10031
10032 static bool
10033 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10034 {
10035   rtx op0, op1;
10036   scalar_int_mode mode;
10037   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10038     return false;
10039
10040   *res_op0 = NULL_RTX;
10041   *res_op1 = NULL_RTX;
10042
10043   if (GET_CODE (x) != IOR)
10044     return false;
10045
10046   op0 = XEXP (x, 0);
10047   op1 = XEXP (x, 1);
10048
10049   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10050       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10051     {
10052      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
10053       if (GET_CODE (op1) == ASHIFT)
10054         std::swap (op0, op1);
10055
10056       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10057         return false;
10058
10059       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10060       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10061
10062       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10063           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10064         {
10065           *res_op0 = XEXP (op0, 0);
10066           *res_op1 = XEXP (op1, 0);
10067           return true;
10068         }
10069     }
10070
10071   return false;
10072 }
10073
10074 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10075    storing it in *COST.  Result is true if the total cost of the operation
10076    has now been calculated.  */
10077 static bool
10078 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10079 {
10080   rtx inner;
10081   rtx comparator;
10082   enum rtx_code cmpcode;
10083
10084   if (COMPARISON_P (op0))
10085     {
10086       inner = XEXP (op0, 0);
10087       comparator = XEXP (op0, 1);
10088       cmpcode = GET_CODE (op0);
10089     }
10090   else
10091     {
10092       inner = op0;
10093       comparator = const0_rtx;
10094       cmpcode = NE;
10095     }
10096
10097   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10098     {
10099       /* Conditional branch.  */
10100       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10101         return true;
10102       else
10103         {
10104           if (cmpcode == NE || cmpcode == EQ)
10105             {
10106               if (comparator == const0_rtx)
10107                 {
10108                   /* TBZ/TBNZ/CBZ/CBNZ.  */
10109                   if (GET_CODE (inner) == ZERO_EXTRACT)
10110                     /* TBZ/TBNZ.  */
10111                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10112                                        ZERO_EXTRACT, 0, speed);
10113                   else
10114                     /* CBZ/CBNZ.  */
10115                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10116
10117                 return true;
10118               }
10119             }
10120           else if (cmpcode == LT || cmpcode == GE)
10121             {
10122               /* TBZ/TBNZ.  */
10123               if (comparator == const0_rtx)
10124                 return true;
10125             }
10126         }
10127     }
10128   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10129     {
10130       /* CCMP.  */
10131       if (GET_CODE (op1) == COMPARE)
10132         {
10133           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
10134           if (XEXP (op1, 1) == const0_rtx)
10135             *cost += 1;
10136           if (speed)
10137             {
10138               machine_mode mode = GET_MODE (XEXP (op1, 0));
10139               const struct cpu_cost_table *extra_cost
10140                 = aarch64_tune_params.insn_extra_cost;
10141
10142               if (GET_MODE_CLASS (mode) == MODE_INT)
10143                 *cost += extra_cost->alu.arith;
10144               else
10145                 *cost += extra_cost->fp[mode == DFmode].compare;
10146             }
10147           return true;
10148         }
10149
10150       /* It's a conditional operation based on the status flags,
10151          so it must be some flavor of CSEL.  */
10152
10153       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
10154       if (GET_CODE (op1) == NEG
10155           || GET_CODE (op1) == NOT
10156           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10157         op1 = XEXP (op1, 0);
10158       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10159         {
10160           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
10161           op1 = XEXP (op1, 0);
10162           op2 = XEXP (op2, 0);
10163         }
10164
10165       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10166       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10167       return true;
10168     }
10169
10170   /* We don't know what this is, cost all operands.  */
10171   return false;
10172 }
10173
10174 /* Check whether X is a bitfield operation of the form shift + extend that
10175    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
10176    operand to which the bitfield operation is applied.  Otherwise return
10177    NULL_RTX.  */
10178
10179 static rtx
10180 aarch64_extend_bitfield_pattern_p (rtx x)
10181 {
10182   rtx_code outer_code = GET_CODE (x);
10183   machine_mode outer_mode = GET_MODE (x);
10184
10185   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10186       && outer_mode != SImode && outer_mode != DImode)
10187     return NULL_RTX;
10188
10189   rtx inner = XEXP (x, 0);
10190   rtx_code inner_code = GET_CODE (inner);
10191   machine_mode inner_mode = GET_MODE (inner);
10192   rtx op = NULL_RTX;
10193
10194   switch (inner_code)
10195     {
10196       case ASHIFT:
10197         if (CONST_INT_P (XEXP (inner, 1))
10198             && (inner_mode == QImode || inner_mode == HImode))
10199           op = XEXP (inner, 0);
10200         break;
10201       case LSHIFTRT:
10202         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10203             && (inner_mode == QImode || inner_mode == HImode))
10204           op = XEXP (inner, 0);
10205         break;
10206       case ASHIFTRT:
10207         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10208             && (inner_mode == QImode || inner_mode == HImode))
10209           op = XEXP (inner, 0);
10210         break;
10211       default:
10212         break;
10213     }
10214
10215   return op;
10216 }
10217
10218 /* Return true if the mask and a shift amount from an RTX of the form
10219    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10220    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
10221
10222 bool
10223 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10224                                     rtx shft_amnt)
10225 {
10226   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10227          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10228          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10229          && (INTVAL (mask)
10230              & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10231 }
10232
10233 /* Return true if the masks and a shift amount from an RTX of the form
10234    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10235    a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
10236
10237 bool
10238 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10239                                    unsigned HOST_WIDE_INT mask1,
10240                                    unsigned HOST_WIDE_INT shft_amnt,
10241                                    unsigned HOST_WIDE_INT mask2)
10242 {
10243   unsigned HOST_WIDE_INT t;
10244
10245   /* Verify that there is no overlap in what bits are set in the two masks.  */
10246   if (mask1 != ~mask2)
10247     return false;
10248
10249   /* Verify that mask2 is not all zeros or ones.  */
10250   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10251     return false;
10252
10253   /* The shift amount should always be less than the mode size.  */
10254   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10255
10256   /* Verify that the mask being shifted is contiguous and would be in the
10257      least significant bits after shifting by shft_amnt.  */
10258   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10259   return (t == (t & -t));
10260 }
10261
10262 /* Calculate the cost of calculating X, storing it in *COST.  Result
10263    is true if the total cost of the operation has now been calculated.  */
10264 static bool
10265 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10266                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10267 {
10268   rtx op0, op1, op2;
10269   const struct cpu_cost_table *extra_cost
10270     = aarch64_tune_params.insn_extra_cost;
10271   int code = GET_CODE (x);
10272   scalar_int_mode int_mode;
10273
10274   /* By default, assume that everything has equivalent cost to the
10275      cheapest instruction.  Any additional costs are applied as a delta
10276      above this default.  */
10277   *cost = COSTS_N_INSNS (1);
10278
10279   switch (code)
10280     {
10281     case SET:
10282       /* The cost depends entirely on the operands to SET.  */
10283       *cost = 0;
10284       op0 = SET_DEST (x);
10285       op1 = SET_SRC (x);
10286
10287       switch (GET_CODE (op0))
10288         {
10289         case MEM:
10290           if (speed)
10291             {
10292               rtx address = XEXP (op0, 0);
10293               if (VECTOR_MODE_P (mode))
10294                 *cost += extra_cost->ldst.storev;
10295               else if (GET_MODE_CLASS (mode) == MODE_INT)
10296                 *cost += extra_cost->ldst.store;
10297               else if (mode == SFmode)
10298                 *cost += extra_cost->ldst.storef;
10299               else if (mode == DFmode)
10300                 *cost += extra_cost->ldst.stored;
10301
10302               *cost +=
10303                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10304                                                      0, speed));
10305             }
10306
10307           *cost += rtx_cost (op1, mode, SET, 1, speed);
10308           return true;
10309
10310         case SUBREG:
10311           if (! REG_P (SUBREG_REG (op0)))
10312             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10313
10314           /* Fall through.  */
10315         case REG:
10316           /* The cost is one per vector-register copied.  */
10317           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10318             {
10319               int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10320               *cost = COSTS_N_INSNS (nregs);
10321             }
10322           /* const0_rtx is in general free, but we will use an
10323              instruction to set a register to 0.  */
10324           else if (REG_P (op1) || op1 == const0_rtx)
10325             {
10326               /* The cost is 1 per register copied.  */
10327               int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10328               *cost = COSTS_N_INSNS (nregs);
10329             }
10330           else
10331             /* Cost is just the cost of the RHS of the set.  */
10332             *cost += rtx_cost (op1, mode, SET, 1, speed);
10333           return true;
10334
10335         case ZERO_EXTRACT:
10336         case SIGN_EXTRACT:
10337           /* Bit-field insertion.  Strip any redundant widening of
10338              the RHS to meet the width of the target.  */
10339           if (GET_CODE (op1) == SUBREG)
10340             op1 = SUBREG_REG (op1);
10341           if ((GET_CODE (op1) == ZERO_EXTEND
10342                || GET_CODE (op1) == SIGN_EXTEND)
10343               && CONST_INT_P (XEXP (op0, 1))
10344               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10345               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10346             op1 = XEXP (op1, 0);
10347
10348           if (CONST_INT_P (op1))
10349             {
10350               /* MOV immediate is assumed to always be cheap.  */
10351               *cost = COSTS_N_INSNS (1);
10352             }
10353           else
10354             {
10355               /* BFM.  */
10356               if (speed)
10357                 *cost += extra_cost->alu.bfi;
10358               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10359             }
10360
10361           return true;
10362
10363         default:
10364           /* We can't make sense of this, assume default cost.  */
10365           *cost = COSTS_N_INSNS (1);
10366           return false;
10367         }
10368       return false;
10369
10370     case CONST_INT:
10371       /* If an instruction can incorporate a constant within the
10372          instruction, the instruction's expression avoids calling
10373          rtx_cost() on the constant.  If rtx_cost() is called on a
10374          constant, then it is usually because the constant must be
10375          moved into a register by one or more instructions.
10376
10377          The exception is constant 0, which can be expressed
10378          as XZR/WZR and is therefore free.  The exception to this is
10379          if we have (set (reg) (const0_rtx)) in which case we must cost
10380          the move.  However, we can catch that when we cost the SET, so
10381          we don't need to consider that here.  */
10382       if (x == const0_rtx)
10383         *cost = 0;
10384       else
10385         {
10386           /* To an approximation, building any other constant is
10387              proportionally expensive to the number of instructions
10388              required to build that constant.  This is true whether we
10389              are compiling for SPEED or otherwise.  */
10390           if (!is_a <scalar_int_mode> (mode, &int_mode))
10391             int_mode = word_mode;
10392           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10393                                  (NULL_RTX, x, false, int_mode));
10394         }
10395       return true;
10396
10397     case CONST_DOUBLE:
10398
10399       /* First determine number of instructions to do the move
10400           as an integer constant.  */
10401       if (!aarch64_float_const_representable_p (x)
10402            && !aarch64_can_const_movi_rtx_p (x, mode)
10403            && aarch64_float_const_rtx_p (x))
10404         {
10405           unsigned HOST_WIDE_INT ival;
10406           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10407           gcc_assert (succeed);
10408
10409           scalar_int_mode imode = (mode == HFmode
10410                                    ? SImode
10411                                    : int_mode_for_mode (mode).require ());
10412           int ncost = aarch64_internal_mov_immediate
10413                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10414           *cost += COSTS_N_INSNS (ncost);
10415           return true;
10416         }
10417
10418       if (speed)
10419         {
10420           /* mov[df,sf]_aarch64.  */
10421           if (aarch64_float_const_representable_p (x))
10422             /* FMOV (scalar immediate).  */
10423             *cost += extra_cost->fp[mode == DFmode].fpconst;
10424           else if (!aarch64_float_const_zero_rtx_p (x))
10425             {
10426               /* This will be a load from memory.  */
10427               if (mode == DFmode)
10428                 *cost += extra_cost->ldst.loadd;
10429               else
10430                 *cost += extra_cost->ldst.loadf;
10431             }
10432           else
10433             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
10434                or MOV v0.s[0], wzr - neither of which are modeled by the
10435                cost tables.  Just use the default cost.  */
10436             {
10437             }
10438         }
10439
10440       return true;
10441
10442     case MEM:
10443       if (speed)
10444         {
10445           /* For loads we want the base cost of a load, plus an
10446              approximation for the additional cost of the addressing
10447              mode.  */
10448           rtx address = XEXP (x, 0);
10449           if (VECTOR_MODE_P (mode))
10450             *cost += extra_cost->ldst.loadv;
10451           else if (GET_MODE_CLASS (mode) == MODE_INT)
10452             *cost += extra_cost->ldst.load;
10453           else if (mode == SFmode)
10454             *cost += extra_cost->ldst.loadf;
10455           else if (mode == DFmode)
10456             *cost += extra_cost->ldst.loadd;
10457
10458           *cost +=
10459                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10460                                                      0, speed));
10461         }
10462
10463       return true;
10464
10465     case NEG:
10466       op0 = XEXP (x, 0);
10467
10468       if (VECTOR_MODE_P (mode))
10469         {
10470           if (speed)
10471             {
10472               /* FNEG.  */
10473               *cost += extra_cost->vect.alu;
10474             }
10475           return false;
10476         }
10477
10478       if (GET_MODE_CLASS (mode) == MODE_INT)
10479         {
10480           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10481               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10482             {
10483               /* CSETM.  */
10484               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10485               return true;
10486             }
10487
10488           /* Cost this as SUB wzr, X.  */
10489           op0 = CONST0_RTX (mode);
10490           op1 = XEXP (x, 0);
10491           goto cost_minus;
10492         }
10493
10494       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10495         {
10496           /* Support (neg(fma...)) as a single instruction only if
10497              sign of zeros is unimportant.  This matches the decision
10498              making in aarch64.md.  */
10499           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10500             {
10501               /* FNMADD.  */
10502               *cost = rtx_cost (op0, mode, NEG, 0, speed);
10503               return true;
10504             }
10505           if (GET_CODE (op0) == MULT)
10506             {
10507               /* FNMUL.  */
10508               *cost = rtx_cost (op0, mode, NEG, 0, speed);
10509               return true;
10510             }
10511           if (speed)
10512             /* FNEG.  */
10513             *cost += extra_cost->fp[mode == DFmode].neg;
10514           return false;
10515         }
10516
10517       return false;
10518
10519     case CLRSB:
10520     case CLZ:
10521       if (speed)
10522         {
10523           if (VECTOR_MODE_P (mode))
10524             *cost += extra_cost->vect.alu;
10525           else
10526             *cost += extra_cost->alu.clz;
10527         }
10528
10529       return false;
10530
10531     case COMPARE:
10532       op0 = XEXP (x, 0);
10533       op1 = XEXP (x, 1);
10534
10535       if (op1 == const0_rtx
10536           && GET_CODE (op0) == AND)
10537         {
10538           x = op0;
10539           mode = GET_MODE (op0);
10540           goto cost_logic;
10541         }
10542
10543       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10544         {
10545           /* TODO: A write to the CC flags possibly costs extra, this
10546              needs encoding in the cost tables.  */
10547
10548           mode = GET_MODE (op0);
10549           /* ANDS.  */
10550           if (GET_CODE (op0) == AND)
10551             {
10552               x = op0;
10553               goto cost_logic;
10554             }
10555
10556           if (GET_CODE (op0) == PLUS)
10557             {
10558               /* ADDS (and CMN alias).  */
10559               x = op0;
10560               goto cost_plus;
10561             }
10562
10563           if (GET_CODE (op0) == MINUS)
10564             {
10565               /* SUBS.  */
10566               x = op0;
10567               goto cost_minus;
10568             }
10569
10570           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10571               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10572               && CONST_INT_P (XEXP (op0, 2)))
10573             {
10574               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10575                  Handle it here directly rather than going to cost_logic
10576                  since we know the immediate generated for the TST is valid
10577                  so we can avoid creating an intermediate rtx for it only
10578                  for costing purposes.  */
10579               if (speed)
10580                 *cost += extra_cost->alu.logical;
10581
10582               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10583                                  ZERO_EXTRACT, 0, speed);
10584               return true;
10585             }
10586
10587           if (GET_CODE (op1) == NEG)
10588             {
10589               /* CMN.  */
10590               if (speed)
10591                 *cost += extra_cost->alu.arith;
10592
10593               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10594               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10595               return true;
10596             }
10597
10598           /* CMP.
10599
10600              Compare can freely swap the order of operands, and
10601              canonicalization puts the more complex operation first.
10602              But the integer MINUS logic expects the shift/extend
10603              operation in op1.  */
10604           if (! (REG_P (op0)
10605                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10606           {
10607             op0 = XEXP (x, 1);
10608             op1 = XEXP (x, 0);
10609           }
10610           goto cost_minus;
10611         }
10612
10613       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10614         {
10615           /* FCMP.  */
10616           if (speed)
10617             *cost += extra_cost->fp[mode == DFmode].compare;
10618
10619           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10620             {
10621               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10622               /* FCMP supports constant 0.0 for no extra cost. */
10623               return true;
10624             }
10625           return false;
10626         }
10627
10628       if (VECTOR_MODE_P (mode))
10629         {
10630           /* Vector compare.  */
10631           if (speed)
10632             *cost += extra_cost->vect.alu;
10633
10634           if (aarch64_float_const_zero_rtx_p (op1))
10635             {
10636               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10637                  cost.  */
10638               return true;
10639             }
10640           return false;
10641         }
10642       return false;
10643
10644     case MINUS:
10645       {
10646         op0 = XEXP (x, 0);
10647         op1 = XEXP (x, 1);
10648
10649 cost_minus:
10650         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10651
10652         /* Detect valid immediates.  */
10653         if ((GET_MODE_CLASS (mode) == MODE_INT
10654              || (GET_MODE_CLASS (mode) == MODE_CC
10655                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10656             && CONST_INT_P (op1)
10657             && aarch64_uimm12_shift (INTVAL (op1)))
10658           {
10659             if (speed)
10660               /* SUB(S) (immediate).  */
10661               *cost += extra_cost->alu.arith;
10662             return true;
10663           }
10664
10665         /* Look for SUB (extended register).  */
10666         if (is_a <scalar_int_mode> (mode, &int_mode)
10667             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10668           {
10669             if (speed)
10670               *cost += extra_cost->alu.extend_arith;
10671
10672             op1 = aarch64_strip_extend (op1, true);
10673             *cost += rtx_cost (op1, VOIDmode,
10674                                (enum rtx_code) GET_CODE (op1), 0, speed);
10675             return true;
10676           }
10677
10678         rtx new_op1 = aarch64_strip_extend (op1, false);
10679
10680         /* Cost this as an FMA-alike operation.  */
10681         if ((GET_CODE (new_op1) == MULT
10682              || aarch64_shift_p (GET_CODE (new_op1)))
10683             && code != COMPARE)
10684           {
10685             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10686                                             (enum rtx_code) code,
10687                                             speed);
10688             return true;
10689           }
10690
10691         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10692
10693         if (speed)
10694           {
10695             if (VECTOR_MODE_P (mode))
10696               {
10697                 /* Vector SUB.  */
10698                 *cost += extra_cost->vect.alu;
10699               }
10700             else if (GET_MODE_CLASS (mode) == MODE_INT)
10701               {
10702                 /* SUB(S).  */
10703                 *cost += extra_cost->alu.arith;
10704               }
10705             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10706               {
10707                 /* FSUB.  */
10708                 *cost += extra_cost->fp[mode == DFmode].addsub;
10709               }
10710           }
10711         return true;
10712       }
10713
10714     case PLUS:
10715       {
10716         rtx new_op0;
10717
10718         op0 = XEXP (x, 0);
10719         op1 = XEXP (x, 1);
10720
10721 cost_plus:
10722         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10723             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10724           {
10725             /* CSINC.  */
10726             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10727             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10728             return true;
10729           }
10730
10731         if (GET_MODE_CLASS (mode) == MODE_INT
10732             && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10733                 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10734           {
10735             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10736
10737             if (speed)
10738               /* ADD (immediate).  */
10739               *cost += extra_cost->alu.arith;
10740             return true;
10741           }
10742
10743         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10744
10745         /* Look for ADD (extended register).  */
10746         if (is_a <scalar_int_mode> (mode, &int_mode)
10747             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10748           {
10749             if (speed)
10750               *cost += extra_cost->alu.extend_arith;
10751
10752             op0 = aarch64_strip_extend (op0, true);
10753             *cost += rtx_cost (op0, VOIDmode,
10754                                (enum rtx_code) GET_CODE (op0), 0, speed);
10755             return true;
10756           }
10757
10758         /* Strip any extend, leave shifts behind as we will
10759            cost them through mult_cost.  */
10760         new_op0 = aarch64_strip_extend (op0, false);
10761
10762         if (GET_CODE (new_op0) == MULT
10763             || aarch64_shift_p (GET_CODE (new_op0)))
10764           {
10765             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10766                                             speed);
10767             return true;
10768           }
10769
10770         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10771
10772         if (speed)
10773           {
10774             if (VECTOR_MODE_P (mode))
10775               {
10776                 /* Vector ADD.  */
10777                 *cost += extra_cost->vect.alu;
10778               }
10779             else if (GET_MODE_CLASS (mode) == MODE_INT)
10780               {
10781                 /* ADD.  */
10782                 *cost += extra_cost->alu.arith;
10783               }
10784             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10785               {
10786                 /* FADD.  */
10787                 *cost += extra_cost->fp[mode == DFmode].addsub;
10788               }
10789           }
10790         return true;
10791       }
10792
10793     case BSWAP:
10794       *cost = COSTS_N_INSNS (1);
10795
10796       if (speed)
10797         {
10798           if (VECTOR_MODE_P (mode))
10799             *cost += extra_cost->vect.alu;
10800           else
10801             *cost += extra_cost->alu.rev;
10802         }
10803       return false;
10804
10805     case IOR:
10806       if (aarch_rev16_p (x))
10807         {
10808           *cost = COSTS_N_INSNS (1);
10809
10810           if (speed)
10811             {
10812               if (VECTOR_MODE_P (mode))
10813                 *cost += extra_cost->vect.alu;
10814               else
10815                 *cost += extra_cost->alu.rev;
10816             }
10817           return true;
10818         }
10819
10820       if (aarch64_extr_rtx_p (x, &op0, &op1))
10821         {
10822           *cost += rtx_cost (op0, mode, IOR, 0, speed);
10823           *cost += rtx_cost (op1, mode, IOR, 1, speed);
10824           if (speed)
10825             *cost += extra_cost->alu.shift;
10826
10827           return true;
10828         }
10829     /* Fall through.  */
10830     case XOR:
10831     case AND:
10832     cost_logic:
10833       op0 = XEXP (x, 0);
10834       op1 = XEXP (x, 1);
10835
10836       if (VECTOR_MODE_P (mode))
10837         {
10838           if (speed)
10839             *cost += extra_cost->vect.alu;
10840           return true;
10841         }
10842
10843       if (code == AND
10844           && GET_CODE (op0) == MULT
10845           && CONST_INT_P (XEXP (op0, 1))
10846           && CONST_INT_P (op1)
10847           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10848                                INTVAL (op1)) != 0)
10849         {
10850           /* This is a UBFM/SBFM.  */
10851           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10852           if (speed)
10853             *cost += extra_cost->alu.bfx;
10854           return true;
10855         }
10856
10857       if (is_int_mode (mode, &int_mode))
10858         {
10859           if (CONST_INT_P (op1))
10860             {
10861               /* We have a mask + shift version of a UBFIZ
10862                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
10863               if (GET_CODE (op0) == ASHIFT
10864                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10865                                                          XEXP (op0, 1)))
10866                 {
10867                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
10868                                      (enum rtx_code) code, 0, speed);
10869                   if (speed)
10870                     *cost += extra_cost->alu.bfx;
10871
10872                   return true;
10873                 }
10874               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10875                 {
10876                 /* We possibly get the immediate for free, this is not
10877                    modelled.  */
10878                   *cost += rtx_cost (op0, int_mode,
10879                                      (enum rtx_code) code, 0, speed);
10880                   if (speed)
10881                     *cost += extra_cost->alu.logical;
10882
10883                   return true;
10884                 }
10885             }
10886           else
10887             {
10888               rtx new_op0 = op0;
10889
10890               /* Handle ORN, EON, or BIC.  */
10891               if (GET_CODE (op0) == NOT)
10892                 op0 = XEXP (op0, 0);
10893
10894               new_op0 = aarch64_strip_shift (op0);
10895
10896               /* If we had a shift on op0 then this is a logical-shift-
10897                  by-register/immediate operation.  Otherwise, this is just
10898                  a logical operation.  */
10899               if (speed)
10900                 {
10901                   if (new_op0 != op0)
10902                     {
10903                       /* Shift by immediate.  */
10904                       if (CONST_INT_P (XEXP (op0, 1)))
10905                         *cost += extra_cost->alu.log_shift;
10906                       else
10907                         *cost += extra_cost->alu.log_shift_reg;
10908                     }
10909                   else
10910                     *cost += extra_cost->alu.logical;
10911                 }
10912
10913               /* In both cases we want to cost both operands.  */
10914               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10915                                  0, speed);
10916               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10917                                  1, speed);
10918
10919               return true;
10920             }
10921         }
10922       return false;
10923
10924     case NOT:
10925       x = XEXP (x, 0);
10926       op0 = aarch64_strip_shift (x);
10927
10928       if (VECTOR_MODE_P (mode))
10929         {
10930           /* Vector NOT.  */
10931           *cost += extra_cost->vect.alu;
10932           return false;
10933         }
10934
10935       /* MVN-shifted-reg.  */
10936       if (op0 != x)
10937         {
10938           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10939
10940           if (speed)
10941             *cost += extra_cost->alu.log_shift;
10942
10943           return true;
10944         }
10945       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10946          Handle the second form here taking care that 'a' in the above can
10947          be a shift.  */
10948       else if (GET_CODE (op0) == XOR)
10949         {
10950           rtx newop0 = XEXP (op0, 0);
10951           rtx newop1 = XEXP (op0, 1);
10952           rtx op0_stripped = aarch64_strip_shift (newop0);
10953
10954           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10955           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10956
10957           if (speed)
10958             {
10959               if (op0_stripped != newop0)
10960                 *cost += extra_cost->alu.log_shift;
10961               else
10962                 *cost += extra_cost->alu.logical;
10963             }
10964
10965           return true;
10966         }
10967       /* MVN.  */
10968       if (speed)
10969         *cost += extra_cost->alu.logical;
10970
10971       return false;
10972
10973     case ZERO_EXTEND:
10974
10975       op0 = XEXP (x, 0);
10976       /* If a value is written in SI mode, then zero extended to DI
10977          mode, the operation will in general be free as a write to
10978          a 'w' register implicitly zeroes the upper bits of an 'x'
10979          register.  However, if this is
10980
10981            (set (reg) (zero_extend (reg)))
10982
10983          we must cost the explicit register move.  */
10984       if (mode == DImode
10985           && GET_MODE (op0) == SImode
10986           && outer == SET)
10987         {
10988           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10989
10990         /* If OP_COST is non-zero, then the cost of the zero extend
10991            is effectively the cost of the inner operation.  Otherwise
10992            we have a MOV instruction and we take the cost from the MOV
10993            itself.  This is true independently of whether we are
10994            optimizing for space or time.  */
10995           if (op_cost)
10996             *cost = op_cost;
10997
10998           return true;
10999         }
11000       else if (MEM_P (op0))
11001         {
11002           /* All loads can zero extend to any size for free.  */
11003           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11004           return true;
11005         }
11006
11007       op0 = aarch64_extend_bitfield_pattern_p (x);
11008       if (op0)
11009         {
11010           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11011           if (speed)
11012             *cost += extra_cost->alu.bfx;
11013           return true;
11014         }
11015
11016       if (speed)
11017         {
11018           if (VECTOR_MODE_P (mode))
11019             {
11020               /* UMOV.  */
11021               *cost += extra_cost->vect.alu;
11022             }
11023           else
11024             {
11025               /* We generate an AND instead of UXTB/UXTH.  */
11026               *cost += extra_cost->alu.logical;
11027             }
11028         }
11029       return false;
11030
11031     case SIGN_EXTEND:
11032       if (MEM_P (XEXP (x, 0)))
11033         {
11034           /* LDRSH.  */
11035           if (speed)
11036             {
11037               rtx address = XEXP (XEXP (x, 0), 0);
11038               *cost += extra_cost->ldst.load_sign_extend;
11039
11040               *cost +=
11041                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11042                                                      0, speed));
11043             }
11044           return true;
11045         }
11046
11047       op0 = aarch64_extend_bitfield_pattern_p (x);
11048       if (op0)
11049         {
11050           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11051           if (speed)
11052             *cost += extra_cost->alu.bfx;
11053           return true;
11054         }
11055
11056       if (speed)
11057         {
11058           if (VECTOR_MODE_P (mode))
11059             *cost += extra_cost->vect.alu;
11060           else
11061             *cost += extra_cost->alu.extend;
11062         }
11063       return false;
11064
11065     case ASHIFT:
11066       op0 = XEXP (x, 0);
11067       op1 = XEXP (x, 1);
11068
11069       if (CONST_INT_P (op1))
11070         {
11071           if (speed)
11072             {
11073               if (VECTOR_MODE_P (mode))
11074                 {
11075                   /* Vector shift (immediate).  */
11076                   *cost += extra_cost->vect.alu;
11077                 }
11078               else
11079                 {
11080                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
11081                      aliases.  */
11082                   *cost += extra_cost->alu.shift;
11083                 }
11084             }
11085
11086           /* We can incorporate zero/sign extend for free.  */
11087           if (GET_CODE (op0) == ZERO_EXTEND
11088               || GET_CODE (op0) == SIGN_EXTEND)
11089             op0 = XEXP (op0, 0);
11090
11091           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11092           return true;
11093         }
11094       else
11095         {
11096           if (VECTOR_MODE_P (mode))
11097             {
11098               if (speed)
11099                 /* Vector shift (register).  */
11100                 *cost += extra_cost->vect.alu;
11101             }
11102           else
11103             {
11104               if (speed)
11105                 /* LSLV.  */
11106                 *cost += extra_cost->alu.shift_reg;
11107
11108               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11109                   && CONST_INT_P (XEXP (op1, 1))
11110                   && known_eq (INTVAL (XEXP (op1, 1)),
11111                                GET_MODE_BITSIZE (mode) - 1))
11112                 {
11113                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11114                   /* We already demanded XEXP (op1, 0) to be REG_P, so
11115                      don't recurse into it.  */
11116                   return true;
11117                 }
11118             }
11119           return false;  /* All arguments need to be in registers.  */
11120         }
11121
11122     case ROTATE:
11123     case ROTATERT:
11124     case LSHIFTRT:
11125     case ASHIFTRT:
11126       op0 = XEXP (x, 0);
11127       op1 = XEXP (x, 1);
11128
11129       if (CONST_INT_P (op1))
11130         {
11131           /* ASR (immediate) and friends.  */
11132           if (speed)
11133             {
11134               if (VECTOR_MODE_P (mode))
11135                 *cost += extra_cost->vect.alu;
11136               else
11137                 *cost += extra_cost->alu.shift;
11138             }
11139
11140           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11141           return true;
11142         }
11143       else
11144         {
11145           if (VECTOR_MODE_P (mode))
11146             {
11147               if (speed)
11148                 /* Vector shift (register).  */
11149                 *cost += extra_cost->vect.alu;
11150             }
11151           else
11152             {
11153               if (speed)
11154                 /* ASR (register) and friends.  */
11155                 *cost += extra_cost->alu.shift_reg;
11156
11157               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11158                   && CONST_INT_P (XEXP (op1, 1))
11159                   && known_eq (INTVAL (XEXP (op1, 1)),
11160                                GET_MODE_BITSIZE (mode) - 1))
11161                 {
11162                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11163                   /* We already demanded XEXP (op1, 0) to be REG_P, so
11164                      don't recurse into it.  */
11165                   return true;
11166                 }
11167             }
11168           return false;  /* All arguments need to be in registers.  */
11169         }
11170
11171     case SYMBOL_REF:
11172
11173       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11174           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11175         {
11176           /* LDR.  */
11177           if (speed)
11178             *cost += extra_cost->ldst.load;
11179         }
11180       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11181                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11182         {
11183           /* ADRP, followed by ADD.  */
11184           *cost += COSTS_N_INSNS (1);
11185           if (speed)
11186             *cost += 2 * extra_cost->alu.arith;
11187         }
11188       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11189                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11190         {
11191           /* ADR.  */
11192           if (speed)
11193             *cost += extra_cost->alu.arith;
11194         }
11195
11196       if (flag_pic)
11197         {
11198           /* One extra load instruction, after accessing the GOT.  */
11199           *cost += COSTS_N_INSNS (1);
11200           if (speed)
11201             *cost += extra_cost->ldst.load;
11202         }
11203       return true;
11204
11205     case HIGH:
11206     case LO_SUM:
11207       /* ADRP/ADD (immediate).  */
11208       if (speed)
11209         *cost += extra_cost->alu.arith;
11210       return true;
11211
11212     case ZERO_EXTRACT:
11213     case SIGN_EXTRACT:
11214       /* UBFX/SBFX.  */
11215       if (speed)
11216         {
11217           if (VECTOR_MODE_P (mode))
11218             *cost += extra_cost->vect.alu;
11219           else
11220             *cost += extra_cost->alu.bfx;
11221         }
11222
11223       /* We can trust that the immediates used will be correct (there
11224          are no by-register forms), so we need only cost op0.  */
11225       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11226       return true;
11227
11228     case MULT:
11229       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11230       /* aarch64_rtx_mult_cost always handles recursion to its
11231          operands.  */
11232       return true;
11233
11234     case MOD:
11235     /* We can expand signed mod by power of 2 using a NEGS, two parallel
11236        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
11237        an unconditional negate.  This case should only ever be reached through
11238        the set_smod_pow2_cheap check in expmed.c.  */
11239       if (CONST_INT_P (XEXP (x, 1))
11240           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11241           && (mode == SImode || mode == DImode))
11242         {
11243           /* We expand to 4 instructions.  Reset the baseline.  */
11244           *cost = COSTS_N_INSNS (4);
11245
11246           if (speed)
11247             *cost += 2 * extra_cost->alu.logical
11248                      + 2 * extra_cost->alu.arith;
11249
11250           return true;
11251         }
11252
11253     /* Fall-through.  */
11254     case UMOD:
11255       if (speed)
11256         {
11257           /* Slighly prefer UMOD over SMOD.  */
11258           if (VECTOR_MODE_P (mode))
11259             *cost += extra_cost->vect.alu;
11260           else if (GET_MODE_CLASS (mode) == MODE_INT)
11261             *cost += (extra_cost->mult[mode == DImode].add
11262                       + extra_cost->mult[mode == DImode].idiv
11263                       + (code == MOD ? 1 : 0));
11264         }
11265       return false;  /* All arguments need to be in registers.  */
11266
11267     case DIV:
11268     case UDIV:
11269     case SQRT:
11270       if (speed)
11271         {
11272           if (VECTOR_MODE_P (mode))
11273             *cost += extra_cost->vect.alu;
11274           else if (GET_MODE_CLASS (mode) == MODE_INT)
11275             /* There is no integer SQRT, so only DIV and UDIV can get
11276                here.  */
11277             *cost += (extra_cost->mult[mode == DImode].idiv
11278                      /* Slighly prefer UDIV over SDIV.  */
11279                      + (code == DIV ? 1 : 0));
11280           else
11281             *cost += extra_cost->fp[mode == DFmode].div;
11282         }
11283       return false;  /* All arguments need to be in registers.  */
11284
11285     case IF_THEN_ELSE:
11286       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11287                                          XEXP (x, 2), cost, speed);
11288
11289     case EQ:
11290     case NE:
11291     case GT:
11292     case GTU:
11293     case LT:
11294     case LTU:
11295     case GE:
11296     case GEU:
11297     case LE:
11298     case LEU:
11299
11300       return false; /* All arguments must be in registers.  */
11301
11302     case FMA:
11303       op0 = XEXP (x, 0);
11304       op1 = XEXP (x, 1);
11305       op2 = XEXP (x, 2);
11306
11307       if (speed)
11308         {
11309           if (VECTOR_MODE_P (mode))
11310             *cost += extra_cost->vect.alu;
11311           else
11312             *cost += extra_cost->fp[mode == DFmode].fma;
11313         }
11314
11315       /* FMSUB, FNMADD, and FNMSUB are free.  */
11316       if (GET_CODE (op0) == NEG)
11317         op0 = XEXP (op0, 0);
11318
11319       if (GET_CODE (op2) == NEG)
11320         op2 = XEXP (op2, 0);
11321
11322       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11323          and the by-element operand as operand 0.  */
11324       if (GET_CODE (op1) == NEG)
11325         op1 = XEXP (op1, 0);
11326
11327       /* Catch vector-by-element operations.  The by-element operand can
11328          either be (vec_duplicate (vec_select (x))) or just
11329          (vec_select (x)), depending on whether we are multiplying by
11330          a vector or a scalar.
11331
11332          Canonicalization is not very good in these cases, FMA4 will put the
11333          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
11334       if (GET_CODE (op0) == VEC_DUPLICATE)
11335         op0 = XEXP (op0, 0);
11336       else if (GET_CODE (op1) == VEC_DUPLICATE)
11337         op1 = XEXP (op1, 0);
11338
11339       if (GET_CODE (op0) == VEC_SELECT)
11340         op0 = XEXP (op0, 0);
11341       else if (GET_CODE (op1) == VEC_SELECT)
11342         op1 = XEXP (op1, 0);
11343
11344       /* If the remaining parameters are not registers,
11345          get the cost to put them into registers.  */
11346       *cost += rtx_cost (op0, mode, FMA, 0, speed);
11347       *cost += rtx_cost (op1, mode, FMA, 1, speed);
11348       *cost += rtx_cost (op2, mode, FMA, 2, speed);
11349       return true;
11350
11351     case FLOAT:
11352     case UNSIGNED_FLOAT:
11353       if (speed)
11354         *cost += extra_cost->fp[mode == DFmode].fromint;
11355       return false;
11356
11357     case FLOAT_EXTEND:
11358       if (speed)
11359         {
11360           if (VECTOR_MODE_P (mode))
11361             {
11362               /*Vector truncate.  */
11363               *cost += extra_cost->vect.alu;
11364             }
11365           else
11366             *cost += extra_cost->fp[mode == DFmode].widen;
11367         }
11368       return false;
11369
11370     case FLOAT_TRUNCATE:
11371       if (speed)
11372         {
11373           if (VECTOR_MODE_P (mode))
11374             {
11375               /*Vector conversion.  */
11376               *cost += extra_cost->vect.alu;
11377             }
11378           else
11379             *cost += extra_cost->fp[mode == DFmode].narrow;
11380         }
11381       return false;
11382
11383     case FIX:
11384     case UNSIGNED_FIX:
11385       x = XEXP (x, 0);
11386       /* Strip the rounding part.  They will all be implemented
11387          by the fcvt* family of instructions anyway.  */
11388       if (GET_CODE (x) == UNSPEC)
11389         {
11390           unsigned int uns_code = XINT (x, 1);
11391
11392           if (uns_code == UNSPEC_FRINTA
11393               || uns_code == UNSPEC_FRINTM
11394               || uns_code == UNSPEC_FRINTN
11395               || uns_code == UNSPEC_FRINTP
11396               || uns_code == UNSPEC_FRINTZ)
11397             x = XVECEXP (x, 0, 0);
11398         }
11399
11400       if (speed)
11401         {
11402           if (VECTOR_MODE_P (mode))
11403             *cost += extra_cost->vect.alu;
11404           else
11405             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11406         }
11407
11408       /* We can combine fmul by a power of 2 followed by a fcvt into a single
11409          fixed-point fcvt.  */
11410       if (GET_CODE (x) == MULT
11411           && ((VECTOR_MODE_P (mode)
11412                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11413               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11414         {
11415           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11416                              0, speed);
11417           return true;
11418         }
11419
11420       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11421       return true;
11422
11423     case ABS:
11424       if (VECTOR_MODE_P (mode))
11425         {
11426           /* ABS (vector).  */
11427           if (speed)
11428             *cost += extra_cost->vect.alu;
11429         }
11430       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11431         {
11432           op0 = XEXP (x, 0);
11433
11434           /* FABD, which is analogous to FADD.  */
11435           if (GET_CODE (op0) == MINUS)
11436             {
11437               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11438               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11439               if (speed)
11440                 *cost += extra_cost->fp[mode == DFmode].addsub;
11441
11442               return true;
11443             }
11444           /* Simple FABS is analogous to FNEG.  */
11445           if (speed)
11446             *cost += extra_cost->fp[mode == DFmode].neg;
11447         }
11448       else
11449         {
11450           /* Integer ABS will either be split to
11451              two arithmetic instructions, or will be an ABS
11452              (scalar), which we don't model.  */
11453           *cost = COSTS_N_INSNS (2);
11454           if (speed)
11455             *cost += 2 * extra_cost->alu.arith;
11456         }
11457       return false;
11458
11459     case SMAX:
11460     case SMIN:
11461       if (speed)
11462         {
11463           if (VECTOR_MODE_P (mode))
11464             *cost += extra_cost->vect.alu;
11465           else
11466             {
11467               /* FMAXNM/FMINNM/FMAX/FMIN.
11468                  TODO: This may not be accurate for all implementations, but
11469                  we do not model this in the cost tables.  */
11470               *cost += extra_cost->fp[mode == DFmode].addsub;
11471             }
11472         }
11473       return false;
11474
11475     case UNSPEC:
11476       /* The floating point round to integer frint* instructions.  */
11477       if (aarch64_frint_unspec_p (XINT (x, 1)))
11478         {
11479           if (speed)
11480             *cost += extra_cost->fp[mode == DFmode].roundint;
11481
11482           return false;
11483         }
11484
11485       if (XINT (x, 1) == UNSPEC_RBIT)
11486         {
11487           if (speed)
11488             *cost += extra_cost->alu.rev;
11489
11490           return false;
11491         }
11492       break;
11493
11494     case TRUNCATE:
11495
11496       /* Decompose <su>muldi3_highpart.  */
11497       if (/* (truncate:DI  */
11498           mode == DImode
11499           /*   (lshiftrt:TI  */
11500           && GET_MODE (XEXP (x, 0)) == TImode
11501           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11502           /*      (mult:TI  */
11503           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11504           /*        (ANY_EXTEND:TI (reg:DI))
11505                     (ANY_EXTEND:TI (reg:DI)))  */
11506           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11507                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11508               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11509                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11510           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11511           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11512           /*     (const_int 64)  */
11513           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11514           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11515         {
11516           /* UMULH/SMULH.  */
11517           if (speed)
11518             *cost += extra_cost->mult[mode == DImode].extend;
11519           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11520                              mode, MULT, 0, speed);
11521           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11522                              mode, MULT, 1, speed);
11523           return true;
11524         }
11525
11526       /* Fall through.  */
11527     default:
11528       break;
11529     }
11530
11531   if (dump_file
11532       && flag_aarch64_verbose_cost)
11533     fprintf (dump_file,
11534       "\nFailed to cost RTX.  Assuming default cost.\n");
11535
11536   return true;
11537 }
11538
11539 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11540    calculated for X.  This cost is stored in *COST.  Returns true
11541    if the total cost of X was calculated.  */
11542 static bool
11543 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11544                    int param, int *cost, bool speed)
11545 {
11546   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11547
11548   if (dump_file
11549       && flag_aarch64_verbose_cost)
11550     {
11551       print_rtl_single (dump_file, x);
11552       fprintf (dump_file, "\n%s cost: %d (%s)\n",
11553                speed ? "Hot" : "Cold",
11554                *cost, result ? "final" : "partial");
11555     }
11556
11557   return result;
11558 }
11559
11560 static int
11561 aarch64_register_move_cost (machine_mode mode,
11562                             reg_class_t from_i, reg_class_t to_i)
11563 {
11564   enum reg_class from = (enum reg_class) from_i;
11565   enum reg_class to = (enum reg_class) to_i;
11566   const struct cpu_regmove_cost *regmove_cost
11567     = aarch64_tune_params.regmove_cost;
11568
11569   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
11570   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11571     to = GENERAL_REGS;
11572
11573   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11574     from = GENERAL_REGS;
11575
11576   /* Moving between GPR and stack cost is the same as GP2GP.  */
11577   if ((from == GENERAL_REGS && to == STACK_REG)
11578       || (to == GENERAL_REGS && from == STACK_REG))
11579     return regmove_cost->GP2GP;
11580
11581   /* To/From the stack register, we move via the gprs.  */
11582   if (to == STACK_REG || from == STACK_REG)
11583     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11584             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11585
11586   if (known_eq (GET_MODE_SIZE (mode), 16))
11587     {
11588       /* 128-bit operations on general registers require 2 instructions.  */
11589       if (from == GENERAL_REGS && to == GENERAL_REGS)
11590         return regmove_cost->GP2GP * 2;
11591       else if (from == GENERAL_REGS)
11592         return regmove_cost->GP2FP * 2;
11593       else if (to == GENERAL_REGS)
11594         return regmove_cost->FP2GP * 2;
11595
11596       /* When AdvSIMD instructions are disabled it is not possible to move
11597          a 128-bit value directly between Q registers.  This is handled in
11598          secondary reload.  A general register is used as a scratch to move
11599          the upper DI value and the lower DI value is moved directly,
11600          hence the cost is the sum of three moves. */
11601       if (! TARGET_SIMD)
11602         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11603
11604       return regmove_cost->FP2FP;
11605     }
11606
11607   if (from == GENERAL_REGS && to == GENERAL_REGS)
11608     return regmove_cost->GP2GP;
11609   else if (from == GENERAL_REGS)
11610     return regmove_cost->GP2FP;
11611   else if (to == GENERAL_REGS)
11612     return regmove_cost->FP2GP;
11613
11614   return regmove_cost->FP2FP;
11615 }
11616
11617 static int
11618 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11619                           reg_class_t rclass ATTRIBUTE_UNUSED,
11620                           bool in ATTRIBUTE_UNUSED)
11621 {
11622   return aarch64_tune_params.memmov_cost;
11623 }
11624
11625 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11626    to optimize 1.0/sqrt.  */
11627
11628 static bool
11629 use_rsqrt_p (machine_mode mode)
11630 {
11631   return (!flag_trapping_math
11632           && flag_unsafe_math_optimizations
11633           && ((aarch64_tune_params.approx_modes->recip_sqrt
11634                & AARCH64_APPROX_MODE (mode))
11635               || flag_mrecip_low_precision_sqrt));
11636 }
11637
11638 /* Function to decide when to use the approximate reciprocal square root
11639    builtin.  */
11640
11641 static tree
11642 aarch64_builtin_reciprocal (tree fndecl)
11643 {
11644   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11645
11646   if (!use_rsqrt_p (mode))
11647     return NULL_TREE;
11648   return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11649 }
11650
11651 /* Emit instruction sequence to compute either the approximate square root
11652    or its approximate reciprocal, depending on the flag RECP, and return
11653    whether the sequence was emitted or not.  */
11654
11655 bool
11656 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11657 {
11658   machine_mode mode = GET_MODE (dst);
11659
11660   if (GET_MODE_INNER (mode) == HFmode)
11661     {
11662       gcc_assert (!recp);
11663       return false;
11664     }
11665
11666   if (!recp)
11667     {
11668       if (!(flag_mlow_precision_sqrt
11669             || (aarch64_tune_params.approx_modes->sqrt
11670                 & AARCH64_APPROX_MODE (mode))))
11671         return false;
11672
11673       if (flag_finite_math_only
11674           || flag_trapping_math
11675           || !flag_unsafe_math_optimizations
11676           || optimize_function_for_size_p (cfun))
11677         return false;
11678     }
11679   else
11680     /* Caller assumes we cannot fail.  */
11681     gcc_assert (use_rsqrt_p (mode));
11682
11683   machine_mode mmsk = mode_for_int_vector (mode).require ();
11684   rtx xmsk = gen_reg_rtx (mmsk);
11685   if (!recp)
11686     /* When calculating the approximate square root, compare the
11687        argument with 0.0 and create a mask.  */
11688     emit_insn (gen_rtx_SET (xmsk,
11689                             gen_rtx_NEG (mmsk,
11690                                          gen_rtx_EQ (mmsk, src,
11691                                                      CONST0_RTX (mode)))));
11692
11693   /* Estimate the approximate reciprocal square root.  */
11694   rtx xdst = gen_reg_rtx (mode);
11695   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11696
11697   /* Iterate over the series twice for SF and thrice for DF.  */
11698   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11699
11700   /* Optionally iterate over the series once less for faster performance
11701      while sacrificing the accuracy.  */
11702   if ((recp && flag_mrecip_low_precision_sqrt)
11703       || (!recp && flag_mlow_precision_sqrt))
11704     iterations--;
11705
11706   /* Iterate over the series to calculate the approximate reciprocal square
11707      root.  */
11708   rtx x1 = gen_reg_rtx (mode);
11709   while (iterations--)
11710     {
11711       rtx x2 = gen_reg_rtx (mode);
11712       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11713
11714       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11715
11716       if (iterations > 0)
11717         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11718     }
11719
11720   if (!recp)
11721     {
11722       /* Qualify the approximate reciprocal square root when the argument is
11723          0.0 by squashing the intermediary result to 0.0.  */
11724       rtx xtmp = gen_reg_rtx (mmsk);
11725       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11726                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
11727       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11728
11729       /* Calculate the approximate square root.  */
11730       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11731     }
11732
11733   /* Finalize the approximation.  */
11734   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11735
11736   return true;
11737 }
11738
11739 /* Emit the instruction sequence to compute the approximation for the division
11740    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
11741
11742 bool
11743 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11744 {
11745   machine_mode mode = GET_MODE (quo);
11746
11747   if (GET_MODE_INNER (mode) == HFmode)
11748     return false;
11749
11750   bool use_approx_division_p = (flag_mlow_precision_div
11751                                 || (aarch64_tune_params.approx_modes->division
11752                                     & AARCH64_APPROX_MODE (mode)));
11753
11754   if (!flag_finite_math_only
11755       || flag_trapping_math
11756       || !flag_unsafe_math_optimizations
11757       || optimize_function_for_size_p (cfun)
11758       || !use_approx_division_p)
11759     return false;
11760
11761   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11762     return false;
11763
11764   /* Estimate the approximate reciprocal.  */
11765   rtx xrcp = gen_reg_rtx (mode);
11766   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11767
11768   /* Iterate over the series twice for SF and thrice for DF.  */
11769   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11770
11771   /* Optionally iterate over the series once less for faster performance,
11772      while sacrificing the accuracy.  */
11773   if (flag_mlow_precision_div)
11774     iterations--;
11775
11776   /* Iterate over the series to calculate the approximate reciprocal.  */
11777   rtx xtmp = gen_reg_rtx (mode);
11778   while (iterations--)
11779     {
11780       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11781
11782       if (iterations > 0)
11783         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11784     }
11785
11786   if (num != CONST1_RTX (mode))
11787     {
11788       /* As the approximate reciprocal of DEN is already calculated, only
11789          calculate the approximate division when NUM is not 1.0.  */
11790       rtx xnum = force_reg (mode, num);
11791       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11792     }
11793
11794   /* Finalize the approximation.  */
11795   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11796   return true;
11797 }
11798
11799 /* Return the number of instructions that can be issued per cycle.  */
11800 static int
11801 aarch64_sched_issue_rate (void)
11802 {
11803   return aarch64_tune_params.issue_rate;
11804 }
11805
11806 static int
11807 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11808 {
11809   int issue_rate = aarch64_sched_issue_rate ();
11810
11811   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11812 }
11813
11814
11815 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11816    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
11817    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
11818
11819 static int
11820 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11821                                                     int ready_index)
11822 {
11823   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11824 }
11825
11826
11827 /* Vectorizer cost model target hooks.  */
11828
11829 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
11830 static int
11831 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11832                                     tree vectype,
11833                                     int misalign ATTRIBUTE_UNUSED)
11834 {
11835   unsigned elements;
11836   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11837   bool fp = false;
11838
11839   if (vectype != NULL)
11840     fp = FLOAT_TYPE_P (vectype);
11841
11842   switch (type_of_cost)
11843     {
11844       case scalar_stmt:
11845         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11846
11847       case scalar_load:
11848         return costs->scalar_load_cost;
11849
11850       case scalar_store:
11851         return costs->scalar_store_cost;
11852
11853       case vector_stmt:
11854         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11855
11856       case vector_load:
11857         return costs->vec_align_load_cost;
11858
11859       case vector_store:
11860         return costs->vec_store_cost;
11861
11862       case vec_to_scalar:
11863         return costs->vec_to_scalar_cost;
11864
11865       case scalar_to_vec:
11866         return costs->scalar_to_vec_cost;
11867
11868       case unaligned_load:
11869       case vector_gather_load:
11870         return costs->vec_unalign_load_cost;
11871
11872       case unaligned_store:
11873       case vector_scatter_store:
11874         return costs->vec_unalign_store_cost;
11875
11876       case cond_branch_taken:
11877         return costs->cond_taken_branch_cost;
11878
11879       case cond_branch_not_taken:
11880         return costs->cond_not_taken_branch_cost;
11881
11882       case vec_perm:
11883         return costs->vec_permute_cost;
11884
11885       case vec_promote_demote:
11886         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11887
11888       case vec_construct:
11889         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11890         return elements / 2 + 1;
11891
11892       default:
11893         gcc_unreachable ();
11894     }
11895 }
11896
11897 /* Implement targetm.vectorize.add_stmt_cost.  */
11898 static unsigned
11899 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11900                        struct _stmt_vec_info *stmt_info, int misalign,
11901                        enum vect_cost_model_location where)
11902 {
11903   unsigned *cost = (unsigned *) data;
11904   unsigned retval = 0;
11905
11906   if (flag_vect_cost_model)
11907     {
11908       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11909       int stmt_cost =
11910             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11911
11912       /* Statements in an inner loop relative to the loop being
11913          vectorized are weighted more heavily.  The value here is
11914          arbitrary and could potentially be improved with analysis.  */
11915       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11916         count *= 50; /*  FIXME  */
11917
11918       retval = (unsigned) (count * stmt_cost);
11919       cost[where] += retval;
11920     }
11921
11922   return retval;
11923 }
11924
11925 static void initialize_aarch64_code_model (struct gcc_options *);
11926
11927 /* Parse the TO_PARSE string and put the architecture struct that it
11928    selects into RES and the architectural features into ISA_FLAGS.
11929    Return an aarch64_parse_opt_result describing the parse result.
11930    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11931    When the TO_PARSE string contains an invalid extension,
11932    a copy of the string is created and stored to INVALID_EXTENSION.  */
11933
11934 static enum aarch64_parse_opt_result
11935 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11936                     uint64_t *isa_flags, std::string *invalid_extension)
11937 {
11938   const char *ext;
11939   const struct processor *arch;
11940   size_t len;
11941
11942   ext = strchr (to_parse, '+');
11943
11944   if (ext != NULL)
11945     len = ext - to_parse;
11946   else
11947     len = strlen (to_parse);
11948
11949   if (len == 0)
11950     return AARCH64_PARSE_MISSING_ARG;
11951
11952
11953   /* Loop through the list of supported ARCHes to find a match.  */
11954   for (arch = all_architectures; arch->name != NULL; arch++)
11955     {
11956       if (strlen (arch->name) == len
11957           && strncmp (arch->name, to_parse, len) == 0)
11958         {
11959           uint64_t isa_temp = arch->flags;
11960
11961           if (ext != NULL)
11962             {
11963               /* TO_PARSE string contains at least one extension.  */
11964               enum aarch64_parse_opt_result ext_res
11965                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11966
11967               if (ext_res != AARCH64_PARSE_OK)
11968                 return ext_res;
11969             }
11970           /* Extension parsing was successful.  Confirm the result
11971              arch and ISA flags.  */
11972           *res = arch;
11973           *isa_flags = isa_temp;
11974           return AARCH64_PARSE_OK;
11975         }
11976     }
11977
11978   /* ARCH name not found in list.  */
11979   return AARCH64_PARSE_INVALID_ARG;
11980 }
11981
11982 /* Parse the TO_PARSE string and put the result tuning in RES and the
11983    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
11984    describing the parse result.  If there is an error parsing, RES and
11985    ISA_FLAGS are left unchanged.
11986    When the TO_PARSE string contains an invalid extension,
11987    a copy of the string is created and stored to INVALID_EXTENSION.  */
11988
11989 static enum aarch64_parse_opt_result
11990 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11991                    uint64_t *isa_flags, std::string *invalid_extension)
11992 {
11993   const char *ext;
11994   const struct processor *cpu;
11995   size_t len;
11996
11997   ext = strchr (to_parse, '+');
11998
11999   if (ext != NULL)
12000     len = ext - to_parse;
12001   else
12002     len = strlen (to_parse);
12003
12004   if (len == 0)
12005     return AARCH64_PARSE_MISSING_ARG;
12006
12007
12008   /* Loop through the list of supported CPUs to find a match.  */
12009   for (cpu = all_cores; cpu->name != NULL; cpu++)
12010     {
12011       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12012         {
12013           uint64_t isa_temp = cpu->flags;
12014
12015
12016           if (ext != NULL)
12017             {
12018               /* TO_PARSE string contains at least one extension.  */
12019               enum aarch64_parse_opt_result ext_res
12020                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12021
12022               if (ext_res != AARCH64_PARSE_OK)
12023                 return ext_res;
12024             }
12025           /* Extension parsing was successfull.  Confirm the result
12026              cpu and ISA flags.  */
12027           *res = cpu;
12028           *isa_flags = isa_temp;
12029           return AARCH64_PARSE_OK;
12030         }
12031     }
12032
12033   /* CPU name not found in list.  */
12034   return AARCH64_PARSE_INVALID_ARG;
12035 }
12036
12037 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12038    Return an aarch64_parse_opt_result describing the parse result.
12039    If the parsing fails the RES does not change.  */
12040
12041 static enum aarch64_parse_opt_result
12042 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12043 {
12044   const struct processor *cpu;
12045
12046   /* Loop through the list of supported CPUs to find a match.  */
12047   for (cpu = all_cores; cpu->name != NULL; cpu++)
12048     {
12049       if (strcmp (cpu->name, to_parse) == 0)
12050         {
12051           *res = cpu;
12052           return AARCH64_PARSE_OK;
12053         }
12054     }
12055
12056   /* CPU name not found in list.  */
12057   return AARCH64_PARSE_INVALID_ARG;
12058 }
12059
12060 /* Parse TOKEN, which has length LENGTH to see if it is an option
12061    described in FLAG.  If it is, return the index bit for that fusion type.
12062    If not, error (printing OPTION_NAME) and return zero.  */
12063
12064 static unsigned int
12065 aarch64_parse_one_option_token (const char *token,
12066                                 size_t length,
12067                                 const struct aarch64_flag_desc *flag,
12068                                 const char *option_name)
12069 {
12070   for (; flag->name != NULL; flag++)
12071     {
12072       if (length == strlen (flag->name)
12073           && !strncmp (flag->name, token, length))
12074         return flag->flag;
12075     }
12076
12077   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12078   return 0;
12079 }
12080
12081 /* Parse OPTION which is a comma-separated list of flags to enable.
12082    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12083    default state we inherit from the CPU tuning structures.  OPTION_NAME
12084    gives the top-level option we are parsing in the -moverride string,
12085    for use in error messages.  */
12086
12087 static unsigned int
12088 aarch64_parse_boolean_options (const char *option,
12089                                const struct aarch64_flag_desc *flags,
12090                                unsigned int initial_state,
12091                                const char *option_name)
12092 {
12093   const char separator = '.';
12094   const char* specs = option;
12095   const char* ntoken = option;
12096   unsigned int found_flags = initial_state;
12097
12098   while ((ntoken = strchr (specs, separator)))
12099     {
12100       size_t token_length = ntoken - specs;
12101       unsigned token_ops = aarch64_parse_one_option_token (specs,
12102                                                            token_length,
12103                                                            flags,
12104                                                            option_name);
12105       /* If we find "none" (or, for simplicity's sake, an error) anywhere
12106          in the token stream, reset the supported operations.  So:
12107
12108            adrp+add.cmp+branch.none.adrp+add
12109
12110            would have the result of turning on only adrp+add fusion.  */
12111       if (!token_ops)
12112         found_flags = 0;
12113
12114       found_flags |= token_ops;
12115       specs = ++ntoken;
12116     }
12117
12118   /* We ended with a comma, print something.  */
12119   if (!(*specs))
12120     {
12121       error ("%s string ill-formed\n", option_name);
12122       return 0;
12123     }
12124
12125   /* We still have one more token to parse.  */
12126   size_t token_length = strlen (specs);
12127   unsigned token_ops = aarch64_parse_one_option_token (specs,
12128                                                        token_length,
12129                                                        flags,
12130                                                        option_name);
12131    if (!token_ops)
12132      found_flags = 0;
12133
12134   found_flags |= token_ops;
12135   return found_flags;
12136 }
12137
12138 /* Support for overriding instruction fusion.  */
12139
12140 static void
12141 aarch64_parse_fuse_string (const char *fuse_string,
12142                             struct tune_params *tune)
12143 {
12144   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12145                                                      aarch64_fusible_pairs,
12146                                                      tune->fusible_ops,
12147                                                      "fuse=");
12148 }
12149
12150 /* Support for overriding other tuning flags.  */
12151
12152 static void
12153 aarch64_parse_tune_string (const char *tune_string,
12154                             struct tune_params *tune)
12155 {
12156   tune->extra_tuning_flags
12157     = aarch64_parse_boolean_options (tune_string,
12158                                      aarch64_tuning_flags,
12159                                      tune->extra_tuning_flags,
12160                                      "tune=");
12161 }
12162
12163 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12164    Accept the valid SVE vector widths allowed by
12165    aarch64_sve_vector_bits_enum and use it to override sve_width
12166    in TUNE.  */
12167
12168 static void
12169 aarch64_parse_sve_width_string (const char *tune_string,
12170                                 struct tune_params *tune)
12171 {
12172   int width = -1;
12173
12174   int n = sscanf (tune_string, "%d", &width);
12175   if (n == EOF)
12176     {
12177       error ("invalid format for sve_width");
12178       return;
12179     }
12180   switch (width)
12181     {
12182     case SVE_128:
12183     case SVE_256:
12184     case SVE_512:
12185     case SVE_1024:
12186     case SVE_2048:
12187       break;
12188     default:
12189       error ("invalid sve_width value: %d", width);
12190     }
12191   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12192 }
12193
12194 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12195    we understand.  If it is, extract the option string and handoff to
12196    the appropriate function.  */
12197
12198 void
12199 aarch64_parse_one_override_token (const char* token,
12200                                   size_t length,
12201                                   struct tune_params *tune)
12202 {
12203   const struct aarch64_tuning_override_function *fn
12204     = aarch64_tuning_override_functions;
12205
12206   const char *option_part = strchr (token, '=');
12207   if (!option_part)
12208     {
12209       error ("tuning string missing in option (%s)", token);
12210       return;
12211     }
12212
12213   /* Get the length of the option name.  */
12214   length = option_part - token;
12215   /* Skip the '=' to get to the option string.  */
12216   option_part++;
12217
12218   for (; fn->name != NULL; fn++)
12219     {
12220       if (!strncmp (fn->name, token, length))
12221         {
12222           fn->parse_override (option_part, tune);
12223           return;
12224         }
12225     }
12226
12227   error ("unknown tuning option (%s)",token);
12228   return;
12229 }
12230
12231 /* A checking mechanism for the implementation of the tls size.  */
12232
12233 static void
12234 initialize_aarch64_tls_size (struct gcc_options *opts)
12235 {
12236   if (aarch64_tls_size == 0)
12237     aarch64_tls_size = 24;
12238
12239   switch (opts->x_aarch64_cmodel_var)
12240     {
12241     case AARCH64_CMODEL_TINY:
12242       /* Both the default and maximum TLS size allowed under tiny is 1M which
12243          needs two instructions to address, so we clamp the size to 24.  */
12244       if (aarch64_tls_size > 24)
12245         aarch64_tls_size = 24;
12246       break;
12247     case AARCH64_CMODEL_SMALL:
12248       /* The maximum TLS size allowed under small is 4G.  */
12249       if (aarch64_tls_size > 32)
12250         aarch64_tls_size = 32;
12251       break;
12252     case AARCH64_CMODEL_LARGE:
12253       /* The maximum TLS size allowed under large is 16E.
12254          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
12255       if (aarch64_tls_size > 48)
12256         aarch64_tls_size = 48;
12257       break;
12258     default:
12259       gcc_unreachable ();
12260     }
12261
12262   return;
12263 }
12264
12265 /* Parse STRING looking for options in the format:
12266      string     :: option:string
12267      option     :: name=substring
12268      name       :: {a-z}
12269      substring  :: defined by option.  */
12270
12271 static void
12272 aarch64_parse_override_string (const char* input_string,
12273                                struct tune_params* tune)
12274 {
12275   const char separator = ':';
12276   size_t string_length = strlen (input_string) + 1;
12277   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12278   char *string = string_root;
12279   strncpy (string, input_string, string_length);
12280   string[string_length - 1] = '\0';
12281
12282   char* ntoken = string;
12283
12284   while ((ntoken = strchr (string, separator)))
12285     {
12286       size_t token_length = ntoken - string;
12287       /* Make this substring look like a string.  */
12288       *ntoken = '\0';
12289       aarch64_parse_one_override_token (string, token_length, tune);
12290       string = ++ntoken;
12291     }
12292
12293   /* One last option to parse.  */
12294   aarch64_parse_one_override_token (string, strlen (string), tune);
12295   free (string_root);
12296 }
12297
12298
12299 static void
12300 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12301 {
12302   if (accepted_branch_protection_string)
12303     {
12304       opts->x_aarch64_branch_protection_string
12305         = xstrdup (accepted_branch_protection_string);
12306     }
12307
12308   /* PR 70044: We have to be careful about being called multiple times for the
12309      same function.  This means all changes should be repeatable.  */
12310
12311   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12312      Disable the frame pointer flag so the mid-end will not use a frame
12313      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12314      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12315      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
12316   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12317   if (opts->x_flag_omit_frame_pointer == 0)
12318     opts->x_flag_omit_frame_pointer = 2;
12319
12320   /* If not optimizing for size, set the default
12321      alignment to what the target wants.  */
12322   if (!opts->x_optimize_size)
12323     {
12324       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12325         opts->x_str_align_loops = aarch64_tune_params.loop_align;
12326       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12327         opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12328       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12329         opts->x_str_align_functions = aarch64_tune_params.function_align;
12330     }
12331
12332   /* We default to no pc-relative literal loads.  */
12333
12334   aarch64_pcrelative_literal_loads = false;
12335
12336   /* If -mpc-relative-literal-loads is set on the command line, this
12337      implies that the user asked for PC relative literal loads.  */
12338   if (opts->x_pcrelative_literal_loads == 1)
12339     aarch64_pcrelative_literal_loads = true;
12340
12341   /* In the tiny memory model it makes no sense to disallow PC relative
12342      literal pool loads.  */
12343   if (aarch64_cmodel == AARCH64_CMODEL_TINY
12344       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12345     aarch64_pcrelative_literal_loads = true;
12346
12347   /* When enabling the lower precision Newton series for the square root, also
12348      enable it for the reciprocal square root, since the latter is an
12349      intermediary step for the former.  */
12350   if (flag_mlow_precision_sqrt)
12351     flag_mrecip_low_precision_sqrt = true;
12352 }
12353
12354 /* 'Unpack' up the internal tuning structs and update the options
12355     in OPTS.  The caller must have set up selected_tune and selected_arch
12356     as all the other target-specific codegen decisions are
12357     derived from them.  */
12358
12359 void
12360 aarch64_override_options_internal (struct gcc_options *opts)
12361 {
12362   aarch64_tune_flags = selected_tune->flags;
12363   aarch64_tune = selected_tune->sched_core;
12364   /* Make a copy of the tuning parameters attached to the core, which
12365      we may later overwrite.  */
12366   aarch64_tune_params = *(selected_tune->tune);
12367   aarch64_architecture_version = selected_arch->architecture_version;
12368
12369   if (opts->x_aarch64_override_tune_string)
12370     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12371                                   &aarch64_tune_params);
12372
12373   /* This target defaults to strict volatile bitfields.  */
12374   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12375     opts->x_flag_strict_volatile_bitfields = 1;
12376
12377   if (aarch64_stack_protector_guard == SSP_GLOBAL
12378       && opts->x_aarch64_stack_protector_guard_offset_str)
12379     {
12380       error ("incompatible options %<-mstack-protector-guard=global%> and "
12381              "%<-mstack-protector-guard-offset=%s%>",
12382              aarch64_stack_protector_guard_offset_str);
12383     }
12384
12385   if (aarch64_stack_protector_guard == SSP_SYSREG
12386       && !(opts->x_aarch64_stack_protector_guard_offset_str
12387            && opts->x_aarch64_stack_protector_guard_reg_str))
12388     {
12389       error ("both %<-mstack-protector-guard-offset%> and "
12390              "%<-mstack-protector-guard-reg%> must be used "
12391              "with %<-mstack-protector-guard=sysreg%>");
12392     }
12393
12394   if (opts->x_aarch64_stack_protector_guard_reg_str)
12395     {
12396       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12397           error ("specify a system register with a small string length.");
12398     }
12399
12400   if (opts->x_aarch64_stack_protector_guard_offset_str)
12401     {
12402       char *end;
12403       const char *str = aarch64_stack_protector_guard_offset_str;
12404       errno = 0;
12405       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12406       if (!*str || *end || errno)
12407         error ("%qs is not a valid offset in %qs", str,
12408                "-mstack-protector-guard-offset=");
12409       aarch64_stack_protector_guard_offset = offs;
12410     }
12411
12412   initialize_aarch64_code_model (opts);
12413   initialize_aarch64_tls_size (opts);
12414
12415   int queue_depth = 0;
12416   switch (aarch64_tune_params.autoprefetcher_model)
12417     {
12418       case tune_params::AUTOPREFETCHER_OFF:
12419         queue_depth = -1;
12420         break;
12421       case tune_params::AUTOPREFETCHER_WEAK:
12422         queue_depth = 0;
12423         break;
12424       case tune_params::AUTOPREFETCHER_STRONG:
12425         queue_depth = max_insn_queue_index + 1;
12426         break;
12427       default:
12428         gcc_unreachable ();
12429     }
12430
12431   /* We don't mind passing in global_options_set here as we don't use
12432      the *options_set structs anyway.  */
12433   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12434                          queue_depth,
12435                          opts->x_param_values,
12436                          global_options_set.x_param_values);
12437
12438   /* Set up parameters to be used in prefetching algorithm.  Do not
12439      override the defaults unless we are tuning for a core we have
12440      researched values for.  */
12441   if (aarch64_tune_params.prefetch->num_slots > 0)
12442     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12443                            aarch64_tune_params.prefetch->num_slots,
12444                            opts->x_param_values,
12445                            global_options_set.x_param_values);
12446   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12447     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12448                            aarch64_tune_params.prefetch->l1_cache_size,
12449                            opts->x_param_values,
12450                            global_options_set.x_param_values);
12451   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12452     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12453                            aarch64_tune_params.prefetch->l1_cache_line_size,
12454                            opts->x_param_values,
12455                            global_options_set.x_param_values);
12456   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12457     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12458                            aarch64_tune_params.prefetch->l2_cache_size,
12459                            opts->x_param_values,
12460                            global_options_set.x_param_values);
12461   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12462     maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12463                            0,
12464                            opts->x_param_values,
12465                            global_options_set.x_param_values);
12466   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12467     maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12468                            aarch64_tune_params.prefetch->minimum_stride,
12469                            opts->x_param_values,
12470                            global_options_set.x_param_values);
12471
12472   /* Use the alternative scheduling-pressure algorithm by default.  */
12473   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12474                          opts->x_param_values,
12475                          global_options_set.x_param_values);
12476
12477   /* If the user hasn't changed it via configure then set the default to 64 KB
12478      for the backend.  */
12479   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12480                          DEFAULT_STK_CLASH_GUARD_SIZE == 0
12481                            ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12482                          opts->x_param_values,
12483                          global_options_set.x_param_values);
12484
12485   /* Validate the guard size.  */
12486   int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12487
12488   /* Enforce that interval is the same size as size so the mid-end does the
12489      right thing.  */
12490   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12491                          guard_size,
12492                          opts->x_param_values,
12493                          global_options_set.x_param_values);
12494
12495   /* The maybe_set calls won't update the value if the user has explicitly set
12496      one.  Which means we need to validate that probing interval and guard size
12497      are equal.  */
12498   int probe_interval
12499     = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12500   if (guard_size != probe_interval)
12501     error ("stack clash guard size %<%d%> must be equal to probing interval "
12502            "%<%d%>", guard_size, probe_interval);
12503
12504   /* Enable sw prefetching at specified optimization level for
12505      CPUS that have prefetch.  Lower optimization level threshold by 1
12506      when profiling is enabled.  */
12507   if (opts->x_flag_prefetch_loop_arrays < 0
12508       && !opts->x_optimize_size
12509       && aarch64_tune_params.prefetch->default_opt_level >= 0
12510       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12511     opts->x_flag_prefetch_loop_arrays = 1;
12512
12513   if (opts->x_aarch64_arch_string == NULL)
12514     opts->x_aarch64_arch_string = selected_arch->name;
12515   if (opts->x_aarch64_cpu_string == NULL)
12516     opts->x_aarch64_cpu_string = selected_cpu->name;
12517   if (opts->x_aarch64_tune_string == NULL)
12518     opts->x_aarch64_tune_string = selected_tune->name;
12519
12520   aarch64_override_options_after_change_1 (opts);
12521 }
12522
12523 /* Print a hint with a suggestion for a core or architecture name that
12524    most closely resembles what the user passed in STR.  ARCH is true if
12525    the user is asking for an architecture name.  ARCH is false if the user
12526    is asking for a core name.  */
12527
12528 static void
12529 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12530 {
12531   auto_vec<const char *> candidates;
12532   const struct processor *entry = arch ? all_architectures : all_cores;
12533   for (; entry->name != NULL; entry++)
12534     candidates.safe_push (entry->name);
12535
12536 #ifdef HAVE_LOCAL_CPU_DETECT
12537   /* Add also "native" as possible value.  */
12538   if (arch)
12539     candidates.safe_push ("native");
12540 #endif
12541
12542   char *s;
12543   const char *hint = candidates_list_and_hint (str, s, candidates);
12544   if (hint)
12545     inform (input_location, "valid arguments are: %s;"
12546                              " did you mean %qs?", s, hint);
12547   else
12548     inform (input_location, "valid arguments are: %s", s);
12549
12550   XDELETEVEC (s);
12551 }
12552
12553 /* Print a hint with a suggestion for a core name that most closely resembles
12554    what the user passed in STR.  */
12555
12556 inline static void
12557 aarch64_print_hint_for_core (const char *str)
12558 {
12559   aarch64_print_hint_for_core_or_arch (str, false);
12560 }
12561
12562 /* Print a hint with a suggestion for an architecture name that most closely
12563    resembles what the user passed in STR.  */
12564
12565 inline static void
12566 aarch64_print_hint_for_arch (const char *str)
12567 {
12568   aarch64_print_hint_for_core_or_arch (str, true);
12569 }
12570
12571
12572 /* Print a hint with a suggestion for an extension name
12573    that most closely resembles what the user passed in STR.  */
12574
12575 void
12576 aarch64_print_hint_for_extensions (const std::string &str)
12577 {
12578   auto_vec<const char *> candidates;
12579   aarch64_get_all_extension_candidates (&candidates);
12580   char *s;
12581   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12582   if (hint)
12583     inform (input_location, "valid arguments are: %s;"
12584                              " did you mean %qs?", s, hint);
12585   else
12586     inform (input_location, "valid arguments are: %s;", s);
12587
12588   XDELETEVEC (s);
12589 }
12590
12591 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
12592    specified in STR and throw errors if appropriate.  Put the results if
12593    they are valid in RES and ISA_FLAGS.  Return whether the option is
12594    valid.  */
12595
12596 static bool
12597 aarch64_validate_mcpu (const char *str, const struct processor **res,
12598                        uint64_t *isa_flags)
12599 {
12600   std::string invalid_extension;
12601   enum aarch64_parse_opt_result parse_res
12602     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12603
12604   if (parse_res == AARCH64_PARSE_OK)
12605     return true;
12606
12607   switch (parse_res)
12608     {
12609       case AARCH64_PARSE_MISSING_ARG:
12610         error ("missing cpu name in %<-mcpu=%s%>", str);
12611         break;
12612       case AARCH64_PARSE_INVALID_ARG:
12613         error ("unknown value %qs for %<-mcpu%>", str);
12614         aarch64_print_hint_for_core (str);
12615         break;
12616       case AARCH64_PARSE_INVALID_FEATURE:
12617         error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12618                invalid_extension.c_str (), str);
12619         aarch64_print_hint_for_extensions (invalid_extension);
12620         break;
12621       default:
12622         gcc_unreachable ();
12623     }
12624
12625   return false;
12626 }
12627
12628 /* Parses CONST_STR for branch protection features specified in
12629    aarch64_branch_protect_types, and set any global variables required.  Returns
12630    the parsing result and assigns LAST_STR to the last processed token from
12631    CONST_STR so that it can be used for error reporting.  */
12632
12633 static enum
12634 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12635                                                           char** last_str)
12636 {
12637   char *str_root = xstrdup (const_str);
12638   char* token_save = NULL;
12639   char *str = strtok_r (str_root, "+", &token_save);
12640   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12641   if (!str)
12642     res = AARCH64_PARSE_MISSING_ARG;
12643   else
12644     {
12645       char *next_str = strtok_r (NULL, "+", &token_save);
12646       /* Reset the branch protection features to their defaults.  */
12647       aarch64_handle_no_branch_protection (NULL, NULL);
12648
12649       while (str && res == AARCH64_PARSE_OK)
12650         {
12651           const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12652           bool found = false;
12653           /* Search for this type.  */
12654           while (type && type->name && !found && res == AARCH64_PARSE_OK)
12655             {
12656               if (strcmp (str, type->name) == 0)
12657                 {
12658                   found = true;
12659                   res = type->handler (str, next_str);
12660                   str = next_str;
12661                   next_str = strtok_r (NULL, "+", &token_save);
12662                 }
12663               else
12664                 type++;
12665             }
12666           if (found && res == AARCH64_PARSE_OK)
12667             {
12668               bool found_subtype = true;
12669               /* Loop through each token until we find one that isn't a
12670                  subtype.  */
12671               while (found_subtype)
12672                 {
12673                   found_subtype = false;
12674                   const aarch64_branch_protect_type *subtype = type->subtypes;
12675                   /* Search for the subtype.  */
12676                   while (str && subtype && subtype->name && !found_subtype
12677                           && res == AARCH64_PARSE_OK)
12678                     {
12679                       if (strcmp (str, subtype->name) == 0)
12680                         {
12681                           found_subtype = true;
12682                           res = subtype->handler (str, next_str);
12683                           str = next_str;
12684                           next_str = strtok_r (NULL, "+", &token_save);
12685                         }
12686                       else
12687                         subtype++;
12688                     }
12689                 }
12690             }
12691           else if (!found)
12692             res = AARCH64_PARSE_INVALID_ARG;
12693         }
12694     }
12695   /* Copy the last processed token into the argument to pass it back.
12696     Used by option and attribute validation to print the offending token.  */
12697   if (last_str)
12698     {
12699       if (str) strcpy (*last_str, str);
12700       else *last_str = NULL;
12701     }
12702   if (res == AARCH64_PARSE_OK)
12703     {
12704       /* If needed, alloc the accepted string then copy in const_str.
12705         Used by override_option_after_change_1.  */
12706       if (!accepted_branch_protection_string)
12707         accepted_branch_protection_string = (char *) xmalloc (
12708                                                       BRANCH_PROTECT_STR_MAX
12709                                                         + 1);
12710       strncpy (accepted_branch_protection_string, const_str,
12711                 BRANCH_PROTECT_STR_MAX + 1);
12712       /* Forcibly null-terminate.  */
12713       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12714     }
12715   return res;
12716 }
12717
12718 static bool
12719 aarch64_validate_mbranch_protection (const char *const_str)
12720 {
12721   char *str = (char *) xmalloc (strlen (const_str));
12722   enum aarch64_parse_opt_result res =
12723     aarch64_parse_branch_protection (const_str, &str);
12724   if (res == AARCH64_PARSE_INVALID_ARG)
12725     error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12726   else if (res == AARCH64_PARSE_MISSING_ARG)
12727     error ("missing argument for %<-mbranch-protection=%>");
12728   free (str);
12729   return res == AARCH64_PARSE_OK;
12730 }
12731
12732 /* Validate a command-line -march option.  Parse the arch and extensions
12733    (if any) specified in STR and throw errors if appropriate.  Put the
12734    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
12735    option is valid.  */
12736
12737 static bool
12738 aarch64_validate_march (const char *str, const struct processor **res,
12739                          uint64_t *isa_flags)
12740 {
12741   std::string invalid_extension;
12742   enum aarch64_parse_opt_result parse_res
12743     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12744
12745   if (parse_res == AARCH64_PARSE_OK)
12746     return true;
12747
12748   switch (parse_res)
12749     {
12750       case AARCH64_PARSE_MISSING_ARG:
12751         error ("missing arch name in %<-march=%s%>", str);
12752         break;
12753       case AARCH64_PARSE_INVALID_ARG:
12754         error ("unknown value %qs for %<-march%>", str);
12755         aarch64_print_hint_for_arch (str);
12756         break;
12757       case AARCH64_PARSE_INVALID_FEATURE:
12758         error ("invalid feature modifier %qs in %<-march=%s%>",
12759                invalid_extension.c_str (), str);
12760         aarch64_print_hint_for_extensions (invalid_extension);
12761         break;
12762       default:
12763         gcc_unreachable ();
12764     }
12765
12766   return false;
12767 }
12768
12769 /* Validate a command-line -mtune option.  Parse the cpu
12770    specified in STR and throw errors if appropriate.  Put the
12771    result, if it is valid, in RES.  Return whether the option is
12772    valid.  */
12773
12774 static bool
12775 aarch64_validate_mtune (const char *str, const struct processor **res)
12776 {
12777   enum aarch64_parse_opt_result parse_res
12778     = aarch64_parse_tune (str, res);
12779
12780   if (parse_res == AARCH64_PARSE_OK)
12781     return true;
12782
12783   switch (parse_res)
12784     {
12785       case AARCH64_PARSE_MISSING_ARG:
12786         error ("missing cpu name in %<-mtune=%s%>", str);
12787         break;
12788       case AARCH64_PARSE_INVALID_ARG:
12789         error ("unknown value %qs for %<-mtune%>", str);
12790         aarch64_print_hint_for_core (str);
12791         break;
12792       default:
12793         gcc_unreachable ();
12794     }
12795   return false;
12796 }
12797
12798 /* Return the CPU corresponding to the enum CPU.
12799    If it doesn't specify a cpu, return the default.  */
12800
12801 static const struct processor *
12802 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12803 {
12804   if (cpu != aarch64_none)
12805     return &all_cores[cpu];
12806
12807   /* The & 0x3f is to extract the bottom 6 bits that encode the
12808      default cpu as selected by the --with-cpu GCC configure option
12809      in config.gcc.
12810      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12811      flags mechanism should be reworked to make it more sane.  */
12812   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12813 }
12814
12815 /* Return the architecture corresponding to the enum ARCH.
12816    If it doesn't specify a valid architecture, return the default.  */
12817
12818 static const struct processor *
12819 aarch64_get_arch (enum aarch64_arch arch)
12820 {
12821   if (arch != aarch64_no_arch)
12822     return &all_architectures[arch];
12823
12824   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12825
12826   return &all_architectures[cpu->arch];
12827 }
12828
12829 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
12830
12831 static poly_uint16
12832 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12833 {
12834   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12835      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12836      deciding which .md file patterns to use and when deciding whether
12837      something is a legitimate address or constant.  */
12838   if (value == SVE_SCALABLE || value == SVE_128)
12839     return poly_uint16 (2, 2);
12840   else
12841     return (int) value / 64;
12842 }
12843
12844 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
12845    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12846    tuning structs.  In particular it must set selected_tune and
12847    aarch64_isa_flags that define the available ISA features and tuning
12848    decisions.  It must also set selected_arch as this will be used to
12849    output the .arch asm tags for each function.  */
12850
12851 static void
12852 aarch64_override_options (void)
12853 {
12854   uint64_t cpu_isa = 0;
12855   uint64_t arch_isa = 0;
12856   aarch64_isa_flags = 0;
12857
12858   bool valid_cpu = true;
12859   bool valid_tune = true;
12860   bool valid_arch = true;
12861
12862   selected_cpu = NULL;
12863   selected_arch = NULL;
12864   selected_tune = NULL;
12865
12866   if (aarch64_branch_protection_string)
12867     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12868
12869   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12870      If either of -march or -mtune is given, they override their
12871      respective component of -mcpu.  */
12872   if (aarch64_cpu_string)
12873     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12874                                         &cpu_isa);
12875
12876   if (aarch64_arch_string)
12877     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12878                                           &arch_isa);
12879
12880   if (aarch64_tune_string)
12881     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12882
12883 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12884   SUBTARGET_OVERRIDE_OPTIONS;
12885 #endif
12886
12887   /* If the user did not specify a processor, choose the default
12888      one for them.  This will be the CPU set during configuration using
12889      --with-cpu, otherwise it is "generic".  */
12890   if (!selected_cpu)
12891     {
12892       if (selected_arch)
12893         {
12894           selected_cpu = &all_cores[selected_arch->ident];
12895           aarch64_isa_flags = arch_isa;
12896           explicit_arch = selected_arch->arch;
12897         }
12898       else
12899         {
12900           /* Get default configure-time CPU.  */
12901           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12902           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12903         }
12904
12905       if (selected_tune)
12906         explicit_tune_core = selected_tune->ident;
12907     }
12908   /* If both -mcpu and -march are specified check that they are architecturally
12909      compatible, warn if they're not and prefer the -march ISA flags.  */
12910   else if (selected_arch)
12911     {
12912       if (selected_arch->arch != selected_cpu->arch)
12913         {
12914           warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12915                        all_architectures[selected_cpu->arch].name,
12916                        selected_arch->name);
12917         }
12918       aarch64_isa_flags = arch_isa;
12919       explicit_arch = selected_arch->arch;
12920       explicit_tune_core = selected_tune ? selected_tune->ident
12921                                           : selected_cpu->ident;
12922     }
12923   else
12924     {
12925       /* -mcpu but no -march.  */
12926       aarch64_isa_flags = cpu_isa;
12927       explicit_tune_core = selected_tune ? selected_tune->ident
12928                                           : selected_cpu->ident;
12929       gcc_assert (selected_cpu);
12930       selected_arch = &all_architectures[selected_cpu->arch];
12931       explicit_arch = selected_arch->arch;
12932     }
12933
12934   /* Set the arch as well as we will need it when outputing
12935      the .arch directive in assembly.  */
12936   if (!selected_arch)
12937     {
12938       gcc_assert (selected_cpu);
12939       selected_arch = &all_architectures[selected_cpu->arch];
12940     }
12941
12942   if (!selected_tune)
12943     selected_tune = selected_cpu;
12944
12945   if (aarch64_enable_bti == 2)
12946     {
12947 #ifdef TARGET_ENABLE_BTI
12948       aarch64_enable_bti = 1;
12949 #else
12950       aarch64_enable_bti = 0;
12951 #endif
12952     }
12953
12954   /* Return address signing is currently not supported for ILP32 targets.  For
12955      LP64 targets use the configured option in the absence of a command-line
12956      option for -mbranch-protection.  */
12957   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12958     {
12959 #ifdef TARGET_ENABLE_PAC_RET
12960       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12961 #else
12962       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12963 #endif
12964     }
12965
12966 #ifndef HAVE_AS_MABI_OPTION
12967   /* The compiler may have been configured with 2.23.* binutils, which does
12968      not have support for ILP32.  */
12969   if (TARGET_ILP32)
12970     error ("assembler does not support %<-mabi=ilp32%>");
12971 #endif
12972
12973   /* Convert -msve-vector-bits to a VG count.  */
12974   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12975
12976   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12977     sorry ("return address signing is only supported for %<-mabi=lp64%>");
12978
12979   /* Make sure we properly set up the explicit options.  */
12980   if ((aarch64_cpu_string && valid_cpu)
12981        || (aarch64_tune_string && valid_tune))
12982     gcc_assert (explicit_tune_core != aarch64_none);
12983
12984   if ((aarch64_cpu_string && valid_cpu)
12985        || (aarch64_arch_string && valid_arch))
12986     gcc_assert (explicit_arch != aarch64_no_arch);
12987
12988   /* The pass to insert speculation tracking runs before
12989      shrink-wrapping and the latter does not know how to update the
12990      tracking status.  So disable it in this case.  */
12991   if (aarch64_track_speculation)
12992     flag_shrink_wrap = 0;
12993
12994   aarch64_override_options_internal (&global_options);
12995
12996   /* Save these options as the default ones in case we push and pop them later
12997      while processing functions with potential target attributes.  */
12998   target_option_default_node = target_option_current_node
12999       = build_target_option_node (&global_options);
13000 }
13001
13002 /* Implement targetm.override_options_after_change.  */
13003
13004 static void
13005 aarch64_override_options_after_change (void)
13006 {
13007   aarch64_override_options_after_change_1 (&global_options);
13008 }
13009
13010 static struct machine_function *
13011 aarch64_init_machine_status (void)
13012 {
13013   struct machine_function *machine;
13014   machine = ggc_cleared_alloc<machine_function> ();
13015   return machine;
13016 }
13017
13018 void
13019 aarch64_init_expanders (void)
13020 {
13021   init_machine_status = aarch64_init_machine_status;
13022 }
13023
13024 /* A checking mechanism for the implementation of the various code models.  */
13025 static void
13026 initialize_aarch64_code_model (struct gcc_options *opts)
13027 {
13028    if (opts->x_flag_pic)
13029      {
13030        switch (opts->x_aarch64_cmodel_var)
13031          {
13032          case AARCH64_CMODEL_TINY:
13033            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13034            break;
13035          case AARCH64_CMODEL_SMALL:
13036 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13037            aarch64_cmodel = (flag_pic == 2
13038                              ? AARCH64_CMODEL_SMALL_PIC
13039                              : AARCH64_CMODEL_SMALL_SPIC);
13040 #else
13041            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13042 #endif
13043            break;
13044          case AARCH64_CMODEL_LARGE:
13045            sorry ("code model %qs with %<-f%s%>", "large",
13046                   opts->x_flag_pic > 1 ? "PIC" : "pic");
13047            break;
13048          default:
13049            gcc_unreachable ();
13050          }
13051      }
13052    else
13053      aarch64_cmodel = opts->x_aarch64_cmodel_var;
13054 }
13055
13056 /* Implement TARGET_OPTION_SAVE.  */
13057
13058 static void
13059 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13060 {
13061   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13062   ptr->x_aarch64_branch_protection_string
13063     = opts->x_aarch64_branch_protection_string;
13064 }
13065
13066 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
13067    using the information saved in PTR.  */
13068
13069 static void
13070 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13071 {
13072   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13073   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13074   opts->x_explicit_arch = ptr->x_explicit_arch;
13075   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13076   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13077   opts->x_aarch64_branch_protection_string
13078     = ptr->x_aarch64_branch_protection_string;
13079   if (opts->x_aarch64_branch_protection_string)
13080     {
13081       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13082                                         NULL);
13083     }
13084
13085   aarch64_override_options_internal (opts);
13086 }
13087
13088 /* Implement TARGET_OPTION_PRINT.  */
13089
13090 static void
13091 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13092 {
13093   const struct processor *cpu
13094     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13095   uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13096   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13097   std::string extension
13098     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13099
13100   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13101   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13102            arch->name, extension.c_str ());
13103 }
13104
13105 static GTY(()) tree aarch64_previous_fndecl;
13106
13107 void
13108 aarch64_reset_previous_fndecl (void)
13109 {
13110   aarch64_previous_fndecl = NULL;
13111 }
13112
13113 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13114    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13115    make sure optab availability predicates are recomputed when necessary.  */
13116
13117 void
13118 aarch64_save_restore_target_globals (tree new_tree)
13119 {
13120   if (TREE_TARGET_GLOBALS (new_tree))
13121     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13122   else if (new_tree == target_option_default_node)
13123     restore_target_globals (&default_target_globals);
13124   else
13125     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13126 }
13127
13128 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
13129    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13130    of the function, if such exists.  This function may be called multiple
13131    times on a single function so use aarch64_previous_fndecl to avoid
13132    setting up identical state.  */
13133
13134 static void
13135 aarch64_set_current_function (tree fndecl)
13136 {
13137   if (!fndecl || fndecl == aarch64_previous_fndecl)
13138     return;
13139
13140   tree old_tree = (aarch64_previous_fndecl
13141                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13142                    : NULL_TREE);
13143
13144   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13145
13146   /* If current function has no attributes but the previous one did,
13147      use the default node.  */
13148   if (!new_tree && old_tree)
13149     new_tree = target_option_default_node;
13150
13151   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
13152      the default have been handled by aarch64_save_restore_target_globals from
13153      aarch64_pragma_target_parse.  */
13154   if (old_tree == new_tree)
13155     return;
13156
13157   aarch64_previous_fndecl = fndecl;
13158
13159   /* First set the target options.  */
13160   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13161
13162   aarch64_save_restore_target_globals (new_tree);
13163 }
13164
13165 /* Enum describing the various ways we can handle attributes.
13166    In many cases we can reuse the generic option handling machinery.  */
13167
13168 enum aarch64_attr_opt_type
13169 {
13170   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
13171   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
13172   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
13173   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
13174 };
13175
13176 /* All the information needed to handle a target attribute.
13177    NAME is the name of the attribute.
13178    ATTR_TYPE specifies the type of behavior of the attribute as described
13179    in the definition of enum aarch64_attr_opt_type.
13180    ALLOW_NEG is true if the attribute supports a "no-" form.
13181    HANDLER is the function that takes the attribute string as an argument
13182    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13183    OPT_NUM is the enum specifying the option that the attribute modifies.
13184    This is needed for attributes that mirror the behavior of a command-line
13185    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13186    aarch64_attr_enum.  */
13187
13188 struct aarch64_attribute_info
13189 {
13190   const char *name;
13191   enum aarch64_attr_opt_type attr_type;
13192   bool allow_neg;
13193   bool (*handler) (const char *);
13194   enum opt_code opt_num;
13195 };
13196
13197 /* Handle the ARCH_STR argument to the arch= target attribute.  */
13198
13199 static bool
13200 aarch64_handle_attr_arch (const char *str)
13201 {
13202   const struct processor *tmp_arch = NULL;
13203   std::string invalid_extension;
13204   enum aarch64_parse_opt_result parse_res
13205     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13206
13207   if (parse_res == AARCH64_PARSE_OK)
13208     {
13209       gcc_assert (tmp_arch);
13210       selected_arch = tmp_arch;
13211       explicit_arch = selected_arch->arch;
13212       return true;
13213     }
13214
13215   switch (parse_res)
13216     {
13217       case AARCH64_PARSE_MISSING_ARG:
13218         error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13219         break;
13220       case AARCH64_PARSE_INVALID_ARG:
13221         error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13222         aarch64_print_hint_for_arch (str);
13223         break;
13224       case AARCH64_PARSE_INVALID_FEATURE:
13225         error ("invalid feature modifier %s of value (\"%s\") in "
13226                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13227         aarch64_print_hint_for_extensions (invalid_extension);
13228         break;
13229       default:
13230         gcc_unreachable ();
13231     }
13232
13233   return false;
13234 }
13235
13236 /* Handle the argument CPU_STR to the cpu= target attribute.  */
13237
13238 static bool
13239 aarch64_handle_attr_cpu (const char *str)
13240 {
13241   const struct processor *tmp_cpu = NULL;
13242   std::string invalid_extension;
13243   enum aarch64_parse_opt_result parse_res
13244     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13245
13246   if (parse_res == AARCH64_PARSE_OK)
13247     {
13248       gcc_assert (tmp_cpu);
13249       selected_tune = tmp_cpu;
13250       explicit_tune_core = selected_tune->ident;
13251
13252       selected_arch = &all_architectures[tmp_cpu->arch];
13253       explicit_arch = selected_arch->arch;
13254       return true;
13255     }
13256
13257   switch (parse_res)
13258     {
13259       case AARCH64_PARSE_MISSING_ARG:
13260         error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13261         break;
13262       case AARCH64_PARSE_INVALID_ARG:
13263         error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13264         aarch64_print_hint_for_core (str);
13265         break;
13266       case AARCH64_PARSE_INVALID_FEATURE:
13267         error ("invalid feature modifier %s of value (\"%s\") in "
13268                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13269         aarch64_print_hint_for_extensions (invalid_extension);
13270         break;
13271       default:
13272         gcc_unreachable ();
13273     }
13274
13275   return false;
13276 }
13277
13278 /* Handle the argument STR to the branch-protection= attribute.  */
13279
13280  static bool
13281  aarch64_handle_attr_branch_protection (const char* str)
13282  {
13283   char *err_str = (char *) xmalloc (strlen (str));
13284   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13285                                                                       &err_str);
13286   bool success = false;
13287   switch (res)
13288     {
13289      case AARCH64_PARSE_MISSING_ARG:
13290        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13291               " attribute");
13292        break;
13293      case AARCH64_PARSE_INVALID_ARG:
13294        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13295               "=\")%> pragma or attribute", err_str);
13296        break;
13297      case AARCH64_PARSE_OK:
13298        success = true;
13299       /* Fall through.  */
13300      case AARCH64_PARSE_INVALID_FEATURE:
13301        break;
13302      default:
13303        gcc_unreachable ();
13304     }
13305   free (err_str);
13306   return success;
13307  }
13308
13309 /* Handle the argument STR to the tune= target attribute.  */
13310
13311 static bool
13312 aarch64_handle_attr_tune (const char *str)
13313 {
13314   const struct processor *tmp_tune = NULL;
13315   enum aarch64_parse_opt_result parse_res
13316     = aarch64_parse_tune (str, &tmp_tune);
13317
13318   if (parse_res == AARCH64_PARSE_OK)
13319     {
13320       gcc_assert (tmp_tune);
13321       selected_tune = tmp_tune;
13322       explicit_tune_core = selected_tune->ident;
13323       return true;
13324     }
13325
13326   switch (parse_res)
13327     {
13328       case AARCH64_PARSE_INVALID_ARG:
13329         error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13330         aarch64_print_hint_for_core (str);
13331         break;
13332       default:
13333         gcc_unreachable ();
13334     }
13335
13336   return false;
13337 }
13338
13339 /* Parse an architecture extensions target attribute string specified in STR.
13340    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
13341    if successful.  Update aarch64_isa_flags to reflect the ISA features
13342    modified.  */
13343
13344 static bool
13345 aarch64_handle_attr_isa_flags (char *str)
13346 {
13347   enum aarch64_parse_opt_result parse_res;
13348   uint64_t isa_flags = aarch64_isa_flags;
13349
13350   /* We allow "+nothing" in the beginning to clear out all architectural
13351      features if the user wants to handpick specific features.  */
13352   if (strncmp ("+nothing", str, 8) == 0)
13353     {
13354       isa_flags = 0;
13355       str += 8;
13356     }
13357
13358   std::string invalid_extension;
13359   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13360
13361   if (parse_res == AARCH64_PARSE_OK)
13362     {
13363       aarch64_isa_flags = isa_flags;
13364       return true;
13365     }
13366
13367   switch (parse_res)
13368     {
13369       case AARCH64_PARSE_MISSING_ARG:
13370         error ("missing value in %<target()%> pragma or attribute");
13371         break;
13372
13373       case AARCH64_PARSE_INVALID_FEATURE:
13374         error ("invalid feature modifier %s of value (\"%s\") in "
13375                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13376         break;
13377
13378       default:
13379         gcc_unreachable ();
13380     }
13381
13382  return false;
13383 }
13384
13385 /* The target attributes that we support.  On top of these we also support just
13386    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
13387    handled explicitly in aarch64_process_one_target_attr.  */
13388
13389 static const struct aarch64_attribute_info aarch64_attributes[] =
13390 {
13391   { "general-regs-only", aarch64_attr_mask, false, NULL,
13392      OPT_mgeneral_regs_only },
13393   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13394      OPT_mfix_cortex_a53_835769 },
13395   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13396      OPT_mfix_cortex_a53_843419 },
13397   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13398   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13399   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13400      OPT_momit_leaf_frame_pointer },
13401   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13402   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13403      OPT_march_ },
13404   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13405   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13406      OPT_mtune_ },
13407   { "branch-protection", aarch64_attr_custom, false,
13408      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13409   { "sign-return-address", aarch64_attr_enum, false, NULL,
13410      OPT_msign_return_address_ },
13411   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13412 };
13413
13414 /* Parse ARG_STR which contains the definition of one target attribute.
13415    Show appropriate errors if any or return true if the attribute is valid.  */
13416
13417 static bool
13418 aarch64_process_one_target_attr (char *arg_str)
13419 {
13420   bool invert = false;
13421
13422   size_t len = strlen (arg_str);
13423
13424   if (len == 0)
13425     {
13426       error ("malformed %<target()%> pragma or attribute");
13427       return false;
13428     }
13429
13430   char *str_to_check = (char *) alloca (len + 1);
13431   strcpy (str_to_check, arg_str);
13432
13433   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13434      It is easier to detect and handle it explicitly here rather than going
13435      through the machinery for the rest of the target attributes in this
13436      function.  */
13437   if (*str_to_check == '+')
13438     return aarch64_handle_attr_isa_flags (str_to_check);
13439
13440   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13441     {
13442       invert = true;
13443       str_to_check += 3;
13444     }
13445   char *arg = strchr (str_to_check, '=');
13446
13447   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13448      and point ARG to "foo".  */
13449   if (arg)
13450     {
13451       *arg = '\0';
13452       arg++;
13453     }
13454   const struct aarch64_attribute_info *p_attr;
13455   bool found = false;
13456   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13457     {
13458       /* If the names don't match up, or the user has given an argument
13459          to an attribute that doesn't accept one, or didn't give an argument
13460          to an attribute that expects one, fail to match.  */
13461       if (strcmp (str_to_check, p_attr->name) != 0)
13462         continue;
13463
13464       found = true;
13465       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13466                               || p_attr->attr_type == aarch64_attr_enum;
13467
13468       if (attr_need_arg_p ^ (arg != NULL))
13469         {
13470           error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13471           return false;
13472         }
13473
13474       /* If the name matches but the attribute does not allow "no-" versions
13475          then we can't match.  */
13476       if (invert && !p_attr->allow_neg)
13477         {
13478           error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13479           return false;
13480         }
13481
13482       switch (p_attr->attr_type)
13483         {
13484         /* Has a custom handler registered.
13485            For example, cpu=, arch=, tune=.  */
13486           case aarch64_attr_custom:
13487             gcc_assert (p_attr->handler);
13488             if (!p_attr->handler (arg))
13489               return false;
13490             break;
13491
13492           /* Either set or unset a boolean option.  */
13493           case aarch64_attr_bool:
13494             {
13495               struct cl_decoded_option decoded;
13496
13497               generate_option (p_attr->opt_num, NULL, !invert,
13498                                CL_TARGET, &decoded);
13499               aarch64_handle_option (&global_options, &global_options_set,
13500                                       &decoded, input_location);
13501               break;
13502             }
13503           /* Set or unset a bit in the target_flags.  aarch64_handle_option
13504              should know what mask to apply given the option number.  */
13505           case aarch64_attr_mask:
13506             {
13507               struct cl_decoded_option decoded;
13508               /* We only need to specify the option number.
13509                  aarch64_handle_option will know which mask to apply.  */
13510               decoded.opt_index = p_attr->opt_num;
13511               decoded.value = !invert;
13512               aarch64_handle_option (&global_options, &global_options_set,
13513                                       &decoded, input_location);
13514               break;
13515             }
13516           /* Use the option setting machinery to set an option to an enum.  */
13517           case aarch64_attr_enum:
13518             {
13519               gcc_assert (arg);
13520               bool valid;
13521               int value;
13522               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13523                                               &value, CL_TARGET);
13524               if (valid)
13525                 {
13526                   set_option (&global_options, NULL, p_attr->opt_num, value,
13527                               NULL, DK_UNSPECIFIED, input_location,
13528                               global_dc);
13529                 }
13530               else
13531                 {
13532                   error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13533                 }
13534               break;
13535             }
13536           default:
13537             gcc_unreachable ();
13538         }
13539     }
13540
13541   /* If we reached here we either have found an attribute and validated
13542      it or didn't match any.  If we matched an attribute but its arguments
13543      were malformed we will have returned false already.  */
13544   return found;
13545 }
13546
13547 /* Count how many times the character C appears in
13548    NULL-terminated string STR.  */
13549
13550 static unsigned int
13551 num_occurences_in_str (char c, char *str)
13552 {
13553   unsigned int res = 0;
13554   while (*str != '\0')
13555     {
13556       if (*str == c)
13557         res++;
13558
13559       str++;
13560     }
13561
13562   return res;
13563 }
13564
13565 /* Parse the tree in ARGS that contains the target attribute information
13566    and update the global target options space.  */
13567
13568 bool
13569 aarch64_process_target_attr (tree args)
13570 {
13571   if (TREE_CODE (args) == TREE_LIST)
13572     {
13573       do
13574         {
13575           tree head = TREE_VALUE (args);
13576           if (head)
13577             {
13578               if (!aarch64_process_target_attr (head))
13579                 return false;
13580             }
13581           args = TREE_CHAIN (args);
13582         } while (args);
13583
13584       return true;
13585     }
13586
13587   if (TREE_CODE (args) != STRING_CST)
13588     {
13589       error ("attribute %<target%> argument not a string");
13590       return false;
13591     }
13592
13593   size_t len = strlen (TREE_STRING_POINTER (args));
13594   char *str_to_check = (char *) alloca (len + 1);
13595   strcpy (str_to_check, TREE_STRING_POINTER (args));
13596
13597   if (len == 0)
13598     {
13599       error ("malformed %<target()%> pragma or attribute");
13600       return false;
13601     }
13602
13603   /* Used to catch empty spaces between commas i.e.
13604      attribute ((target ("attr1,,attr2"))).  */
13605   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13606
13607   /* Handle multiple target attributes separated by ','.  */
13608   char *token = strtok_r (str_to_check, ",", &str_to_check);
13609
13610   unsigned int num_attrs = 0;
13611   while (token)
13612     {
13613       num_attrs++;
13614       if (!aarch64_process_one_target_attr (token))
13615         {
13616           error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13617           return false;
13618         }
13619
13620       token = strtok_r (NULL, ",", &str_to_check);
13621     }
13622
13623   if (num_attrs != num_commas + 1)
13624     {
13625       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13626       return false;
13627     }
13628
13629   return true;
13630 }
13631
13632 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
13633    process attribute ((target ("..."))).  */
13634
13635 static bool
13636 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13637 {
13638   struct cl_target_option cur_target;
13639   bool ret;
13640   tree old_optimize;
13641   tree new_target, new_optimize;
13642   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13643
13644   /* If what we're processing is the current pragma string then the
13645      target option node is already stored in target_option_current_node
13646      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
13647      having to re-parse the string.  This is especially useful to keep
13648      arm_neon.h compile times down since that header contains a lot
13649      of intrinsics enclosed in pragmas.  */
13650   if (!existing_target && args == current_target_pragma)
13651     {
13652       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13653       return true;
13654     }
13655   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13656
13657   old_optimize = build_optimization_node (&global_options);
13658   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13659
13660   /* If the function changed the optimization levels as well as setting
13661      target options, start with the optimizations specified.  */
13662   if (func_optimize && func_optimize != old_optimize)
13663     cl_optimization_restore (&global_options,
13664                              TREE_OPTIMIZATION (func_optimize));
13665
13666   /* Save the current target options to restore at the end.  */
13667   cl_target_option_save (&cur_target, &global_options);
13668
13669   /* If fndecl already has some target attributes applied to it, unpack
13670      them so that we add this attribute on top of them, rather than
13671      overwriting them.  */
13672   if (existing_target)
13673     {
13674       struct cl_target_option *existing_options
13675         = TREE_TARGET_OPTION (existing_target);
13676
13677       if (existing_options)
13678         cl_target_option_restore (&global_options, existing_options);
13679     }
13680   else
13681     cl_target_option_restore (&global_options,
13682                         TREE_TARGET_OPTION (target_option_current_node));
13683
13684   ret = aarch64_process_target_attr (args);
13685
13686   /* Set up any additional state.  */
13687   if (ret)
13688     {
13689       aarch64_override_options_internal (&global_options);
13690       /* Initialize SIMD builtins if we haven't already.
13691          Set current_target_pragma to NULL for the duration so that
13692          the builtin initialization code doesn't try to tag the functions
13693          being built with the attributes specified by any current pragma, thus
13694          going into an infinite recursion.  */
13695       if (TARGET_SIMD)
13696         {
13697           tree saved_current_target_pragma = current_target_pragma;
13698           current_target_pragma = NULL;
13699           aarch64_init_simd_builtins ();
13700           current_target_pragma = saved_current_target_pragma;
13701         }
13702       new_target = build_target_option_node (&global_options);
13703     }
13704   else
13705     new_target = NULL;
13706
13707   new_optimize = build_optimization_node (&global_options);
13708
13709   if (fndecl && ret)
13710     {
13711       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13712
13713       if (old_optimize != new_optimize)
13714         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13715     }
13716
13717   cl_target_option_restore (&global_options, &cur_target);
13718
13719   if (old_optimize != new_optimize)
13720     cl_optimization_restore (&global_options,
13721                              TREE_OPTIMIZATION (old_optimize));
13722   return ret;
13723 }
13724
13725 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
13726    tri-bool options (yes, no, don't care) and the default value is
13727    DEF, determine whether to reject inlining.  */
13728
13729 static bool
13730 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13731                                      int dont_care, int def)
13732 {
13733   /* If the callee doesn't care, always allow inlining.  */
13734   if (callee == dont_care)
13735     return true;
13736
13737   /* If the caller doesn't care, always allow inlining.  */
13738   if (caller == dont_care)
13739     return true;
13740
13741   /* Otherwise, allow inlining if either the callee and caller values
13742      agree, or if the callee is using the default value.  */
13743   return (callee == caller || callee == def);
13744 }
13745
13746 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
13747    to inline CALLEE into CALLER based on target-specific info.
13748    Make sure that the caller and callee have compatible architectural
13749    features.  Then go through the other possible target attributes
13750    and see if they can block inlining.  Try not to reject always_inline
13751    callees unless they are incompatible architecturally.  */
13752
13753 static bool
13754 aarch64_can_inline_p (tree caller, tree callee)
13755 {
13756   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13757   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13758
13759   struct cl_target_option *caller_opts
13760         = TREE_TARGET_OPTION (caller_tree ? caller_tree
13761                                            : target_option_default_node);
13762
13763   struct cl_target_option *callee_opts
13764         = TREE_TARGET_OPTION (callee_tree ? callee_tree
13765                                            : target_option_default_node);
13766
13767   /* Callee's ISA flags should be a subset of the caller's.  */
13768   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13769        != callee_opts->x_aarch64_isa_flags)
13770     return false;
13771
13772   /* Allow non-strict aligned functions inlining into strict
13773      aligned ones.  */
13774   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13775        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13776       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13777            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13778     return false;
13779
13780   bool always_inline = lookup_attribute ("always_inline",
13781                                           DECL_ATTRIBUTES (callee));
13782
13783   /* If the architectural features match up and the callee is always_inline
13784      then the other attributes don't matter.  */
13785   if (always_inline)
13786     return true;
13787
13788   if (caller_opts->x_aarch64_cmodel_var
13789       != callee_opts->x_aarch64_cmodel_var)
13790     return false;
13791
13792   if (caller_opts->x_aarch64_tls_dialect
13793       != callee_opts->x_aarch64_tls_dialect)
13794     return false;
13795
13796   /* Honour explicit requests to workaround errata.  */
13797   if (!aarch64_tribools_ok_for_inlining_p (
13798           caller_opts->x_aarch64_fix_a53_err835769,
13799           callee_opts->x_aarch64_fix_a53_err835769,
13800           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13801     return false;
13802
13803   if (!aarch64_tribools_ok_for_inlining_p (
13804           caller_opts->x_aarch64_fix_a53_err843419,
13805           callee_opts->x_aarch64_fix_a53_err843419,
13806           2, TARGET_FIX_ERR_A53_843419))
13807     return false;
13808
13809   /* If the user explicitly specified -momit-leaf-frame-pointer for the
13810      caller and calle and they don't match up, reject inlining.  */
13811   if (!aarch64_tribools_ok_for_inlining_p (
13812           caller_opts->x_flag_omit_leaf_frame_pointer,
13813           callee_opts->x_flag_omit_leaf_frame_pointer,
13814           2, 1))
13815     return false;
13816
13817   /* If the callee has specific tuning overrides, respect them.  */
13818   if (callee_opts->x_aarch64_override_tune_string != NULL
13819       && caller_opts->x_aarch64_override_tune_string == NULL)
13820     return false;
13821
13822   /* If the user specified tuning override strings for the
13823      caller and callee and they don't match up, reject inlining.
13824      We just do a string compare here, we don't analyze the meaning
13825      of the string, as it would be too costly for little gain.  */
13826   if (callee_opts->x_aarch64_override_tune_string
13827       && caller_opts->x_aarch64_override_tune_string
13828       && (strcmp (callee_opts->x_aarch64_override_tune_string,
13829                   caller_opts->x_aarch64_override_tune_string) != 0))
13830     return false;
13831
13832   return true;
13833 }
13834
13835 /* Return true if SYMBOL_REF X binds locally.  */
13836
13837 static bool
13838 aarch64_symbol_binds_local_p (const_rtx x)
13839 {
13840   return (SYMBOL_REF_DECL (x)
13841           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13842           : SYMBOL_REF_LOCAL_P (x));
13843 }
13844
13845 /* Return true if SYMBOL_REF X is thread local */
13846 static bool
13847 aarch64_tls_symbol_p (rtx x)
13848 {
13849   if (! TARGET_HAVE_TLS)
13850     return false;
13851
13852   if (GET_CODE (x) != SYMBOL_REF)
13853     return false;
13854
13855   return SYMBOL_REF_TLS_MODEL (x) != 0;
13856 }
13857
13858 /* Classify a TLS symbol into one of the TLS kinds.  */
13859 enum aarch64_symbol_type
13860 aarch64_classify_tls_symbol (rtx x)
13861 {
13862   enum tls_model tls_kind = tls_symbolic_operand_type (x);
13863
13864   switch (tls_kind)
13865     {
13866     case TLS_MODEL_GLOBAL_DYNAMIC:
13867     case TLS_MODEL_LOCAL_DYNAMIC:
13868       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13869
13870     case TLS_MODEL_INITIAL_EXEC:
13871       switch (aarch64_cmodel)
13872         {
13873         case AARCH64_CMODEL_TINY:
13874         case AARCH64_CMODEL_TINY_PIC:
13875           return SYMBOL_TINY_TLSIE;
13876         default:
13877           return SYMBOL_SMALL_TLSIE;
13878         }
13879
13880     case TLS_MODEL_LOCAL_EXEC:
13881       if (aarch64_tls_size == 12)
13882         return SYMBOL_TLSLE12;
13883       else if (aarch64_tls_size == 24)
13884         return SYMBOL_TLSLE24;
13885       else if (aarch64_tls_size == 32)
13886         return SYMBOL_TLSLE32;
13887       else if (aarch64_tls_size == 48)
13888         return SYMBOL_TLSLE48;
13889       else
13890         gcc_unreachable ();
13891
13892     case TLS_MODEL_EMULATED:
13893     case TLS_MODEL_NONE:
13894       return SYMBOL_FORCE_TO_MEM;
13895
13896     default:
13897       gcc_unreachable ();
13898     }
13899 }
13900
13901 /* Return the correct method for accessing X + OFFSET, where X is either
13902    a SYMBOL_REF or LABEL_REF.  */
13903
13904 enum aarch64_symbol_type
13905 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13906 {
13907   if (GET_CODE (x) == LABEL_REF)
13908     {
13909       switch (aarch64_cmodel)
13910         {
13911         case AARCH64_CMODEL_LARGE:
13912           return SYMBOL_FORCE_TO_MEM;
13913
13914         case AARCH64_CMODEL_TINY_PIC:
13915         case AARCH64_CMODEL_TINY:
13916           return SYMBOL_TINY_ABSOLUTE;
13917
13918         case AARCH64_CMODEL_SMALL_SPIC:
13919         case AARCH64_CMODEL_SMALL_PIC:
13920         case AARCH64_CMODEL_SMALL:
13921           return SYMBOL_SMALL_ABSOLUTE;
13922
13923         default:
13924           gcc_unreachable ();
13925         }
13926     }
13927
13928   if (GET_CODE (x) == SYMBOL_REF)
13929     {
13930       if (aarch64_tls_symbol_p (x))
13931         return aarch64_classify_tls_symbol (x);
13932
13933       switch (aarch64_cmodel)
13934         {
13935         case AARCH64_CMODEL_TINY:
13936           /* When we retrieve symbol + offset address, we have to make sure
13937              the offset does not cause overflow of the final address.  But
13938              we have no way of knowing the address of symbol at compile time
13939              so we can't accurately say if the distance between the PC and
13940              symbol + offset is outside the addressible range of +/-1M in the
13941              TINY code model.  So we rely on images not being greater than
13942              1M and cap the offset at 1M and anything beyond 1M will have to
13943              be loaded using an alternative mechanism.  Furthermore if the
13944              symbol is a weak reference to something that isn't known to
13945              resolve to a symbol in this module, then force to memory.  */
13946           if ((SYMBOL_REF_WEAK (x)
13947                && !aarch64_symbol_binds_local_p (x))
13948               || !IN_RANGE (offset, -1048575, 1048575))
13949             return SYMBOL_FORCE_TO_MEM;
13950           return SYMBOL_TINY_ABSOLUTE;
13951
13952         case AARCH64_CMODEL_SMALL:
13953           /* Same reasoning as the tiny code model, but the offset cap here is
13954              4G.  */
13955           if ((SYMBOL_REF_WEAK (x)
13956                && !aarch64_symbol_binds_local_p (x))
13957               || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13958                             HOST_WIDE_INT_C (4294967264)))
13959             return SYMBOL_FORCE_TO_MEM;
13960           return SYMBOL_SMALL_ABSOLUTE;
13961
13962         case AARCH64_CMODEL_TINY_PIC:
13963           if (!aarch64_symbol_binds_local_p (x))
13964             return SYMBOL_TINY_GOT;
13965           return SYMBOL_TINY_ABSOLUTE;
13966
13967         case AARCH64_CMODEL_SMALL_SPIC:
13968         case AARCH64_CMODEL_SMALL_PIC:
13969           if (!aarch64_symbol_binds_local_p (x))
13970             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13971                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13972           return SYMBOL_SMALL_ABSOLUTE;
13973
13974         case AARCH64_CMODEL_LARGE:
13975           /* This is alright even in PIC code as the constant
13976              pool reference is always PC relative and within
13977              the same translation unit.  */
13978           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13979             return SYMBOL_SMALL_ABSOLUTE;
13980           else
13981             return SYMBOL_FORCE_TO_MEM;
13982
13983         default:
13984           gcc_unreachable ();
13985         }
13986     }
13987
13988   /* By default push everything into the constant pool.  */
13989   return SYMBOL_FORCE_TO_MEM;
13990 }
13991
13992 bool
13993 aarch64_constant_address_p (rtx x)
13994 {
13995   return (CONSTANT_P (x) && memory_address_p (DImode, x));
13996 }
13997
13998 bool
13999 aarch64_legitimate_pic_operand_p (rtx x)
14000 {
14001   if (GET_CODE (x) == SYMBOL_REF
14002       || (GET_CODE (x) == CONST
14003           && GET_CODE (XEXP (x, 0)) == PLUS
14004           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14005      return false;
14006
14007   return true;
14008 }
14009
14010 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
14011    that should be rematerialized rather than spilled.  */
14012
14013 static bool
14014 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14015 {
14016   /* Support CSE and rematerialization of common constants.  */
14017   if (CONST_INT_P (x)
14018       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14019       || GET_CODE (x) == CONST_VECTOR)
14020     return true;
14021
14022   /* Do not allow vector struct mode constants for Advanced SIMD.
14023      We could support 0 and -1 easily, but they need support in
14024      aarch64-simd.md.  */
14025   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14026   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14027     return false;
14028
14029   /* Only accept variable-length vector constants if they can be
14030      handled directly.
14031
14032      ??? It would be possible to handle rematerialization of other
14033      constants via secondary reloads.  */
14034   if (vec_flags & VEC_ANY_SVE)
14035     return aarch64_simd_valid_immediate (x, NULL);
14036
14037   if (GET_CODE (x) == HIGH)
14038     x = XEXP (x, 0);
14039
14040   /* Accept polynomial constants that can be calculated by using the
14041      destination of a move as the sole temporary.  Constants that
14042      require a second temporary cannot be rematerialized (they can't be
14043      forced to memory and also aren't legitimate constants).  */
14044   poly_int64 offset;
14045   if (poly_int_rtx_p (x, &offset))
14046     return aarch64_offset_temporaries (false, offset) <= 1;
14047
14048   /* If an offset is being added to something else, we need to allow the
14049      base to be moved into the destination register, meaning that there
14050      are no free temporaries for the offset.  */
14051   x = strip_offset (x, &offset);
14052   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14053     return false;
14054
14055   /* Do not allow const (plus (anchor_symbol, const_int)).  */
14056   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14057     return false;
14058
14059   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
14060      so spilling them is better than rematerialization.  */
14061   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14062     return true;
14063
14064   /* Label references are always constant.  */
14065   if (GET_CODE (x) == LABEL_REF)
14066     return true;
14067
14068   return false;
14069 }
14070
14071 rtx
14072 aarch64_load_tp (rtx target)
14073 {
14074   if (!target
14075       || GET_MODE (target) != Pmode
14076       || !register_operand (target, Pmode))
14077     target = gen_reg_rtx (Pmode);
14078
14079   /* Can return in any reg.  */
14080   emit_insn (gen_aarch64_load_tp_hard (target));
14081   return target;
14082 }
14083
14084 /* On AAPCS systems, this is the "struct __va_list".  */
14085 static GTY(()) tree va_list_type;
14086
14087 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14088    Return the type to use as __builtin_va_list.
14089
14090    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14091
14092    struct __va_list
14093    {
14094      void *__stack;
14095      void *__gr_top;
14096      void *__vr_top;
14097      int   __gr_offs;
14098      int   __vr_offs;
14099    };  */
14100
14101 static tree
14102 aarch64_build_builtin_va_list (void)
14103 {
14104   tree va_list_name;
14105   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14106
14107   /* Create the type.  */
14108   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14109   /* Give it the required name.  */
14110   va_list_name = build_decl (BUILTINS_LOCATION,
14111                              TYPE_DECL,
14112                              get_identifier ("__va_list"),
14113                              va_list_type);
14114   DECL_ARTIFICIAL (va_list_name) = 1;
14115   TYPE_NAME (va_list_type) = va_list_name;
14116   TYPE_STUB_DECL (va_list_type) = va_list_name;
14117
14118   /* Create the fields.  */
14119   f_stack = build_decl (BUILTINS_LOCATION,
14120                         FIELD_DECL, get_identifier ("__stack"),
14121                         ptr_type_node);
14122   f_grtop = build_decl (BUILTINS_LOCATION,
14123                         FIELD_DECL, get_identifier ("__gr_top"),
14124                         ptr_type_node);
14125   f_vrtop = build_decl (BUILTINS_LOCATION,
14126                         FIELD_DECL, get_identifier ("__vr_top"),
14127                         ptr_type_node);
14128   f_groff = build_decl (BUILTINS_LOCATION,
14129                         FIELD_DECL, get_identifier ("__gr_offs"),
14130                         integer_type_node);
14131   f_vroff = build_decl (BUILTINS_LOCATION,
14132                         FIELD_DECL, get_identifier ("__vr_offs"),
14133                         integer_type_node);
14134
14135   /* Tell tree-stdarg pass about our internal offset fields.
14136      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
14137      purpose to identify whether the code is updating va_list internal
14138      offset fields through irregular way.  */
14139   va_list_gpr_counter_field = f_groff;
14140   va_list_fpr_counter_field = f_vroff;
14141
14142   DECL_ARTIFICIAL (f_stack) = 1;
14143   DECL_ARTIFICIAL (f_grtop) = 1;
14144   DECL_ARTIFICIAL (f_vrtop) = 1;
14145   DECL_ARTIFICIAL (f_groff) = 1;
14146   DECL_ARTIFICIAL (f_vroff) = 1;
14147
14148   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14149   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14150   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14151   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14152   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14153
14154   TYPE_FIELDS (va_list_type) = f_stack;
14155   DECL_CHAIN (f_stack) = f_grtop;
14156   DECL_CHAIN (f_grtop) = f_vrtop;
14157   DECL_CHAIN (f_vrtop) = f_groff;
14158   DECL_CHAIN (f_groff) = f_vroff;
14159
14160   /* Compute its layout.  */
14161   layout_type (va_list_type);
14162
14163   return va_list_type;
14164 }
14165
14166 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
14167 static void
14168 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14169 {
14170   const CUMULATIVE_ARGS *cum;
14171   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14172   tree stack, grtop, vrtop, groff, vroff;
14173   tree t;
14174   int gr_save_area_size = cfun->va_list_gpr_size;
14175   int vr_save_area_size = cfun->va_list_fpr_size;
14176   int vr_offset;
14177
14178   cum = &crtl->args.info;
14179   if (cfun->va_list_gpr_size)
14180     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14181                              cfun->va_list_gpr_size);
14182   if (cfun->va_list_fpr_size)
14183     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14184                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
14185
14186   if (!TARGET_FLOAT)
14187     {
14188       gcc_assert (cum->aapcs_nvrn == 0);
14189       vr_save_area_size = 0;
14190     }
14191
14192   f_stack = TYPE_FIELDS (va_list_type_node);
14193   f_grtop = DECL_CHAIN (f_stack);
14194   f_vrtop = DECL_CHAIN (f_grtop);
14195   f_groff = DECL_CHAIN (f_vrtop);
14196   f_vroff = DECL_CHAIN (f_groff);
14197
14198   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14199                   NULL_TREE);
14200   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14201                   NULL_TREE);
14202   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14203                   NULL_TREE);
14204   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14205                   NULL_TREE);
14206   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14207                   NULL_TREE);
14208
14209   /* Emit code to initialize STACK, which points to the next varargs stack
14210      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
14211      by named arguments.  STACK is 8-byte aligned.  */
14212   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14213   if (cum->aapcs_stack_size > 0)
14214     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14215   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14216   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14217
14218   /* Emit code to initialize GRTOP, the top of the GR save area.
14219      virtual_incoming_args_rtx should have been 16 byte aligned.  */
14220   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14221   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14222   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14223
14224   /* Emit code to initialize VRTOP, the top of the VR save area.
14225      This address is gr_save_area_bytes below GRTOP, rounded
14226      down to the next 16-byte boundary.  */
14227   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14228   vr_offset = ROUND_UP (gr_save_area_size,
14229                         STACK_BOUNDARY / BITS_PER_UNIT);
14230
14231   if (vr_offset)
14232     t = fold_build_pointer_plus_hwi (t, -vr_offset);
14233   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14234   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14235
14236   /* Emit code to initialize GROFF, the offset from GRTOP of the
14237      next GPR argument.  */
14238   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14239               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14240   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14241
14242   /* Likewise emit code to initialize VROFF, the offset from FTOP
14243      of the next VR argument.  */
14244   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14245               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14246   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14247 }
14248
14249 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
14250
14251 static tree
14252 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14253                               gimple_seq *post_p ATTRIBUTE_UNUSED)
14254 {
14255   tree addr;
14256   bool indirect_p;
14257   bool is_ha;           /* is HFA or HVA.  */
14258   bool dw_align;        /* double-word align.  */
14259   machine_mode ag_mode = VOIDmode;
14260   int nregs;
14261   machine_mode mode;
14262
14263   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14264   tree stack, f_top, f_off, off, arg, roundup, on_stack;
14265   HOST_WIDE_INT size, rsize, adjust, align;
14266   tree t, u, cond1, cond2;
14267
14268   indirect_p = pass_va_arg_by_reference (type);
14269   if (indirect_p)
14270     type = build_pointer_type (type);
14271
14272   mode = TYPE_MODE (type);
14273
14274   f_stack = TYPE_FIELDS (va_list_type_node);
14275   f_grtop = DECL_CHAIN (f_stack);
14276   f_vrtop = DECL_CHAIN (f_grtop);
14277   f_groff = DECL_CHAIN (f_vrtop);
14278   f_vroff = DECL_CHAIN (f_groff);
14279
14280   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14281                   f_stack, NULL_TREE);
14282   size = int_size_in_bytes (type);
14283
14284   bool abi_break;
14285   align
14286     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14287
14288   dw_align = false;
14289   adjust = 0;
14290   if (aarch64_vfp_is_call_or_return_candidate (mode,
14291                                                type,
14292                                                &ag_mode,
14293                                                &nregs,
14294                                                &is_ha))
14295     {
14296       /* No frontends can create types with variable-sized modes, so we
14297          shouldn't be asked to pass or return them.  */
14298       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14299
14300       /* TYPE passed in fp/simd registers.  */
14301       if (!TARGET_FLOAT)
14302         aarch64_err_no_fpadvsimd (mode);
14303
14304       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14305                       unshare_expr (valist), f_vrtop, NULL_TREE);
14306       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14307                       unshare_expr (valist), f_vroff, NULL_TREE);
14308
14309       rsize = nregs * UNITS_PER_VREG;
14310
14311       if (is_ha)
14312         {
14313           if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14314             adjust = UNITS_PER_VREG - ag_size;
14315         }
14316       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14317                && size < UNITS_PER_VREG)
14318         {
14319           adjust = UNITS_PER_VREG - size;
14320         }
14321     }
14322   else
14323     {
14324       /* TYPE passed in general registers.  */
14325       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14326                       unshare_expr (valist), f_grtop, NULL_TREE);
14327       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14328                       unshare_expr (valist), f_groff, NULL_TREE);
14329       rsize = ROUND_UP (size, UNITS_PER_WORD);
14330       nregs = rsize / UNITS_PER_WORD;
14331
14332       if (align > 8)
14333         {
14334           if (abi_break && warn_psabi)
14335             inform (input_location, "parameter passing for argument of type "
14336                     "%qT changed in GCC 9.1", type);
14337           dw_align = true;
14338         }
14339
14340       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14341           && size < UNITS_PER_WORD)
14342         {
14343           adjust = UNITS_PER_WORD  - size;
14344         }
14345     }
14346
14347   /* Get a local temporary for the field value.  */
14348   off = get_initialized_tmp_var (f_off, pre_p, NULL);
14349
14350   /* Emit code to branch if off >= 0.  */
14351   t = build2 (GE_EXPR, boolean_type_node, off,
14352               build_int_cst (TREE_TYPE (off), 0));
14353   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14354
14355   if (dw_align)
14356     {
14357       /* Emit: offs = (offs + 15) & -16.  */
14358       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14359                   build_int_cst (TREE_TYPE (off), 15));
14360       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14361                   build_int_cst (TREE_TYPE (off), -16));
14362       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14363     }
14364   else
14365     roundup = NULL;
14366
14367   /* Update ap.__[g|v]r_offs  */
14368   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14369               build_int_cst (TREE_TYPE (off), rsize));
14370   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14371
14372   /* String up.  */
14373   if (roundup)
14374     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14375
14376   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
14377   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14378               build_int_cst (TREE_TYPE (f_off), 0));
14379   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14380
14381   /* String up: make sure the assignment happens before the use.  */
14382   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14383   COND_EXPR_ELSE (cond1) = t;
14384
14385   /* Prepare the trees handling the argument that is passed on the stack;
14386      the top level node will store in ON_STACK.  */
14387   arg = get_initialized_tmp_var (stack, pre_p, NULL);
14388   if (align > 8)
14389     {
14390       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
14391       t = fold_build_pointer_plus_hwi (arg, 15);
14392       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14393                   build_int_cst (TREE_TYPE (t), -16));
14394       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14395     }
14396   else
14397     roundup = NULL;
14398   /* Advance ap.__stack  */
14399   t = fold_build_pointer_plus_hwi (arg, size + 7);
14400   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14401               build_int_cst (TREE_TYPE (t), -8));
14402   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14403   /* String up roundup and advance.  */
14404   if (roundup)
14405     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14406   /* String up with arg */
14407   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14408   /* Big-endianness related address adjustment.  */
14409   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14410       && size < UNITS_PER_WORD)
14411   {
14412     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14413                 size_int (UNITS_PER_WORD - size));
14414     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14415   }
14416
14417   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14418   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14419
14420   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
14421   t = off;
14422   if (adjust)
14423     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14424                 build_int_cst (TREE_TYPE (off), adjust));
14425
14426   t = fold_convert (sizetype, t);
14427   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14428
14429   if (is_ha)
14430     {
14431       /* type ha; // treat as "struct {ftype field[n];}"
14432          ... [computing offs]
14433          for (i = 0; i <nregs; ++i, offs += 16)
14434            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14435          return ha;  */
14436       int i;
14437       tree tmp_ha, field_t, field_ptr_t;
14438
14439       /* Declare a local variable.  */
14440       tmp_ha = create_tmp_var_raw (type, "ha");
14441       gimple_add_tmp_var (tmp_ha);
14442
14443       /* Establish the base type.  */
14444       switch (ag_mode)
14445         {
14446         case E_SFmode:
14447           field_t = float_type_node;
14448           field_ptr_t = float_ptr_type_node;
14449           break;
14450         case E_DFmode:
14451           field_t = double_type_node;
14452           field_ptr_t = double_ptr_type_node;
14453           break;
14454         case E_TFmode:
14455           field_t = long_double_type_node;
14456           field_ptr_t = long_double_ptr_type_node;
14457           break;
14458         case E_HFmode:
14459           field_t = aarch64_fp16_type_node;
14460           field_ptr_t = aarch64_fp16_ptr_type_node;
14461           break;
14462         case E_V2SImode:
14463         case E_V4SImode:
14464             {
14465               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14466               field_t = build_vector_type_for_mode (innertype, ag_mode);
14467               field_ptr_t = build_pointer_type (field_t);
14468             }
14469           break;
14470         default:
14471           gcc_assert (0);
14472         }
14473
14474       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
14475       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14476       addr = t;
14477       t = fold_convert (field_ptr_t, addr);
14478       t = build2 (MODIFY_EXPR, field_t,
14479                   build1 (INDIRECT_REF, field_t, tmp_ha),
14480                   build1 (INDIRECT_REF, field_t, t));
14481
14482       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
14483       for (i = 1; i < nregs; ++i)
14484         {
14485           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14486           u = fold_convert (field_ptr_t, addr);
14487           u = build2 (MODIFY_EXPR, field_t,
14488                       build2 (MEM_REF, field_t, tmp_ha,
14489                               build_int_cst (field_ptr_t,
14490                                              (i *
14491                                               int_size_in_bytes (field_t)))),
14492                       build1 (INDIRECT_REF, field_t, u));
14493           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14494         }
14495
14496       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14497       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14498     }
14499
14500   COND_EXPR_ELSE (cond2) = t;
14501   addr = fold_convert (build_pointer_type (type), cond1);
14502   addr = build_va_arg_indirect_ref (addr);
14503
14504   if (indirect_p)
14505     addr = build_va_arg_indirect_ref (addr);
14506
14507   return addr;
14508 }
14509
14510 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
14511
14512 static void
14513 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14514                                 const function_arg_info &arg,
14515                                 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14516 {
14517   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14518   CUMULATIVE_ARGS local_cum;
14519   int gr_saved = cfun->va_list_gpr_size;
14520   int vr_saved = cfun->va_list_fpr_size;
14521
14522   /* The caller has advanced CUM up to, but not beyond, the last named
14523      argument.  Advance a local copy of CUM past the last "real" named
14524      argument, to find out how many registers are left over.  */
14525   local_cum = *cum;
14526   aarch64_function_arg_advance (pack_cumulative_args(&local_cum),
14527                                 arg.mode, arg.type, arg.named);
14528
14529   /* Found out how many registers we need to save.
14530      Honor tree-stdvar analysis results.  */
14531   if (cfun->va_list_gpr_size)
14532     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14533                     cfun->va_list_gpr_size / UNITS_PER_WORD);
14534   if (cfun->va_list_fpr_size)
14535     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14536                     cfun->va_list_fpr_size / UNITS_PER_VREG);
14537
14538   if (!TARGET_FLOAT)
14539     {
14540       gcc_assert (local_cum.aapcs_nvrn == 0);
14541       vr_saved = 0;
14542     }
14543
14544   if (!no_rtl)
14545     {
14546       if (gr_saved > 0)
14547         {
14548           rtx ptr, mem;
14549
14550           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
14551           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14552                                - gr_saved * UNITS_PER_WORD);
14553           mem = gen_frame_mem (BLKmode, ptr);
14554           set_mem_alias_set (mem, get_varargs_alias_set ());
14555
14556           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14557                                mem, gr_saved);
14558         }
14559       if (vr_saved > 0)
14560         {
14561           /* We can't use move_block_from_reg, because it will use
14562              the wrong mode, storing D regs only.  */
14563           machine_mode mode = TImode;
14564           int off, i, vr_start;
14565
14566           /* Set OFF to the offset from virtual_incoming_args_rtx of
14567              the first vector register.  The VR save area lies below
14568              the GR one, and is aligned to 16 bytes.  */
14569           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14570                            STACK_BOUNDARY / BITS_PER_UNIT);
14571           off -= vr_saved * UNITS_PER_VREG;
14572
14573           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14574           for (i = 0; i < vr_saved; ++i)
14575             {
14576               rtx ptr, mem;
14577
14578               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14579               mem = gen_frame_mem (mode, ptr);
14580               set_mem_alias_set (mem, get_varargs_alias_set ());
14581               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14582               off += UNITS_PER_VREG;
14583             }
14584         }
14585     }
14586
14587   /* We don't save the size into *PRETEND_SIZE because we want to avoid
14588      any complication of having crtl->args.pretend_args_size changed.  */
14589   cfun->machine->frame.saved_varargs_size
14590     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14591                  STACK_BOUNDARY / BITS_PER_UNIT)
14592        + vr_saved * UNITS_PER_VREG);
14593 }
14594
14595 static void
14596 aarch64_conditional_register_usage (void)
14597 {
14598   int i;
14599   if (!TARGET_FLOAT)
14600     {
14601       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14602         {
14603           fixed_regs[i] = 1;
14604           call_used_regs[i] = 1;
14605         }
14606     }
14607   if (!TARGET_SVE)
14608     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14609       {
14610         fixed_regs[i] = 1;
14611         call_used_regs[i] = 1;
14612       }
14613
14614   /* When tracking speculation, we need a couple of call-clobbered registers
14615      to track the speculation state.  It would be nice to just use
14616      IP0 and IP1, but currently there are numerous places that just
14617      assume these registers are free for other uses (eg pointer
14618      authentication).  */
14619   if (aarch64_track_speculation)
14620     {
14621       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14622       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14623       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14624       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14625     }
14626 }
14627
14628 /* Walk down the type tree of TYPE counting consecutive base elements.
14629    If *MODEP is VOIDmode, then set it to the first valid floating point
14630    type.  If a non-floating point type is found, or if a floating point
14631    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14632    otherwise return the count in the sub-tree.  */
14633 static int
14634 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14635 {
14636   machine_mode mode;
14637   HOST_WIDE_INT size;
14638
14639   switch (TREE_CODE (type))
14640     {
14641     case REAL_TYPE:
14642       mode = TYPE_MODE (type);
14643       if (mode != DFmode && mode != SFmode
14644           && mode != TFmode && mode != HFmode)
14645         return -1;
14646
14647       if (*modep == VOIDmode)
14648         *modep = mode;
14649
14650       if (*modep == mode)
14651         return 1;
14652
14653       break;
14654
14655     case COMPLEX_TYPE:
14656       mode = TYPE_MODE (TREE_TYPE (type));
14657       if (mode != DFmode && mode != SFmode
14658           && mode != TFmode && mode != HFmode)
14659         return -1;
14660
14661       if (*modep == VOIDmode)
14662         *modep = mode;
14663
14664       if (*modep == mode)
14665         return 2;
14666
14667       break;
14668
14669     case VECTOR_TYPE:
14670       /* Use V2SImode and V4SImode as representatives of all 64-bit
14671          and 128-bit vector types.  */
14672       size = int_size_in_bytes (type);
14673       switch (size)
14674         {
14675         case 8:
14676           mode = V2SImode;
14677           break;
14678         case 16:
14679           mode = V4SImode;
14680           break;
14681         default:
14682           return -1;
14683         }
14684
14685       if (*modep == VOIDmode)
14686         *modep = mode;
14687
14688       /* Vector modes are considered to be opaque: two vectors are
14689          equivalent for the purposes of being homogeneous aggregates
14690          if they are the same size.  */
14691       if (*modep == mode)
14692         return 1;
14693
14694       break;
14695
14696     case ARRAY_TYPE:
14697       {
14698         int count;
14699         tree index = TYPE_DOMAIN (type);
14700
14701         /* Can't handle incomplete types nor sizes that are not
14702            fixed.  */
14703         if (!COMPLETE_TYPE_P (type)
14704             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14705           return -1;
14706
14707         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14708         if (count == -1
14709             || !index
14710             || !TYPE_MAX_VALUE (index)
14711             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14712             || !TYPE_MIN_VALUE (index)
14713             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14714             || count < 0)
14715           return -1;
14716
14717         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14718                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14719
14720         /* There must be no padding.  */
14721         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14722                       count * GET_MODE_BITSIZE (*modep)))
14723           return -1;
14724
14725         return count;
14726       }
14727
14728     case RECORD_TYPE:
14729       {
14730         int count = 0;
14731         int sub_count;
14732         tree field;
14733
14734         /* Can't handle incomplete types nor sizes that are not
14735            fixed.  */
14736         if (!COMPLETE_TYPE_P (type)
14737             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14738           return -1;
14739
14740         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14741           {
14742             if (TREE_CODE (field) != FIELD_DECL)
14743               continue;
14744
14745             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14746             if (sub_count < 0)
14747               return -1;
14748             count += sub_count;
14749           }
14750
14751         /* There must be no padding.  */
14752         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14753                       count * GET_MODE_BITSIZE (*modep)))
14754           return -1;
14755
14756         return count;
14757       }
14758
14759     case UNION_TYPE:
14760     case QUAL_UNION_TYPE:
14761       {
14762         /* These aren't very interesting except in a degenerate case.  */
14763         int count = 0;
14764         int sub_count;
14765         tree field;
14766
14767         /* Can't handle incomplete types nor sizes that are not
14768            fixed.  */
14769         if (!COMPLETE_TYPE_P (type)
14770             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14771           return -1;
14772
14773         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14774           {
14775             if (TREE_CODE (field) != FIELD_DECL)
14776               continue;
14777
14778             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14779             if (sub_count < 0)
14780               return -1;
14781             count = count > sub_count ? count : sub_count;
14782           }
14783
14784         /* There must be no padding.  */
14785         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14786                       count * GET_MODE_BITSIZE (*modep)))
14787           return -1;
14788
14789         return count;
14790       }
14791
14792     default:
14793       break;
14794     }
14795
14796   return -1;
14797 }
14798
14799 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14800    type as described in AAPCS64 \S 4.1.2.
14801
14802    See the comment above aarch64_composite_type_p for the notes on MODE.  */
14803
14804 static bool
14805 aarch64_short_vector_p (const_tree type,
14806                         machine_mode mode)
14807 {
14808   poly_int64 size = -1;
14809
14810   if (type && TREE_CODE (type) == VECTOR_TYPE)
14811     size = int_size_in_bytes (type);
14812   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14813             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14814     size = GET_MODE_SIZE (mode);
14815
14816   return known_eq (size, 8) || known_eq (size, 16);
14817 }
14818
14819 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14820    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
14821    array types.  The C99 floating-point complex types are also considered
14822    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
14823    types, which are GCC extensions and out of the scope of AAPCS64, are
14824    treated as composite types here as well.
14825
14826    Note that MODE itself is not sufficient in determining whether a type
14827    is such a composite type or not.  This is because
14828    stor-layout.c:compute_record_mode may have already changed the MODE
14829    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
14830    structure with only one field may have its MODE set to the mode of the
14831    field.  Also an integer mode whose size matches the size of the
14832    RECORD_TYPE type may be used to substitute the original mode
14833    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
14834    solely relied on.  */
14835
14836 static bool
14837 aarch64_composite_type_p (const_tree type,
14838                           machine_mode mode)
14839 {
14840   if (aarch64_short_vector_p (type, mode))
14841     return false;
14842
14843   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14844     return true;
14845
14846   if (mode == BLKmode
14847       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14848       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14849     return true;
14850
14851   return false;
14852 }
14853
14854 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14855    shall be passed or returned in simd/fp register(s) (providing these
14856    parameter passing registers are available).
14857
14858    Upon successful return, *COUNT returns the number of needed registers,
14859    *BASE_MODE returns the mode of the individual register and when IS_HAF
14860    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14861    floating-point aggregate or a homogeneous short-vector aggregate.  */
14862
14863 static bool
14864 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14865                                          const_tree type,
14866                                          machine_mode *base_mode,
14867                                          int *count,
14868                                          bool *is_ha)
14869 {
14870   machine_mode new_mode = VOIDmode;
14871   bool composite_p = aarch64_composite_type_p (type, mode);
14872
14873   if (is_ha != NULL) *is_ha = false;
14874
14875   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14876       || aarch64_short_vector_p (type, mode))
14877     {
14878       *count = 1;
14879       new_mode = mode;
14880     }
14881   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14882     {
14883       if (is_ha != NULL) *is_ha = true;
14884       *count = 2;
14885       new_mode = GET_MODE_INNER (mode);
14886     }
14887   else if (type && composite_p)
14888     {
14889       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14890
14891       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14892         {
14893           if (is_ha != NULL) *is_ha = true;
14894           *count = ag_count;
14895         }
14896       else
14897         return false;
14898     }
14899   else
14900     return false;
14901
14902   *base_mode = new_mode;
14903   return true;
14904 }
14905
14906 /* Implement TARGET_STRUCT_VALUE_RTX.  */
14907
14908 static rtx
14909 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14910                           int incoming ATTRIBUTE_UNUSED)
14911 {
14912   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14913 }
14914
14915 /* Implements target hook vector_mode_supported_p.  */
14916 static bool
14917 aarch64_vector_mode_supported_p (machine_mode mode)
14918 {
14919   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14920   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14921 }
14922
14923 /* Return the full-width SVE vector mode for element mode MODE, if one
14924    exists.  */
14925 opt_machine_mode
14926 aarch64_full_sve_mode (scalar_mode mode)
14927 {
14928   switch (mode)
14929     {
14930     case E_DFmode:
14931       return VNx2DFmode;
14932     case E_SFmode:
14933       return VNx4SFmode;
14934     case E_HFmode:
14935       return VNx8HFmode;
14936     case E_DImode:
14937         return VNx2DImode;
14938     case E_SImode:
14939       return VNx4SImode;
14940     case E_HImode:
14941       return VNx8HImode;
14942     case E_QImode:
14943       return VNx16QImode;
14944     default:
14945       return opt_machine_mode ();
14946     }
14947 }
14948
14949 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14950    if it exists.  */
14951 opt_machine_mode
14952 aarch64_vq_mode (scalar_mode mode)
14953 {
14954   switch (mode)
14955     {
14956     case E_DFmode:
14957       return V2DFmode;
14958     case E_SFmode:
14959       return V4SFmode;
14960     case E_HFmode:
14961       return V8HFmode;
14962     case E_SImode:
14963       return V4SImode;
14964     case E_HImode:
14965       return V8HImode;
14966     case E_QImode:
14967       return V16QImode;
14968     case E_DImode:
14969       return V2DImode;
14970     default:
14971       return opt_machine_mode ();
14972     }
14973 }
14974
14975 /* Return appropriate SIMD container
14976    for MODE within a vector of WIDTH bits.  */
14977 static machine_mode
14978 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14979 {
14980   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14981     return aarch64_full_sve_mode (mode).else_mode (word_mode);
14982
14983   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14984   if (TARGET_SIMD)
14985     {
14986       if (known_eq (width, 128))
14987         return aarch64_vq_mode (mode).else_mode (word_mode);
14988       else
14989         switch (mode)
14990           {
14991           case E_SFmode:
14992             return V2SFmode;
14993           case E_HFmode:
14994             return V4HFmode;
14995           case E_SImode:
14996             return V2SImode;
14997           case E_HImode:
14998             return V4HImode;
14999           case E_QImode:
15000             return V8QImode;
15001           default:
15002             break;
15003           }
15004     }
15005   return word_mode;
15006 }
15007
15008 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
15009 static machine_mode
15010 aarch64_preferred_simd_mode (scalar_mode mode)
15011 {
15012   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15013   return aarch64_simd_container_mode (mode, bits);
15014 }
15015
15016 /* Return a list of possible vector sizes for the vectorizer
15017    to iterate over.  */
15018 static void
15019 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15020 {
15021   if (TARGET_SVE)
15022     sizes->safe_push (BYTES_PER_SVE_VECTOR);
15023   sizes->safe_push (16);
15024   sizes->safe_push (8);
15025 }
15026
15027 /* Implement TARGET_MANGLE_TYPE.  */
15028
15029 static const char *
15030 aarch64_mangle_type (const_tree type)
15031 {
15032   /* The AArch64 ABI documents say that "__va_list" has to be
15033      mangled as if it is in the "std" namespace.  */
15034   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15035     return "St9__va_list";
15036
15037   /* Half-precision float.  */
15038   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15039     return "Dh";
15040
15041   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
15042      builtin types.  */
15043   if (TYPE_NAME (type) != NULL)
15044     return aarch64_mangle_builtin_type (type);
15045
15046   /* Use the default mangling.  */
15047   return NULL;
15048 }
15049
15050 /* Find the first rtx_insn before insn that will generate an assembly
15051    instruction.  */
15052
15053 static rtx_insn *
15054 aarch64_prev_real_insn (rtx_insn *insn)
15055 {
15056   if (!insn)
15057     return NULL;
15058
15059   do
15060     {
15061       insn = prev_real_insn (insn);
15062     }
15063   while (insn && recog_memoized (insn) < 0);
15064
15065   return insn;
15066 }
15067
15068 static bool
15069 is_madd_op (enum attr_type t1)
15070 {
15071   unsigned int i;
15072   /* A number of these may be AArch32 only.  */
15073   enum attr_type mlatypes[] = {
15074     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15075     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15076     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15077   };
15078
15079   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15080     {
15081       if (t1 == mlatypes[i])
15082         return true;
15083     }
15084
15085   return false;
15086 }
15087
15088 /* Check if there is a register dependency between a load and the insn
15089    for which we hold recog_data.  */
15090
15091 static bool
15092 dep_between_memop_and_curr (rtx memop)
15093 {
15094   rtx load_reg;
15095   int opno;
15096
15097   gcc_assert (GET_CODE (memop) == SET);
15098
15099   if (!REG_P (SET_DEST (memop)))
15100     return false;
15101
15102   load_reg = SET_DEST (memop);
15103   for (opno = 1; opno < recog_data.n_operands; opno++)
15104     {
15105       rtx operand = recog_data.operand[opno];
15106       if (REG_P (operand)
15107           && reg_overlap_mentioned_p (load_reg, operand))
15108         return true;
15109
15110     }
15111   return false;
15112 }
15113
15114
15115 /* When working around the Cortex-A53 erratum 835769,
15116    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15117    instruction and has a preceding memory instruction such that a NOP
15118    should be inserted between them.  */
15119
15120 bool
15121 aarch64_madd_needs_nop (rtx_insn* insn)
15122 {
15123   enum attr_type attr_type;
15124   rtx_insn *prev;
15125   rtx body;
15126
15127   if (!TARGET_FIX_ERR_A53_835769)
15128     return false;
15129
15130   if (!INSN_P (insn) || recog_memoized (insn) < 0)
15131     return false;
15132
15133   attr_type = get_attr_type (insn);
15134   if (!is_madd_op (attr_type))
15135     return false;
15136
15137   prev = aarch64_prev_real_insn (insn);
15138   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15139      Restore recog state to INSN to avoid state corruption.  */
15140   extract_constrain_insn_cached (insn);
15141
15142   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15143     return false;
15144
15145   body = single_set (prev);
15146
15147   /* If the previous insn is a memory op and there is no dependency between
15148      it and the DImode madd, emit a NOP between them.  If body is NULL then we
15149      have a complex memory operation, probably a load/store pair.
15150      Be conservative for now and emit a NOP.  */
15151   if (GET_MODE (recog_data.operand[0]) == DImode
15152       && (!body || !dep_between_memop_and_curr (body)))
15153     return true;
15154
15155   return false;
15156
15157 }
15158
15159
15160 /* Implement FINAL_PRESCAN_INSN.  */
15161
15162 void
15163 aarch64_final_prescan_insn (rtx_insn *insn)
15164 {
15165   if (aarch64_madd_needs_nop (insn))
15166     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15167 }
15168
15169
15170 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15171    instruction.  */
15172
15173 bool
15174 aarch64_sve_index_immediate_p (rtx base_or_step)
15175 {
15176   return (CONST_INT_P (base_or_step)
15177           && IN_RANGE (INTVAL (base_or_step), -16, 15));
15178 }
15179
15180 /* Return true if X is a valid immediate for the SVE ADD and SUB
15181    instructions.  Negate X first if NEGATE_P is true.  */
15182
15183 bool
15184 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15185 {
15186   rtx elt;
15187
15188   if (!const_vec_duplicate_p (x, &elt)
15189       || !CONST_INT_P (elt))
15190     return false;
15191
15192   HOST_WIDE_INT val = INTVAL (elt);
15193   if (negate_p)
15194     val = -val;
15195   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15196
15197   if (val & 0xff)
15198     return IN_RANGE (val, 0, 0xff);
15199   return IN_RANGE (val, 0, 0xff00);
15200 }
15201
15202 /* Return true if X is a valid immediate operand for an SVE logical
15203    instruction such as AND.  */
15204
15205 bool
15206 aarch64_sve_bitmask_immediate_p (rtx x)
15207 {
15208   rtx elt;
15209
15210   return (const_vec_duplicate_p (x, &elt)
15211           && CONST_INT_P (elt)
15212           && aarch64_bitmask_imm (INTVAL (elt),
15213                                   GET_MODE_INNER (GET_MODE (x))));
15214 }
15215
15216 /* Return true if X is a valid immediate for the SVE DUP and CPY
15217    instructions.  */
15218
15219 bool
15220 aarch64_sve_dup_immediate_p (rtx x)
15221 {
15222   x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15223   if (!CONST_INT_P (x))
15224     return false;
15225
15226   HOST_WIDE_INT val = INTVAL (x);
15227   if (val & 0xff)
15228     return IN_RANGE (val, -0x80, 0x7f);
15229   return IN_RANGE (val, -0x8000, 0x7f00);
15230 }
15231
15232 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15233    SIGNED_P says whether the operand is signed rather than unsigned.  */
15234
15235 bool
15236 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15237 {
15238   rtx elt;
15239
15240   return (const_vec_duplicate_p (x, &elt)
15241           && CONST_INT_P (elt)
15242           && (signed_p
15243               ? IN_RANGE (INTVAL (elt), -16, 15)
15244               : IN_RANGE (INTVAL (elt), 0, 127)));
15245 }
15246
15247 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15248    instruction.  Negate X first if NEGATE_P is true.  */
15249
15250 bool
15251 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15252 {
15253   rtx elt;
15254   REAL_VALUE_TYPE r;
15255
15256   if (!const_vec_duplicate_p (x, &elt)
15257       || GET_CODE (elt) != CONST_DOUBLE)
15258     return false;
15259
15260   r = *CONST_DOUBLE_REAL_VALUE (elt);
15261
15262   if (negate_p)
15263     r = real_value_negate (&r);
15264
15265   if (real_equal (&r, &dconst1))
15266     return true;
15267   if (real_equal (&r, &dconsthalf))
15268     return true;
15269   return false;
15270 }
15271
15272 /* Return true if X is a valid immediate operand for an SVE FMUL
15273    instruction.  */
15274
15275 bool
15276 aarch64_sve_float_mul_immediate_p (rtx x)
15277 {
15278   rtx elt;
15279
15280   return (const_vec_duplicate_p (x, &elt)
15281           && GET_CODE (elt) == CONST_DOUBLE
15282           && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15283               || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15284 }
15285
15286 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15287    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
15288    is nonnull, use it to describe valid immediates.  */
15289 static bool
15290 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15291                                     simd_immediate_info *info,
15292                                     enum simd_immediate_check which,
15293                                     simd_immediate_info::insn_type insn)
15294 {
15295   /* Try a 4-byte immediate with LSL.  */
15296   for (unsigned int shift = 0; shift < 32; shift += 8)
15297     if ((val32 & (0xff << shift)) == val32)
15298       {
15299         if (info)
15300           *info = simd_immediate_info (SImode, val32 >> shift, insn,
15301                                        simd_immediate_info::LSL, shift);
15302         return true;
15303       }
15304
15305   /* Try a 2-byte immediate with LSL.  */
15306   unsigned int imm16 = val32 & 0xffff;
15307   if (imm16 == (val32 >> 16))
15308     for (unsigned int shift = 0; shift < 16; shift += 8)
15309       if ((imm16 & (0xff << shift)) == imm16)
15310         {
15311           if (info)
15312             *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15313                                          simd_immediate_info::LSL, shift);
15314           return true;
15315         }
15316
15317   /* Try a 4-byte immediate with MSL, except for cases that MVN
15318      can handle.  */
15319   if (which == AARCH64_CHECK_MOV)
15320     for (unsigned int shift = 8; shift < 24; shift += 8)
15321       {
15322         unsigned int low = (1 << shift) - 1;
15323         if (((val32 & (0xff << shift)) | low) == val32)
15324           {
15325             if (info)
15326               *info = simd_immediate_info (SImode, val32 >> shift, insn,
15327                                            simd_immediate_info::MSL, shift);
15328             return true;
15329           }
15330       }
15331
15332   return false;
15333 }
15334
15335 /* Return true if replicating VAL64 is a valid immediate for the
15336    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
15337    use it to describe valid immediates.  */
15338 static bool
15339 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15340                                  simd_immediate_info *info,
15341                                  enum simd_immediate_check which)
15342 {
15343   unsigned int val32 = val64 & 0xffffffff;
15344   unsigned int val16 = val64 & 0xffff;
15345   unsigned int val8 = val64 & 0xff;
15346
15347   if (val32 == (val64 >> 32))
15348     {
15349       if ((which & AARCH64_CHECK_ORR) != 0
15350           && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15351                                                  simd_immediate_info::MOV))
15352         return true;
15353
15354       if ((which & AARCH64_CHECK_BIC) != 0
15355           && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15356                                                  simd_immediate_info::MVN))
15357         return true;
15358
15359       /* Try using a replicated byte.  */
15360       if (which == AARCH64_CHECK_MOV
15361           && val16 == (val32 >> 16)
15362           && val8 == (val16 >> 8))
15363         {
15364           if (info)
15365             *info = simd_immediate_info (QImode, val8);
15366           return true;
15367         }
15368     }
15369
15370   /* Try using a bit-to-bytemask.  */
15371   if (which == AARCH64_CHECK_MOV)
15372     {
15373       unsigned int i;
15374       for (i = 0; i < 64; i += 8)
15375         {
15376           unsigned char byte = (val64 >> i) & 0xff;
15377           if (byte != 0 && byte != 0xff)
15378             break;
15379         }
15380       if (i == 64)
15381         {
15382           if (info)
15383             *info = simd_immediate_info (DImode, val64);
15384           return true;
15385         }
15386     }
15387   return false;
15388 }
15389
15390 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15391    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
15392
15393 static bool
15394 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15395                              simd_immediate_info *info)
15396 {
15397   scalar_int_mode mode = DImode;
15398   unsigned int val32 = val64 & 0xffffffff;
15399   if (val32 == (val64 >> 32))
15400     {
15401       mode = SImode;
15402       unsigned int val16 = val32 & 0xffff;
15403       if (val16 == (val32 >> 16))
15404         {
15405           mode = HImode;
15406           unsigned int val8 = val16 & 0xff;
15407           if (val8 == (val16 >> 8))
15408             mode = QImode;
15409         }
15410     }
15411   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15412   if (IN_RANGE (val, -0x80, 0x7f))
15413     {
15414       /* DUP with no shift.  */
15415       if (info)
15416         *info = simd_immediate_info (mode, val);
15417       return true;
15418     }
15419   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15420     {
15421       /* DUP with LSL #8.  */
15422       if (info)
15423         *info = simd_immediate_info (mode, val);
15424       return true;
15425     }
15426   if (aarch64_bitmask_imm (val64, mode))
15427     {
15428       /* DUPM.  */
15429       if (info)
15430         *info = simd_immediate_info (mode, val);
15431       return true;
15432     }
15433   return false;
15434 }
15435
15436 /* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
15437    it to describe valid immediates.  */
15438
15439 static bool
15440 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15441 {
15442   if (x == CONST0_RTX (GET_MODE (x)))
15443     {
15444       if (info)
15445         *info = simd_immediate_info (DImode, 0);
15446       return true;
15447     }
15448
15449   /* Analyze the value as a VNx16BImode.  This should be relatively
15450      efficient, since rtx_vector_builder has enough built-in capacity
15451      to store all VLA predicate constants without needing the heap.  */
15452   rtx_vector_builder builder;
15453   if (!aarch64_get_sve_pred_bits (builder, x))
15454     return false;
15455
15456   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15457   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15458     {
15459       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15460       aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15461       if (pattern != AARCH64_NUM_SVPATTERNS)
15462         {
15463           if (info)
15464             {
15465               scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15466               *info = simd_immediate_info (int_mode, pattern);
15467             }
15468           return true;
15469         }
15470     }
15471   return false;
15472 }
15473
15474 /* Return true if OP is a valid SIMD immediate for the operation
15475    described by WHICH.  If INFO is nonnull, use it to describe valid
15476    immediates.  */
15477 bool
15478 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15479                               enum simd_immediate_check which)
15480 {
15481   machine_mode mode = GET_MODE (op);
15482   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15483   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15484     return false;
15485
15486   if (vec_flags & VEC_SVE_PRED)
15487     return aarch64_sve_pred_valid_immediate (op, info);
15488
15489   scalar_mode elt_mode = GET_MODE_INNER (mode);
15490   rtx base, step;
15491   unsigned int n_elts;
15492   if (GET_CODE (op) == CONST_VECTOR
15493       && CONST_VECTOR_DUPLICATE_P (op))
15494     n_elts = CONST_VECTOR_NPATTERNS (op);
15495   else if ((vec_flags & VEC_SVE_DATA)
15496            && const_vec_series_p (op, &base, &step))
15497     {
15498       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15499       if (!aarch64_sve_index_immediate_p (base)
15500           || !aarch64_sve_index_immediate_p (step))
15501         return false;
15502
15503       if (info)
15504         *info = simd_immediate_info (elt_mode, base, step);
15505       return true;
15506     }
15507   else if (GET_CODE (op) == CONST_VECTOR
15508            && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15509     /* N_ELTS set above.  */;
15510   else
15511     return false;
15512
15513   scalar_float_mode elt_float_mode;
15514   if (n_elts == 1
15515       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15516     {
15517       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15518       if (aarch64_float_const_zero_rtx_p (elt)
15519           || aarch64_float_const_representable_p (elt))
15520         {
15521           if (info)
15522             *info = simd_immediate_info (elt_float_mode, elt);
15523           return true;
15524         }
15525     }
15526
15527   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15528   if (elt_size > 8)
15529     return false;
15530
15531   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15532
15533   /* Expand the vector constant out into a byte vector, with the least
15534      significant byte of the register first.  */
15535   auto_vec<unsigned char, 16> bytes;
15536   bytes.reserve (n_elts * elt_size);
15537   for (unsigned int i = 0; i < n_elts; i++)
15538     {
15539       /* The vector is provided in gcc endian-neutral fashion.
15540          For aarch64_be Advanced SIMD, it must be laid out in the vector
15541          register in reverse order.  */
15542       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15543       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15544
15545       if (elt_mode != elt_int_mode)
15546         elt = gen_lowpart (elt_int_mode, elt);
15547
15548       if (!CONST_INT_P (elt))
15549         return false;
15550
15551       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15552       for (unsigned int byte = 0; byte < elt_size; byte++)
15553         {
15554           bytes.quick_push (elt_val & 0xff);
15555           elt_val >>= BITS_PER_UNIT;
15556         }
15557     }
15558
15559   /* The immediate must repeat every eight bytes.  */
15560   unsigned int nbytes = bytes.length ();
15561   for (unsigned i = 8; i < nbytes; ++i)
15562     if (bytes[i] != bytes[i - 8])
15563       return false;
15564
15565   /* Get the repeating 8-byte value as an integer.  No endian correction
15566      is needed here because bytes is already in lsb-first order.  */
15567   unsigned HOST_WIDE_INT val64 = 0;
15568   for (unsigned int i = 0; i < 8; i++)
15569     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15570               << (i * BITS_PER_UNIT));
15571
15572   if (vec_flags & VEC_SVE_DATA)
15573     return aarch64_sve_valid_immediate (val64, info);
15574   else
15575     return aarch64_advsimd_valid_immediate (val64, info, which);
15576 }
15577
15578 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15579    has a step in the range of INDEX.  Return the index expression if so,
15580    otherwise return null.  */
15581 rtx
15582 aarch64_check_zero_based_sve_index_immediate (rtx x)
15583 {
15584   rtx base, step;
15585   if (const_vec_series_p (x, &base, &step)
15586       && base == const0_rtx
15587       && aarch64_sve_index_immediate_p (step))
15588     return step;
15589   return NULL_RTX;
15590 }
15591
15592 /* Check of immediate shift constants are within range.  */
15593 bool
15594 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15595 {
15596   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15597   if (left)
15598     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15599   else
15600     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15601 }
15602
15603 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15604    operation of width WIDTH at bit position POS.  */
15605
15606 rtx
15607 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15608 {
15609   gcc_assert (CONST_INT_P (width));
15610   gcc_assert (CONST_INT_P (pos));
15611
15612   unsigned HOST_WIDE_INT mask
15613     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15614   return GEN_INT (mask << UINTVAL (pos));
15615 }
15616
15617 bool
15618 aarch64_mov_operand_p (rtx x, machine_mode mode)
15619 {
15620   if (GET_CODE (x) == HIGH
15621       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15622     return true;
15623
15624   if (CONST_INT_P (x))
15625     return true;
15626
15627   if (VECTOR_MODE_P (GET_MODE (x)))
15628     {
15629       /* Require predicate constants to be VNx16BI before RA, so that we
15630          force everything to have a canonical form.  */
15631       if (!lra_in_progress
15632           && !reload_completed
15633           && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15634           && GET_MODE (x) != VNx16BImode)
15635         return false;
15636
15637       return aarch64_simd_valid_immediate (x, NULL);
15638     }
15639
15640   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15641     return true;
15642
15643   if (aarch64_sve_cnt_immediate_p (x))
15644     return true;
15645
15646   return aarch64_classify_symbolic_expression (x)
15647     == SYMBOL_TINY_ABSOLUTE;
15648 }
15649
15650 /* Return a const_int vector of VAL.  */
15651 rtx
15652 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15653 {
15654   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15655   return gen_const_vec_duplicate (mode, c);
15656 }
15657
15658 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
15659
15660 bool
15661 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15662 {
15663   machine_mode vmode;
15664
15665   vmode = aarch64_simd_container_mode (mode, 64);
15666   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15667   return aarch64_simd_valid_immediate (op_v, NULL);
15668 }
15669
15670 /* Construct and return a PARALLEL RTX vector with elements numbering the
15671    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15672    the vector - from the perspective of the architecture.  This does not
15673    line up with GCC's perspective on lane numbers, so we end up with
15674    different masks depending on our target endian-ness.  The diagram
15675    below may help.  We must draw the distinction when building masks
15676    which select one half of the vector.  An instruction selecting
15677    architectural low-lanes for a big-endian target, must be described using
15678    a mask selecting GCC high-lanes.
15679
15680                  Big-Endian             Little-Endian
15681
15682 GCC             0   1   2   3           3   2   1   0
15683               | x | x | x | x |       | x | x | x | x |
15684 Architecture    3   2   1   0           3   2   1   0
15685
15686 Low Mask:         { 2, 3 }                { 0, 1 }
15687 High Mask:        { 0, 1 }                { 2, 3 }
15688
15689    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
15690
15691 rtx
15692 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15693 {
15694   rtvec v = rtvec_alloc (nunits / 2);
15695   int high_base = nunits / 2;
15696   int low_base = 0;
15697   int base;
15698   rtx t1;
15699   int i;
15700
15701   if (BYTES_BIG_ENDIAN)
15702     base = high ? low_base : high_base;
15703   else
15704     base = high ? high_base : low_base;
15705
15706   for (i = 0; i < nunits / 2; i++)
15707     RTVEC_ELT (v, i) = GEN_INT (base + i);
15708
15709   t1 = gen_rtx_PARALLEL (mode, v);
15710   return t1;
15711 }
15712
15713 /* Check OP for validity as a PARALLEL RTX vector with elements
15714    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15715    from the perspective of the architecture.  See the diagram above
15716    aarch64_simd_vect_par_cnst_half for more details.  */
15717
15718 bool
15719 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15720                                        bool high)
15721 {
15722   int nelts;
15723   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15724     return false;
15725
15726   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15727   HOST_WIDE_INT count_op = XVECLEN (op, 0);
15728   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15729   int i = 0;
15730
15731   if (count_op != count_ideal)
15732     return false;
15733
15734   for (i = 0; i < count_ideal; i++)
15735     {
15736       rtx elt_op = XVECEXP (op, 0, i);
15737       rtx elt_ideal = XVECEXP (ideal, 0, i);
15738
15739       if (!CONST_INT_P (elt_op)
15740           || INTVAL (elt_ideal) != INTVAL (elt_op))
15741         return false;
15742     }
15743   return true;
15744 }
15745
15746 /* Return a PARALLEL containing NELTS elements, with element I equal
15747    to BASE + I * STEP.  */
15748
15749 rtx
15750 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15751 {
15752   rtvec vec = rtvec_alloc (nelts);
15753   for (unsigned int i = 0; i < nelts; ++i)
15754     RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15755   return gen_rtx_PARALLEL (VOIDmode, vec);
15756 }
15757
15758 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15759    series with step STEP.  */
15760
15761 bool
15762 aarch64_stepped_int_parallel_p (rtx op, int step)
15763 {
15764   if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15765     return false;
15766
15767   unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15768   for (int i = 1; i < XVECLEN (op, 0); ++i)
15769     if (!CONST_INT_P (XVECEXP (op, 0, i))
15770         || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15771       return false;
15772
15773   return true;
15774 }
15775
15776 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
15777    HIGH (exclusive).  */
15778 void
15779 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15780                           const_tree exp)
15781 {
15782   HOST_WIDE_INT lane;
15783   gcc_assert (CONST_INT_P (operand));
15784   lane = INTVAL (operand);
15785
15786   if (lane < low || lane >= high)
15787   {
15788     if (exp)
15789       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15790     else
15791       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15792   }
15793 }
15794
15795 /* Peform endian correction on lane number N, which indexes a vector
15796    of mode MODE, and return the result as an SImode rtx.  */
15797
15798 rtx
15799 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15800 {
15801   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15802 }
15803
15804 /* Return TRUE if OP is a valid vector addressing mode.  */
15805
15806 bool
15807 aarch64_simd_mem_operand_p (rtx op)
15808 {
15809   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15810                         || REG_P (XEXP (op, 0)));
15811 }
15812
15813 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
15814
15815 bool
15816 aarch64_sve_ld1r_operand_p (rtx op)
15817 {
15818   struct aarch64_address_info addr;
15819   scalar_mode mode;
15820
15821   return (MEM_P (op)
15822           && is_a <scalar_mode> (GET_MODE (op), &mode)
15823           && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15824           && addr.type == ADDRESS_REG_IMM
15825           && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15826 }
15827
15828 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
15829 bool
15830 aarch64_sve_ld1rq_operand_p (rtx op)
15831 {
15832   struct aarch64_address_info addr;
15833   scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15834   if (!MEM_P (op)
15835       || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15836     return false;
15837
15838   if (addr.type == ADDRESS_REG_IMM)
15839     return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15840
15841   if (addr.type == ADDRESS_REG_REG)
15842     return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15843
15844   return false;
15845 }
15846
15847 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15848    The conditions for STR are the same.  */
15849 bool
15850 aarch64_sve_ldr_operand_p (rtx op)
15851 {
15852   struct aarch64_address_info addr;
15853
15854   return (MEM_P (op)
15855           && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15856                                        false, ADDR_QUERY_ANY)
15857           && addr.type == ADDRESS_REG_IMM);
15858 }
15859
15860 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15861    We need to be able to access the individual pieces, so the range
15862    is different from LD[234] and ST[234].  */
15863 bool
15864 aarch64_sve_struct_memory_operand_p (rtx op)
15865 {
15866   if (!MEM_P (op))
15867     return false;
15868
15869   machine_mode mode = GET_MODE (op);
15870   struct aarch64_address_info addr;
15871   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15872                                  ADDR_QUERY_ANY)
15873       || addr.type != ADDRESS_REG_IMM)
15874     return false;
15875
15876   poly_int64 first = addr.const_offset;
15877   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15878   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15879           && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15880 }
15881
15882 /* Emit a register copy from operand to operand, taking care not to
15883    early-clobber source registers in the process.
15884
15885    COUNT is the number of components into which the copy needs to be
15886    decomposed.  */
15887 void
15888 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15889                                 unsigned int count)
15890 {
15891   unsigned int i;
15892   int rdest = REGNO (operands[0]);
15893   int rsrc = REGNO (operands[1]);
15894
15895   if (!reg_overlap_mentioned_p (operands[0], operands[1])
15896       || rdest < rsrc)
15897     for (i = 0; i < count; i++)
15898       emit_move_insn (gen_rtx_REG (mode, rdest + i),
15899                       gen_rtx_REG (mode, rsrc + i));
15900   else
15901     for (i = 0; i < count; i++)
15902       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15903                       gen_rtx_REG (mode, rsrc + count - i - 1));
15904 }
15905
15906 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15907    one of VSTRUCT modes: OI, CI, or XI.  */
15908 int
15909 aarch64_simd_attr_length_rglist (machine_mode mode)
15910 {
15911   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
15912   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15913 }
15914
15915 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
15916    alignment of a vector to 128 bits.  SVE predicates have an alignment of
15917    16 bits.  */
15918 static HOST_WIDE_INT
15919 aarch64_simd_vector_alignment (const_tree type)
15920 {
15921   /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15922      be set for non-predicate vectors of booleans.  Modes are the most
15923      direct way we have of identifying real SVE predicate types.  */
15924   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
15925     return 16;
15926   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15927     return 128;
15928   return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15929 }
15930
15931 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
15932 static poly_uint64
15933 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15934 {
15935   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15936     {
15937       /* If the length of the vector is fixed, try to align to that length,
15938          otherwise don't try to align at all.  */
15939       HOST_WIDE_INT result;
15940       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15941         result = TYPE_ALIGN (TREE_TYPE (type));
15942       return result;
15943     }
15944   return TYPE_ALIGN (type);
15945 }
15946
15947 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
15948 static bool
15949 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15950 {
15951   if (is_packed)
15952     return false;
15953
15954   /* For fixed-length vectors, check that the vectorizer will aim for
15955      full-vector alignment.  This isn't true for generic GCC vectors
15956      that are wider than the ABI maximum of 128 bits.  */
15957   poly_uint64 preferred_alignment =
15958     aarch64_vectorize_preferred_vector_alignment (type);
15959   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15960       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15961                    preferred_alignment))
15962     return false;
15963
15964   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
15965   return true;
15966 }
15967
15968 /* Return true if the vector misalignment factor is supported by the
15969    target.  */
15970 static bool
15971 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15972                                              const_tree type, int misalignment,
15973                                              bool is_packed)
15974 {
15975   if (TARGET_SIMD && STRICT_ALIGNMENT)
15976     {
15977       /* Return if movmisalign pattern is not supported for this mode.  */
15978       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15979         return false;
15980
15981       /* Misalignment factor is unknown at compile time.  */
15982       if (misalignment == -1)
15983         return false;
15984     }
15985   return default_builtin_support_vector_misalignment (mode, type, misalignment,
15986                                                       is_packed);
15987 }
15988
15989 /* If VALS is a vector constant that can be loaded into a register
15990    using DUP, generate instructions to do so and return an RTX to
15991    assign to the register.  Otherwise return NULL_RTX.  */
15992 static rtx
15993 aarch64_simd_dup_constant (rtx vals)
15994 {
15995   machine_mode mode = GET_MODE (vals);
15996   machine_mode inner_mode = GET_MODE_INNER (mode);
15997   rtx x;
15998
15999   if (!const_vec_duplicate_p (vals, &x))
16000     return NULL_RTX;
16001
16002   /* We can load this constant by using DUP and a constant in a
16003      single ARM register.  This will be cheaper than a vector
16004      load.  */
16005   x = copy_to_mode_reg (inner_mode, x);
16006   return gen_vec_duplicate (mode, x);
16007 }
16008
16009
16010 /* Generate code to load VALS, which is a PARALLEL containing only
16011    constants (for vec_init) or CONST_VECTOR, efficiently into a
16012    register.  Returns an RTX to copy into the register, or NULL_RTX
16013    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
16014 static rtx
16015 aarch64_simd_make_constant (rtx vals)
16016 {
16017   machine_mode mode = GET_MODE (vals);
16018   rtx const_dup;
16019   rtx const_vec = NULL_RTX;
16020   int n_const = 0;
16021   int i;
16022
16023   if (GET_CODE (vals) == CONST_VECTOR)
16024     const_vec = vals;
16025   else if (GET_CODE (vals) == PARALLEL)
16026     {
16027       /* A CONST_VECTOR must contain only CONST_INTs and
16028          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16029          Only store valid constants in a CONST_VECTOR.  */
16030       int n_elts = XVECLEN (vals, 0);
16031       for (i = 0; i < n_elts; ++i)
16032         {
16033           rtx x = XVECEXP (vals, 0, i);
16034           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16035             n_const++;
16036         }
16037       if (n_const == n_elts)
16038         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16039     }
16040   else
16041     gcc_unreachable ();
16042
16043   if (const_vec != NULL_RTX
16044       && aarch64_simd_valid_immediate (const_vec, NULL))
16045     /* Load using MOVI/MVNI.  */
16046     return const_vec;
16047   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16048     /* Loaded using DUP.  */
16049     return const_dup;
16050   else if (const_vec != NULL_RTX)
16051     /* Load from constant pool. We cannot take advantage of single-cycle
16052        LD1 because we need a PC-relative addressing mode.  */
16053     return const_vec;
16054   else
16055     /* A PARALLEL containing something not valid inside CONST_VECTOR.
16056        We cannot construct an initializer.  */
16057     return NULL_RTX;
16058 }
16059
16060 /* Expand a vector initialisation sequence, such that TARGET is
16061    initialised to contain VALS.  */
16062
16063 void
16064 aarch64_expand_vector_init (rtx target, rtx vals)
16065 {
16066   machine_mode mode = GET_MODE (target);
16067   scalar_mode inner_mode = GET_MODE_INNER (mode);
16068   /* The number of vector elements.  */
16069   int n_elts = XVECLEN (vals, 0);
16070   /* The number of vector elements which are not constant.  */
16071   int n_var = 0;
16072   rtx any_const = NULL_RTX;
16073   /* The first element of vals.  */
16074   rtx v0 = XVECEXP (vals, 0, 0);
16075   bool all_same = true;
16076
16077   /* This is a special vec_init<M><N> where N is not an element mode but a
16078      vector mode with half the elements of M.  We expect to find two entries
16079      of mode N in VALS and we must put their concatentation into TARGET.  */
16080   if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16081     {
16082       gcc_assert (known_eq (GET_MODE_SIZE (mode),
16083                   2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16084       rtx lo = XVECEXP (vals, 0, 0);
16085       rtx hi = XVECEXP (vals, 0, 1);
16086       machine_mode narrow_mode = GET_MODE (lo);
16087       gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16088       gcc_assert (narrow_mode == GET_MODE (hi));
16089
16090       /* When we want to concatenate a half-width vector with zeroes we can
16091          use the aarch64_combinez[_be] patterns.  Just make sure that the
16092          zeroes are in the right half.  */
16093       if (BYTES_BIG_ENDIAN
16094           && aarch64_simd_imm_zero (lo, narrow_mode)
16095           && general_operand (hi, narrow_mode))
16096         emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16097       else if (!BYTES_BIG_ENDIAN
16098                && aarch64_simd_imm_zero (hi, narrow_mode)
16099                && general_operand (lo, narrow_mode))
16100         emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16101       else
16102         {
16103           /* Else create the two half-width registers and combine them.  */
16104           if (!REG_P (lo))
16105             lo = force_reg (GET_MODE (lo), lo);
16106           if (!REG_P (hi))
16107             hi = force_reg (GET_MODE (hi), hi);
16108
16109           if (BYTES_BIG_ENDIAN)
16110             std::swap (lo, hi);
16111           emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16112         }
16113      return;
16114    }
16115
16116   /* Count the number of variable elements to initialise.  */
16117   for (int i = 0; i < n_elts; ++i)
16118     {
16119       rtx x = XVECEXP (vals, 0, i);
16120       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16121         ++n_var;
16122       else
16123         any_const = x;
16124
16125       all_same &= rtx_equal_p (x, v0);
16126     }
16127
16128   /* No variable elements, hand off to aarch64_simd_make_constant which knows
16129      how best to handle this.  */
16130   if (n_var == 0)
16131     {
16132       rtx constant = aarch64_simd_make_constant (vals);
16133       if (constant != NULL_RTX)
16134         {
16135           emit_move_insn (target, constant);
16136           return;
16137         }
16138     }
16139
16140   /* Splat a single non-constant element if we can.  */
16141   if (all_same)
16142     {
16143       rtx x = copy_to_mode_reg (inner_mode, v0);
16144       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16145       return;
16146     }
16147
16148   enum insn_code icode = optab_handler (vec_set_optab, mode);
16149   gcc_assert (icode != CODE_FOR_nothing);
16150
16151   /* If there are only variable elements, try to optimize
16152      the insertion using dup for the most common element
16153      followed by insertions.  */
16154
16155   /* The algorithm will fill matches[*][0] with the earliest matching element,
16156      and matches[X][1] with the count of duplicate elements (if X is the
16157      earliest element which has duplicates).  */
16158
16159   if (n_var == n_elts && n_elts <= 16)
16160     {
16161       int matches[16][2] = {0};
16162       for (int i = 0; i < n_elts; i++)
16163         {
16164           for (int j = 0; j <= i; j++)
16165             {
16166               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16167                 {
16168                   matches[i][0] = j;
16169                   matches[j][1]++;
16170                   break;
16171                 }
16172             }
16173         }
16174       int maxelement = 0;
16175       int maxv = 0;
16176       for (int i = 0; i < n_elts; i++)
16177         if (matches[i][1] > maxv)
16178           {
16179             maxelement = i;
16180             maxv = matches[i][1];
16181           }
16182
16183       /* Create a duplicate of the most common element, unless all elements
16184          are equally useless to us, in which case just immediately set the
16185          vector register using the first element.  */
16186
16187       if (maxv == 1)
16188         {
16189           /* For vectors of two 64-bit elements, we can do even better.  */
16190           if (n_elts == 2
16191               && (inner_mode == E_DImode
16192                   || inner_mode == E_DFmode))
16193
16194             {
16195               rtx x0 = XVECEXP (vals, 0, 0);
16196               rtx x1 = XVECEXP (vals, 0, 1);
16197               /* Combine can pick up this case, but handling it directly
16198                  here leaves clearer RTL.
16199
16200                  This is load_pair_lanes<mode>, and also gives us a clean-up
16201                  for store_pair_lanes<mode>.  */
16202               if (memory_operand (x0, inner_mode)
16203                   && memory_operand (x1, inner_mode)
16204                   && !STRICT_ALIGNMENT
16205                   && rtx_equal_p (XEXP (x1, 0),
16206                                   plus_constant (Pmode,
16207                                                  XEXP (x0, 0),
16208                                                  GET_MODE_SIZE (inner_mode))))
16209                 {
16210                   rtx t;
16211                   if (inner_mode == DFmode)
16212                     t = gen_load_pair_lanesdf (target, x0, x1);
16213                   else
16214                     t = gen_load_pair_lanesdi (target, x0, x1);
16215                   emit_insn (t);
16216                   return;
16217                 }
16218             }
16219           /* The subreg-move sequence below will move into lane zero of the
16220              vector register.  For big-endian we want that position to hold
16221              the last element of VALS.  */
16222           maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16223           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16224           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16225         }
16226       else
16227         {
16228           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16229           aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16230         }
16231
16232       /* Insert the rest.  */
16233       for (int i = 0; i < n_elts; i++)
16234         {
16235           rtx x = XVECEXP (vals, 0, i);
16236           if (matches[i][0] == maxelement)
16237             continue;
16238           x = copy_to_mode_reg (inner_mode, x);
16239           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16240         }
16241       return;
16242     }
16243
16244   /* Initialise a vector which is part-variable.  We want to first try
16245      to build those lanes which are constant in the most efficient way we
16246      can.  */
16247   if (n_var != n_elts)
16248     {
16249       rtx copy = copy_rtx (vals);
16250
16251       /* Load constant part of vector.  We really don't care what goes into the
16252          parts we will overwrite, but we're more likely to be able to load the
16253          constant efficiently if it has fewer, larger, repeating parts
16254          (see aarch64_simd_valid_immediate).  */
16255       for (int i = 0; i < n_elts; i++)
16256         {
16257           rtx x = XVECEXP (vals, 0, i);
16258           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16259             continue;
16260           rtx subst = any_const;
16261           for (int bit = n_elts / 2; bit > 0; bit /= 2)
16262             {
16263               /* Look in the copied vector, as more elements are const.  */
16264               rtx test = XVECEXP (copy, 0, i ^ bit);
16265               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16266                 {
16267                   subst = test;
16268                   break;
16269                 }
16270             }
16271           XVECEXP (copy, 0, i) = subst;
16272         }
16273       aarch64_expand_vector_init (target, copy);
16274     }
16275
16276   /* Insert the variable lanes directly.  */
16277   for (int i = 0; i < n_elts; i++)
16278     {
16279       rtx x = XVECEXP (vals, 0, i);
16280       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16281         continue;
16282       x = copy_to_mode_reg (inner_mode, x);
16283       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16284     }
16285 }
16286
16287 /* Emit RTL corresponding to:
16288    insr TARGET, ELEM.  */
16289
16290 static void
16291 emit_insr (rtx target, rtx elem)
16292 {
16293   machine_mode mode = GET_MODE (target);
16294   scalar_mode elem_mode = GET_MODE_INNER (mode);
16295   elem = force_reg (elem_mode, elem);
16296
16297   insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16298   gcc_assert (icode != CODE_FOR_nothing);
16299   emit_insn (GEN_FCN (icode) (target, target, elem));
16300 }
16301
16302 /* Subroutine of aarch64_sve_expand_vector_init for handling
16303    trailing constants.
16304    This function works as follows:
16305    (a) Create a new vector consisting of trailing constants.
16306    (b) Initialize TARGET with the constant vector using emit_move_insn.
16307    (c) Insert remaining elements in TARGET using insr.
16308    NELTS is the total number of elements in original vector while
16309    while NELTS_REQD is the number of elements that are actually
16310    significant.
16311
16312    ??? The heuristic used is to do above only if number of constants
16313    is at least half the total number of elements.  May need fine tuning.  */
16314
16315 static bool
16316 aarch64_sve_expand_vector_init_handle_trailing_constants
16317  (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16318 {
16319   machine_mode mode = GET_MODE (target);
16320   scalar_mode elem_mode = GET_MODE_INNER (mode);
16321   int n_trailing_constants = 0;
16322
16323   for (int i = nelts_reqd - 1;
16324        i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16325        i--)
16326     n_trailing_constants++;
16327
16328   if (n_trailing_constants >= nelts_reqd / 2)
16329     {
16330       rtx_vector_builder v (mode, 1, nelts);
16331       for (int i = 0; i < nelts; i++)
16332         v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16333       rtx const_vec = v.build ();
16334       emit_move_insn (target, const_vec);
16335
16336       for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16337         emit_insr (target, builder.elt (i));
16338
16339       return true;
16340     }
16341
16342   return false;
16343 }
16344
16345 /* Subroutine of aarch64_sve_expand_vector_init.
16346    Works as follows:
16347    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16348    (b) Skip trailing elements from BUILDER, which are the same as
16349        element NELTS_REQD - 1.
16350    (c) Insert earlier elements in reverse order in TARGET using insr.  */
16351
16352 static void
16353 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16354                                              const rtx_vector_builder &builder,
16355                                              int nelts_reqd)
16356 {
16357   machine_mode mode = GET_MODE (target);
16358   scalar_mode elem_mode = GET_MODE_INNER (mode);
16359
16360   struct expand_operand ops[2];
16361   enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16362   gcc_assert (icode != CODE_FOR_nothing);
16363
16364   create_output_operand (&ops[0], target, mode);
16365   create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16366   expand_insn (icode, 2, ops);
16367
16368   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16369   for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16370     emit_insr (target, builder.elt (i));
16371 }
16372
16373 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16374    when all trailing elements of builder are same.
16375    This works as follows:
16376    (a) Use expand_insn interface to broadcast last vector element in TARGET.
16377    (b) Insert remaining elements in TARGET using insr.
16378
16379    ??? The heuristic used is to do above if number of same trailing elements
16380    is at least 3/4 of total number of elements, loosely based on
16381    heuristic from mostly_zeros_p.  May need fine-tuning.  */
16382
16383 static bool
16384 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16385  (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16386 {
16387   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16388   if (ndups >= (3 * nelts_reqd) / 4)
16389     {
16390       aarch64_sve_expand_vector_init_insert_elems (target, builder,
16391                                                    nelts_reqd - ndups + 1);
16392       return true;
16393     }
16394
16395   return false;
16396 }
16397
16398 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16399    of elements in BUILDER.
16400
16401    The function tries to initialize TARGET from BUILDER if it fits one
16402    of the special cases outlined below.
16403
16404    Failing that, the function divides BUILDER into two sub-vectors:
16405    v_even = even elements of BUILDER;
16406    v_odd = odd elements of BUILDER;
16407
16408    and recursively calls itself with v_even and v_odd.
16409
16410    if (recursive call succeeded for v_even or v_odd)
16411      TARGET = zip (v_even, v_odd)
16412
16413    The function returns true if it managed to build TARGET from BUILDER
16414    with one of the special cases, false otherwise.
16415
16416    Example: {a, 1, b, 2, c, 3, d, 4}
16417
16418    The vector gets divided into:
16419    v_even = {a, b, c, d}
16420    v_odd = {1, 2, 3, 4}
16421
16422    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16423    initialize tmp2 from constant vector v_odd using emit_move_insn.
16424
16425    aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16426    4 elements, so we construct tmp1 from v_even using insr:
16427    tmp1 = dup(d)
16428    insr tmp1, c
16429    insr tmp1, b
16430    insr tmp1, a
16431
16432    And finally:
16433    TARGET = zip (tmp1, tmp2)
16434    which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
16435
16436 static bool
16437 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16438                                 int nelts, int nelts_reqd)
16439 {
16440   machine_mode mode = GET_MODE (target);
16441
16442   /* Case 1: Vector contains trailing constants.  */
16443
16444   if (aarch64_sve_expand_vector_init_handle_trailing_constants
16445        (target, builder, nelts, nelts_reqd))
16446     return true;
16447
16448   /* Case 2: Vector contains leading constants.  */
16449
16450   rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16451   for (int i = 0; i < nelts_reqd; i++)
16452     rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16453   rev_builder.finalize ();
16454
16455   if (aarch64_sve_expand_vector_init_handle_trailing_constants
16456        (target, rev_builder, nelts, nelts_reqd))
16457     {
16458       emit_insn (gen_aarch64_sve_rev (mode, target, target));
16459       return true;
16460     }
16461
16462   /* Case 3: Vector contains trailing same element.  */
16463
16464   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16465        (target, builder, nelts_reqd))
16466     return true;
16467
16468   /* Case 4: Vector contains leading same element.  */
16469
16470   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16471        (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16472     {
16473       emit_insn (gen_aarch64_sve_rev (mode, target, target));
16474       return true;
16475     }
16476
16477   /* Avoid recursing below 4-elements.
16478      ??? The threshold 4 may need fine-tuning.  */
16479
16480   if (nelts_reqd <= 4)
16481     return false;
16482
16483   rtx_vector_builder v_even (mode, 1, nelts);
16484   rtx_vector_builder v_odd (mode, 1, nelts);
16485
16486   for (int i = 0; i < nelts * 2; i += 2)
16487     {
16488       v_even.quick_push (builder.elt (i));
16489       v_odd.quick_push (builder.elt (i + 1));
16490     }
16491
16492   v_even.finalize ();
16493   v_odd.finalize ();
16494
16495   rtx tmp1 = gen_reg_rtx (mode);
16496   bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16497                                                     nelts, nelts_reqd / 2);
16498
16499   rtx tmp2 = gen_reg_rtx (mode);
16500   bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16501                                                    nelts, nelts_reqd / 2);
16502
16503   if (!did_even_p && !did_odd_p)
16504     return false;
16505
16506   /* Initialize v_even and v_odd using INSR if it didn't match any of the
16507      special cases and zip v_even, v_odd.  */
16508
16509   if (!did_even_p)
16510     aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16511
16512   if (!did_odd_p)
16513     aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16514
16515   rtvec v = gen_rtvec (2, tmp1, tmp2);
16516   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16517   return true;
16518 }
16519
16520 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
16521
16522 void
16523 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16524 {
16525   machine_mode mode = GET_MODE (target);
16526   int nelts = XVECLEN (vals, 0);
16527
16528   rtx_vector_builder v (mode, 1, nelts);
16529   for (int i = 0; i < nelts; i++)
16530     v.quick_push (XVECEXP (vals, 0, i));
16531   v.finalize ();
16532
16533   /* If neither sub-vectors of v could be initialized specially,
16534      then use INSR to insert all elements from v into TARGET.
16535      ??? This might not be optimal for vectors with large
16536      initializers like 16-element or above.
16537      For nelts < 4, it probably isn't useful to handle specially.  */
16538
16539   if (nelts < 4
16540       || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16541     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16542 }
16543
16544 /* Check whether VALUE is a vector constant in which every element
16545    is either a power of 2 or a negated power of 2.  If so, return
16546    a constant vector of log2s, and flip CODE between PLUS and MINUS
16547    if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
16548
16549 static rtx
16550 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16551 {
16552   if (GET_CODE (value) != CONST_VECTOR)
16553     return NULL_RTX;
16554
16555   rtx_vector_builder builder;
16556   if (!builder.new_unary_operation (GET_MODE (value), value, false))
16557     return NULL_RTX;
16558
16559   scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16560   /* 1 if the result of the multiplication must be negated,
16561      0 if it mustn't, or -1 if we don't yet care.  */
16562   int negate = -1;
16563   unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16564   for (unsigned int i = 0; i < encoded_nelts; ++i)
16565     {
16566       rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16567       if (!CONST_SCALAR_INT_P (elt))
16568         return NULL_RTX;
16569       rtx_mode_t val (elt, int_mode);
16570       wide_int pow2 = wi::neg (val);
16571       if (val != pow2)
16572         {
16573           /* It matters whether we negate or not.  Make that choice,
16574              and make sure that it's consistent with previous elements.  */
16575           if (negate == !wi::neg_p (val))
16576             return NULL_RTX;
16577           negate = wi::neg_p (val);
16578           if (!negate)
16579             pow2 = val;
16580         }
16581       /* POW2 is now the value that we want to be a power of 2.  */
16582       int shift = wi::exact_log2 (pow2);
16583       if (shift < 0)
16584         return NULL_RTX;
16585       builder.quick_push (gen_int_mode (shift, int_mode));
16586     }
16587   if (negate == -1)
16588     /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
16589     code = PLUS;
16590   else if (negate == 1)
16591     code = code == PLUS ? MINUS : PLUS;
16592   return builder.build ();
16593 }
16594
16595 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16596    CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
16597    operands array, in the same order as for fma_optab.  Return true if
16598    the function emitted all the necessary instructions, false if the caller
16599    should generate the pattern normally with the new OPERANDS array.  */
16600
16601 bool
16602 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16603 {
16604   machine_mode mode = GET_MODE (operands[0]);
16605   if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16606     {
16607       rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16608                                   NULL_RTX, true, OPTAB_DIRECT);
16609       force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16610                           operands[3], product, operands[0], true,
16611                           OPTAB_DIRECT);
16612       return true;
16613     }
16614   operands[2] = force_reg (mode, operands[2]);
16615   return false;
16616 }
16617
16618 /* Likewise, but for a conditional pattern.  */
16619
16620 bool
16621 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16622 {
16623   machine_mode mode = GET_MODE (operands[0]);
16624   if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16625     {
16626       rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16627                                   NULL_RTX, true, OPTAB_DIRECT);
16628       emit_insn (gen_cond (code, mode, operands[0], operands[1],
16629                            operands[4], product, operands[5]));
16630       return true;
16631     }
16632   operands[3] = force_reg (mode, operands[3]);
16633   return false;
16634 }
16635
16636 static unsigned HOST_WIDE_INT
16637 aarch64_shift_truncation_mask (machine_mode mode)
16638 {
16639   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16640     return 0;
16641   return GET_MODE_UNIT_BITSIZE (mode) - 1;
16642 }
16643
16644 /* Select a format to encode pointers in exception handling data.  */
16645 int
16646 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16647 {
16648    int type;
16649    switch (aarch64_cmodel)
16650      {
16651      case AARCH64_CMODEL_TINY:
16652      case AARCH64_CMODEL_TINY_PIC:
16653      case AARCH64_CMODEL_SMALL:
16654      case AARCH64_CMODEL_SMALL_PIC:
16655      case AARCH64_CMODEL_SMALL_SPIC:
16656        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
16657           for everything.  */
16658        type = DW_EH_PE_sdata4;
16659        break;
16660      default:
16661        /* No assumptions here.  8-byte relocs required.  */
16662        type = DW_EH_PE_sdata8;
16663        break;
16664      }
16665    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16666 }
16667
16668 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
16669
16670 static void
16671 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16672 {
16673   if (aarch64_simd_decl_p (decl))
16674     {
16675       fprintf (stream, "\t.variant_pcs\t");
16676       assemble_name (stream, name);
16677       fprintf (stream, "\n");
16678     }
16679 }
16680
16681 /* The last .arch and .tune assembly strings that we printed.  */
16682 static std::string aarch64_last_printed_arch_string;
16683 static std::string aarch64_last_printed_tune_string;
16684
16685 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
16686    by the function fndecl.  */
16687
16688 void
16689 aarch64_declare_function_name (FILE *stream, const char* name,
16690                                 tree fndecl)
16691 {
16692   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16693
16694   struct cl_target_option *targ_options;
16695   if (target_parts)
16696     targ_options = TREE_TARGET_OPTION (target_parts);
16697   else
16698     targ_options = TREE_TARGET_OPTION (target_option_current_node);
16699   gcc_assert (targ_options);
16700
16701   const struct processor *this_arch
16702     = aarch64_get_arch (targ_options->x_explicit_arch);
16703
16704   uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16705   std::string extension
16706     = aarch64_get_extension_string_for_isa_flags (isa_flags,
16707                                                   this_arch->flags);
16708   /* Only update the assembler .arch string if it is distinct from the last
16709      such string we printed.  */
16710   std::string to_print = this_arch->name + extension;
16711   if (to_print != aarch64_last_printed_arch_string)
16712     {
16713       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16714       aarch64_last_printed_arch_string = to_print;
16715     }
16716
16717   /* Print the cpu name we're tuning for in the comments, might be
16718      useful to readers of the generated asm.  Do it only when it changes
16719      from function to function and verbose assembly is requested.  */
16720   const struct processor *this_tune
16721     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16722
16723   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16724     {
16725       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16726                    this_tune->name);
16727       aarch64_last_printed_tune_string = this_tune->name;
16728     }
16729
16730   aarch64_asm_output_variant_pcs (stream, fndecl, name);
16731
16732   /* Don't forget the type directive for ELF.  */
16733   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16734   ASM_OUTPUT_LABEL (stream, name);
16735 }
16736
16737 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
16738
16739 void
16740 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16741 {
16742   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16743   const char *value = IDENTIFIER_POINTER (target);
16744   aarch64_asm_output_variant_pcs (stream, decl, name);
16745   ASM_OUTPUT_DEF (stream, name, value);
16746 }
16747
16748 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
16749    function symbol references.  */
16750
16751 void
16752 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16753 {
16754   default_elf_asm_output_external (stream, decl, name);
16755   aarch64_asm_output_variant_pcs (stream, decl, name);
16756 }
16757
16758 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16759    Used to output the .cfi_b_key_frame directive when signing the current
16760    function with the B key.  */
16761
16762 void
16763 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16764 {
16765   if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16766       && aarch64_ra_sign_key == AARCH64_KEY_B)
16767         asm_fprintf (f, "\t.cfi_b_key_frame\n");
16768 }
16769
16770 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
16771
16772 static void
16773 aarch64_start_file (void)
16774 {
16775   struct cl_target_option *default_options
16776     = TREE_TARGET_OPTION (target_option_default_node);
16777
16778   const struct processor *default_arch
16779     = aarch64_get_arch (default_options->x_explicit_arch);
16780   uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16781   std::string extension
16782     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16783                                                   default_arch->flags);
16784
16785    aarch64_last_printed_arch_string = default_arch->name + extension;
16786    aarch64_last_printed_tune_string = "";
16787    asm_fprintf (asm_out_file, "\t.arch %s\n",
16788                 aarch64_last_printed_arch_string.c_str ());
16789
16790    default_file_start ();
16791 }
16792
16793 /* Emit load exclusive.  */
16794
16795 static void
16796 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16797                              rtx mem, rtx model_rtx)
16798 {
16799   emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16800 }
16801
16802 /* Emit store exclusive.  */
16803
16804 static void
16805 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16806                               rtx rval, rtx mem, rtx model_rtx)
16807 {
16808   emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16809 }
16810
16811 /* Mark the previous jump instruction as unlikely.  */
16812
16813 static void
16814 aarch64_emit_unlikely_jump (rtx insn)
16815 {
16816   rtx_insn *jump = emit_jump_insn (insn);
16817   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16818 }
16819
16820 /* Expand a compare and swap pattern.  */
16821
16822 void
16823 aarch64_expand_compare_and_swap (rtx operands[])
16824 {
16825   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16826   machine_mode mode, r_mode;
16827
16828   bval = operands[0];
16829   rval = operands[1];
16830   mem = operands[2];
16831   oldval = operands[3];
16832   newval = operands[4];
16833   is_weak = operands[5];
16834   mod_s = operands[6];
16835   mod_f = operands[7];
16836   mode = GET_MODE (mem);
16837
16838   /* Normally the succ memory model must be stronger than fail, but in the
16839      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16840      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
16841   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16842       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16843     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16844
16845   r_mode = mode;
16846   if (mode == QImode || mode == HImode)
16847     {
16848       r_mode = SImode;
16849       rval = gen_reg_rtx (r_mode);
16850     }
16851
16852   if (TARGET_LSE)
16853     {
16854       /* The CAS insn requires oldval and rval overlap, but we need to
16855          have a copy of oldval saved across the operation to tell if
16856          the operation is successful.  */
16857       if (reg_overlap_mentioned_p (rval, oldval))
16858         rval = copy_to_mode_reg (r_mode, oldval);
16859       else
16860         emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16861
16862       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16863                                                    newval, mod_s));
16864       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16865     }
16866   else
16867     {
16868       /* The oldval predicate varies by mode.  Test it and force to reg.  */
16869       insn_code code = code_for_aarch64_compare_and_swap (mode);
16870       if (!insn_data[code].operand[2].predicate (oldval, mode))
16871         oldval = force_reg (mode, oldval);
16872
16873       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16874                                  is_weak, mod_s, mod_f));
16875       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16876     }
16877
16878   if (r_mode != mode)
16879     rval = gen_lowpart (mode, rval);
16880   emit_move_insn (operands[1], rval);
16881
16882   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16883   emit_insn (gen_rtx_SET (bval, x));
16884 }
16885
16886 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16887    sequence implementing an atomic operation.  */
16888
16889 static void
16890 aarch64_emit_post_barrier (enum memmodel model)
16891 {
16892   const enum memmodel base_model = memmodel_base (model);
16893
16894   if (is_mm_sync (model)
16895       && (base_model == MEMMODEL_ACQUIRE
16896           || base_model == MEMMODEL_ACQ_REL
16897           || base_model == MEMMODEL_SEQ_CST))
16898     {
16899       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16900     }
16901 }
16902
16903 /* Split a compare and swap pattern.  */
16904
16905 void
16906 aarch64_split_compare_and_swap (rtx operands[])
16907 {
16908   rtx rval, mem, oldval, newval, scratch;
16909   machine_mode mode;
16910   bool is_weak;
16911   rtx_code_label *label1, *label2;
16912   rtx x, cond;
16913   enum memmodel model;
16914   rtx model_rtx;
16915
16916   rval = operands[0];
16917   mem = operands[1];
16918   oldval = operands[2];
16919   newval = operands[3];
16920   is_weak = (operands[4] != const0_rtx);
16921   model_rtx = operands[5];
16922   scratch = operands[7];
16923   mode = GET_MODE (mem);
16924   model = memmodel_from_int (INTVAL (model_rtx));
16925
16926   /* When OLDVAL is zero and we want the strong version we can emit a tighter
16927     loop:
16928     .label1:
16929         LD[A]XR rval, [mem]
16930         CBNZ    rval, .label2
16931         ST[L]XR scratch, newval, [mem]
16932         CBNZ    scratch, .label1
16933     .label2:
16934         CMP     rval, 0.  */
16935   bool strong_zero_p = !is_weak && oldval == const0_rtx;
16936
16937   label1 = NULL;
16938   if (!is_weak)
16939     {
16940       label1 = gen_label_rtx ();
16941       emit_label (label1);
16942     }
16943   label2 = gen_label_rtx ();
16944
16945   /* The initial load can be relaxed for a __sync operation since a final
16946      barrier will be emitted to stop code hoisting.  */
16947   if (is_mm_sync (model))
16948     aarch64_emit_load_exclusive (mode, rval, mem,
16949                                  GEN_INT (MEMMODEL_RELAXED));
16950   else
16951     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16952
16953   if (strong_zero_p)
16954     {
16955       if (aarch64_track_speculation)
16956         {
16957           /* Emit an explicit compare instruction, so that we can correctly
16958              track the condition codes.  */
16959           rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16960           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16961         }
16962       else
16963         x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16964
16965       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16966                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16967       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16968     }
16969   else
16970     {
16971       cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16972       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16973       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16974                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16975       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16976     }
16977
16978   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16979
16980   if (!is_weak)
16981     {
16982       if (aarch64_track_speculation)
16983         {
16984           /* Emit an explicit compare instruction, so that we can correctly
16985              track the condition codes.  */
16986           rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16987           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16988         }
16989       else
16990         x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16991
16992       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16993                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16994       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16995     }
16996   else
16997     {
16998       cond = gen_rtx_REG (CCmode, CC_REGNUM);
16999       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
17000       emit_insn (gen_rtx_SET (cond, x));
17001     }
17002
17003   emit_label (label2);
17004   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17005      to set the condition flags.  If this is not used it will be removed by
17006      later passes.  */
17007   if (strong_zero_p)
17008     {
17009       cond = gen_rtx_REG (CCmode, CC_REGNUM);
17010       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
17011       emit_insn (gen_rtx_SET (cond, x));
17012     }
17013   /* Emit any final barrier needed for a __sync operation.  */
17014   if (is_mm_sync (model))
17015     aarch64_emit_post_barrier (model);
17016 }
17017
17018 /* Split an atomic operation.  */
17019
17020 void
17021 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17022                          rtx value, rtx model_rtx, rtx cond)
17023 {
17024   machine_mode mode = GET_MODE (mem);
17025   machine_mode wmode = (mode == DImode ? DImode : SImode);
17026   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17027   const bool is_sync = is_mm_sync (model);
17028   rtx_code_label *label;
17029   rtx x;
17030
17031   /* Split the atomic operation into a sequence.  */
17032   label = gen_label_rtx ();
17033   emit_label (label);
17034
17035   if (new_out)
17036     new_out = gen_lowpart (wmode, new_out);
17037   if (old_out)
17038     old_out = gen_lowpart (wmode, old_out);
17039   else
17040     old_out = new_out;
17041   value = simplify_gen_subreg (wmode, value, mode, 0);
17042
17043   /* The initial load can be relaxed for a __sync operation since a final
17044      barrier will be emitted to stop code hoisting.  */
17045  if (is_sync)
17046     aarch64_emit_load_exclusive (mode, old_out, mem,
17047                                  GEN_INT (MEMMODEL_RELAXED));
17048   else
17049     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17050
17051   switch (code)
17052     {
17053     case SET:
17054       new_out = value;
17055       break;
17056
17057     case NOT:
17058       x = gen_rtx_AND (wmode, old_out, value);
17059       emit_insn (gen_rtx_SET (new_out, x));
17060       x = gen_rtx_NOT (wmode, new_out);
17061       emit_insn (gen_rtx_SET (new_out, x));
17062       break;
17063
17064     case MINUS:
17065       if (CONST_INT_P (value))
17066         {
17067           value = GEN_INT (-INTVAL (value));
17068           code = PLUS;
17069         }
17070       /* Fall through.  */
17071
17072     default:
17073       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17074       emit_insn (gen_rtx_SET (new_out, x));
17075       break;
17076     }
17077
17078   aarch64_emit_store_exclusive (mode, cond, mem,
17079                                 gen_lowpart (mode, new_out), model_rtx);
17080
17081   if (aarch64_track_speculation)
17082     {
17083       /* Emit an explicit compare instruction, so that we can correctly
17084          track the condition codes.  */
17085       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17086       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17087     }
17088   else
17089     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17090
17091   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17092                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17093   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17094
17095   /* Emit any final barrier needed for a __sync operation.  */
17096   if (is_sync)
17097     aarch64_emit_post_barrier (model);
17098 }
17099
17100 static void
17101 aarch64_init_libfuncs (void)
17102 {
17103    /* Half-precision float operations.  The compiler handles all operations
17104      with NULL libfuncs by converting to SFmode.  */
17105
17106   /* Conversions.  */
17107   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17108   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17109
17110   /* Arithmetic.  */
17111   set_optab_libfunc (add_optab, HFmode, NULL);
17112   set_optab_libfunc (sdiv_optab, HFmode, NULL);
17113   set_optab_libfunc (smul_optab, HFmode, NULL);
17114   set_optab_libfunc (neg_optab, HFmode, NULL);
17115   set_optab_libfunc (sub_optab, HFmode, NULL);
17116
17117   /* Comparisons.  */
17118   set_optab_libfunc (eq_optab, HFmode, NULL);
17119   set_optab_libfunc (ne_optab, HFmode, NULL);
17120   set_optab_libfunc (lt_optab, HFmode, NULL);
17121   set_optab_libfunc (le_optab, HFmode, NULL);
17122   set_optab_libfunc (ge_optab, HFmode, NULL);
17123   set_optab_libfunc (gt_optab, HFmode, NULL);
17124   set_optab_libfunc (unord_optab, HFmode, NULL);
17125 }
17126
17127 /* Target hook for c_mode_for_suffix.  */
17128 static machine_mode
17129 aarch64_c_mode_for_suffix (char suffix)
17130 {
17131   if (suffix == 'q')
17132     return TFmode;
17133
17134   return VOIDmode;
17135 }
17136
17137 /* We can only represent floating point constants which will fit in
17138    "quarter-precision" values.  These values are characterised by
17139    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
17140    by:
17141
17142    (-1)^s * (n/16) * 2^r
17143
17144    Where:
17145      's' is the sign bit.
17146      'n' is an integer in the range 16 <= n <= 31.
17147      'r' is an integer in the range -3 <= r <= 4.  */
17148
17149 /* Return true iff X can be represented by a quarter-precision
17150    floating point immediate operand X.  Note, we cannot represent 0.0.  */
17151 bool
17152 aarch64_float_const_representable_p (rtx x)
17153 {
17154   /* This represents our current view of how many bits
17155      make up the mantissa.  */
17156   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17157   int exponent;
17158   unsigned HOST_WIDE_INT mantissa, mask;
17159   REAL_VALUE_TYPE r, m;
17160   bool fail;
17161
17162   x = unwrap_const_vec_duplicate (x);
17163   if (!CONST_DOUBLE_P (x))
17164     return false;
17165
17166   if (GET_MODE (x) == VOIDmode
17167       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17168     return false;
17169
17170   r = *CONST_DOUBLE_REAL_VALUE (x);
17171
17172   /* We cannot represent infinities, NaNs or +/-zero.  We won't
17173      know if we have +zero until we analyse the mantissa, but we
17174      can reject the other invalid values.  */
17175   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17176       || REAL_VALUE_MINUS_ZERO (r))
17177     return false;
17178
17179   /* Extract exponent.  */
17180   r = real_value_abs (&r);
17181   exponent = REAL_EXP (&r);
17182
17183   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17184      highest (sign) bit, with a fixed binary point at bit point_pos.
17185      m1 holds the low part of the mantissa, m2 the high part.
17186      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17187      bits for the mantissa, this can fail (low bits will be lost).  */
17188   real_ldexp (&m, &r, point_pos - exponent);
17189   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17190
17191   /* If the low part of the mantissa has bits set we cannot represent
17192      the value.  */
17193   if (w.ulow () != 0)
17194     return false;
17195   /* We have rejected the lower HOST_WIDE_INT, so update our
17196      understanding of how many bits lie in the mantissa and
17197      look only at the high HOST_WIDE_INT.  */
17198   mantissa = w.elt (1);
17199   point_pos -= HOST_BITS_PER_WIDE_INT;
17200
17201   /* We can only represent values with a mantissa of the form 1.xxxx.  */
17202   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17203   if ((mantissa & mask) != 0)
17204     return false;
17205
17206   /* Having filtered unrepresentable values, we may now remove all
17207      but the highest 5 bits.  */
17208   mantissa >>= point_pos - 5;
17209
17210   /* We cannot represent the value 0.0, so reject it.  This is handled
17211      elsewhere.  */
17212   if (mantissa == 0)
17213     return false;
17214
17215   /* Then, as bit 4 is always set, we can mask it off, leaving
17216      the mantissa in the range [0, 15].  */
17217   mantissa &= ~(1 << 4);
17218   gcc_assert (mantissa <= 15);
17219
17220   /* GCC internally does not use IEEE754-like encoding (where normalized
17221      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
17222      Our mantissa values are shifted 4 places to the left relative to
17223      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17224      by 5 places to correct for GCC's representation.  */
17225   exponent = 5 - exponent;
17226
17227   return (exponent >= 0 && exponent <= 7);
17228 }
17229
17230 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17231    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
17232    output MOVI/MVNI, ORR or BIC immediate.  */
17233 char*
17234 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17235                                    enum simd_immediate_check which)
17236 {
17237   bool is_valid;
17238   static char templ[40];
17239   const char *mnemonic;
17240   const char *shift_op;
17241   unsigned int lane_count = 0;
17242   char element_char;
17243
17244   struct simd_immediate_info info;
17245
17246   /* This will return true to show const_vector is legal for use as either
17247      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17248      It will also update INFO to show how the immediate should be generated.
17249      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
17250   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17251   gcc_assert (is_valid);
17252
17253   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17254   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17255
17256   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17257     {
17258       gcc_assert (info.insn == simd_immediate_info::MOV
17259                   && info.u.mov.shift == 0);
17260       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17261          move immediate path.  */
17262       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17263         info.u.mov.value = GEN_INT (0);
17264       else
17265         {
17266           const unsigned int buf_size = 20;
17267           char float_buf[buf_size] = {'\0'};
17268           real_to_decimal_for_mode (float_buf,
17269                                     CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17270                                     buf_size, buf_size, 1, info.elt_mode);
17271
17272           if (lane_count == 1)
17273             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17274           else
17275             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17276                       lane_count, element_char, float_buf);
17277           return templ;
17278         }
17279     }
17280
17281   gcc_assert (CONST_INT_P (info.u.mov.value));
17282
17283   if (which == AARCH64_CHECK_MOV)
17284     {
17285       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17286       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17287                   ? "msl" : "lsl");
17288       if (lane_count == 1)
17289         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17290                   mnemonic, UINTVAL (info.u.mov.value));
17291       else if (info.u.mov.shift)
17292         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17293                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17294                   element_char, UINTVAL (info.u.mov.value), shift_op,
17295                   info.u.mov.shift);
17296       else
17297         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17298                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17299                   element_char, UINTVAL (info.u.mov.value));
17300     }
17301   else
17302     {
17303       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
17304       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17305       if (info.u.mov.shift)
17306         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17307                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17308                   element_char, UINTVAL (info.u.mov.value), "lsl",
17309                   info.u.mov.shift);
17310       else
17311         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17312                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17313                   element_char, UINTVAL (info.u.mov.value));
17314     }
17315   return templ;
17316 }
17317
17318 char*
17319 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17320 {
17321
17322   /* If a floating point number was passed and we desire to use it in an
17323      integer mode do the conversion to integer.  */
17324   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17325     {
17326       unsigned HOST_WIDE_INT ival;
17327       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17328           gcc_unreachable ();
17329       immediate = gen_int_mode (ival, mode);
17330     }
17331
17332   machine_mode vmode;
17333   /* use a 64 bit mode for everything except for DI/DF mode, where we use
17334      a 128 bit vector mode.  */
17335   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17336
17337   vmode = aarch64_simd_container_mode (mode, width);
17338   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17339   return aarch64_output_simd_mov_immediate (v_op, width);
17340 }
17341
17342 /* Return the output string to use for moving immediate CONST_VECTOR
17343    into an SVE register.  */
17344
17345 char *
17346 aarch64_output_sve_mov_immediate (rtx const_vector)
17347 {
17348   static char templ[40];
17349   struct simd_immediate_info info;
17350   char element_char;
17351
17352   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17353   gcc_assert (is_valid);
17354
17355   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17356
17357   machine_mode vec_mode = GET_MODE (const_vector);
17358   if (aarch64_sve_pred_mode_p (vec_mode))
17359     {
17360       static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17361       if (info.insn == simd_immediate_info::MOV)
17362         {
17363           gcc_assert (info.u.mov.value == const0_rtx);
17364           snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17365         }
17366       else
17367         {
17368           gcc_assert (info.insn == simd_immediate_info::PTRUE);
17369           unsigned int total_bytes;
17370           if (info.u.pattern == AARCH64_SV_ALL
17371               && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17372             snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17373                       total_bytes / GET_MODE_SIZE (info.elt_mode));
17374           else
17375             snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17376                       svpattern_token (info.u.pattern));
17377         }
17378       return buf;
17379     }
17380
17381   if (info.insn == simd_immediate_info::INDEX)
17382     {
17383       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17384                 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17385                 element_char, INTVAL (info.u.index.base),
17386                 INTVAL (info.u.index.step));
17387       return templ;
17388     }
17389
17390   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17391     {
17392       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17393         info.u.mov.value = GEN_INT (0);
17394       else
17395         {
17396           const int buf_size = 20;
17397           char float_buf[buf_size] = {};
17398           real_to_decimal_for_mode (float_buf,
17399                                     CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17400                                     buf_size, buf_size, 1, info.elt_mode);
17401
17402           snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17403                     element_char, float_buf);
17404           return templ;
17405         }
17406     }
17407
17408   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17409             element_char, INTVAL (info.u.mov.value));
17410   return templ;
17411 }
17412
17413 /* Split operands into moves from op[1] + op[2] into op[0].  */
17414
17415 void
17416 aarch64_split_combinev16qi (rtx operands[3])
17417 {
17418   unsigned int dest = REGNO (operands[0]);
17419   unsigned int src1 = REGNO (operands[1]);
17420   unsigned int src2 = REGNO (operands[2]);
17421   machine_mode halfmode = GET_MODE (operands[1]);
17422   unsigned int halfregs = REG_NREGS (operands[1]);
17423   rtx destlo, desthi;
17424
17425   gcc_assert (halfmode == V16QImode);
17426
17427   if (src1 == dest && src2 == dest + halfregs)
17428     {
17429       /* No-op move.  Can't split to nothing; emit something.  */
17430       emit_note (NOTE_INSN_DELETED);
17431       return;
17432     }
17433
17434   /* Preserve register attributes for variable tracking.  */
17435   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17436   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17437                                GET_MODE_SIZE (halfmode));
17438
17439   /* Special case of reversed high/low parts.  */
17440   if (reg_overlap_mentioned_p (operands[2], destlo)
17441       && reg_overlap_mentioned_p (operands[1], desthi))
17442     {
17443       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17444       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17445       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17446     }
17447   else if (!reg_overlap_mentioned_p (operands[2], destlo))
17448     {
17449       /* Try to avoid unnecessary moves if part of the result
17450          is in the right place already.  */
17451       if (src1 != dest)
17452         emit_move_insn (destlo, operands[1]);
17453       if (src2 != dest + halfregs)
17454         emit_move_insn (desthi, operands[2]);
17455     }
17456   else
17457     {
17458       if (src2 != dest + halfregs)
17459         emit_move_insn (desthi, operands[2]);
17460       if (src1 != dest)
17461         emit_move_insn (destlo, operands[1]);
17462     }
17463 }
17464
17465 /* vec_perm support.  */
17466
17467 struct expand_vec_perm_d
17468 {
17469   rtx target, op0, op1;
17470   vec_perm_indices perm;
17471   machine_mode vmode;
17472   unsigned int vec_flags;
17473   bool one_vector_p;
17474   bool testing_p;
17475 };
17476
17477 /* Generate a variable permutation.  */
17478
17479 static void
17480 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17481 {
17482   machine_mode vmode = GET_MODE (target);
17483   bool one_vector_p = rtx_equal_p (op0, op1);
17484
17485   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17486   gcc_checking_assert (GET_MODE (op0) == vmode);
17487   gcc_checking_assert (GET_MODE (op1) == vmode);
17488   gcc_checking_assert (GET_MODE (sel) == vmode);
17489   gcc_checking_assert (TARGET_SIMD);
17490
17491   if (one_vector_p)
17492     {
17493       if (vmode == V8QImode)
17494         {
17495           /* Expand the argument to a V16QI mode by duplicating it.  */
17496           rtx pair = gen_reg_rtx (V16QImode);
17497           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17498           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17499         }
17500       else
17501         {
17502           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17503         }
17504     }
17505   else
17506     {
17507       rtx pair;
17508
17509       if (vmode == V8QImode)
17510         {
17511           pair = gen_reg_rtx (V16QImode);
17512           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17513           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17514         }
17515       else
17516         {
17517           pair = gen_reg_rtx (OImode);
17518           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17519           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17520         }
17521     }
17522 }
17523
17524 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17525    NELT is the number of elements in the vector.  */
17526
17527 void
17528 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17529                          unsigned int nelt)
17530 {
17531   machine_mode vmode = GET_MODE (target);
17532   bool one_vector_p = rtx_equal_p (op0, op1);
17533   rtx mask;
17534
17535   /* The TBL instruction does not use a modulo index, so we must take care
17536      of that ourselves.  */
17537   mask = aarch64_simd_gen_const_vector_dup (vmode,
17538       one_vector_p ? nelt - 1 : 2 * nelt - 1);
17539   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17540
17541   /* For big-endian, we also need to reverse the index within the vector
17542      (but not which vector).  */
17543   if (BYTES_BIG_ENDIAN)
17544     {
17545       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
17546       if (!one_vector_p)
17547         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17548       sel = expand_simple_binop (vmode, XOR, sel, mask,
17549                                  NULL, 0, OPTAB_LIB_WIDEN);
17550     }
17551   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17552 }
17553
17554 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
17555
17556 static void
17557 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17558 {
17559   emit_insn (gen_rtx_SET (target,
17560                           gen_rtx_UNSPEC (GET_MODE (target),
17561                                           gen_rtvec (2, op0, op1), code)));
17562 }
17563
17564 /* Expand an SVE vec_perm with the given operands.  */
17565
17566 void
17567 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17568 {
17569   machine_mode data_mode = GET_MODE (target);
17570   machine_mode sel_mode = GET_MODE (sel);
17571   /* Enforced by the pattern condition.  */
17572   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17573
17574   /* Note: vec_perm indices are supposed to wrap when they go beyond the
17575      size of the two value vectors, i.e. the upper bits of the indices
17576      are effectively ignored.  SVE TBL instead produces 0 for any
17577      out-of-range indices, so we need to modulo all the vec_perm indices
17578      to ensure they are all in range.  */
17579   rtx sel_reg = force_reg (sel_mode, sel);
17580
17581   /* Check if the sel only references the first values vector.  */
17582   if (GET_CODE (sel) == CONST_VECTOR
17583       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17584     {
17585       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17586       return;
17587     }
17588
17589   /* Check if the two values vectors are the same.  */
17590   if (rtx_equal_p (op0, op1))
17591     {
17592       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17593       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17594                                          NULL, 0, OPTAB_DIRECT);
17595       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17596       return;
17597     }
17598
17599   /* Run TBL on for each value vector and combine the results.  */
17600
17601   rtx res0 = gen_reg_rtx (data_mode);
17602   rtx res1 = gen_reg_rtx (data_mode);
17603   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17604   if (GET_CODE (sel) != CONST_VECTOR
17605       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17606     {
17607       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17608                                                        2 * nunits - 1);
17609       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17610                                      NULL, 0, OPTAB_DIRECT);
17611     }
17612   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17613   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17614                                      NULL, 0, OPTAB_DIRECT);
17615   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17616   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17617     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17618   else
17619     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17620 }
17621
17622 /* Recognize patterns suitable for the TRN instructions.  */
17623 static bool
17624 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17625 {
17626   HOST_WIDE_INT odd;
17627   poly_uint64 nelt = d->perm.length ();
17628   rtx out, in0, in1, x;
17629   machine_mode vmode = d->vmode;
17630
17631   if (GET_MODE_UNIT_SIZE (vmode) > 8)
17632     return false;
17633
17634   /* Note that these are little-endian tests.
17635      We correct for big-endian later.  */
17636   if (!d->perm[0].is_constant (&odd)
17637       || (odd != 0 && odd != 1)
17638       || !d->perm.series_p (0, 2, odd, 2)
17639       || !d->perm.series_p (1, 2, nelt + odd, 2))
17640     return false;
17641
17642   /* Success!  */
17643   if (d->testing_p)
17644     return true;
17645
17646   in0 = d->op0;
17647   in1 = d->op1;
17648   /* We don't need a big-endian lane correction for SVE; see the comment
17649      at the head of aarch64-sve.md for details.  */
17650   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17651     {
17652       x = in0, in0 = in1, in1 = x;
17653       odd = !odd;
17654     }
17655   out = d->target;
17656
17657   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17658                                       odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17659   return true;
17660 }
17661
17662 /* Recognize patterns suitable for the UZP instructions.  */
17663 static bool
17664 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17665 {
17666   HOST_WIDE_INT odd;
17667   rtx out, in0, in1, x;
17668   machine_mode vmode = d->vmode;
17669
17670   if (GET_MODE_UNIT_SIZE (vmode) > 8)
17671     return false;
17672
17673   /* Note that these are little-endian tests.
17674      We correct for big-endian later.  */
17675   if (!d->perm[0].is_constant (&odd)
17676       || (odd != 0 && odd != 1)
17677       || !d->perm.series_p (0, 1, odd, 2))
17678     return false;
17679
17680   /* Success!  */
17681   if (d->testing_p)
17682     return true;
17683
17684   in0 = d->op0;
17685   in1 = d->op1;
17686   /* We don't need a big-endian lane correction for SVE; see the comment
17687      at the head of aarch64-sve.md for details.  */
17688   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17689     {
17690       x = in0, in0 = in1, in1 = x;
17691       odd = !odd;
17692     }
17693   out = d->target;
17694
17695   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17696                                       odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17697   return true;
17698 }
17699
17700 /* Recognize patterns suitable for the ZIP instructions.  */
17701 static bool
17702 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17703 {
17704   unsigned int high;
17705   poly_uint64 nelt = d->perm.length ();
17706   rtx out, in0, in1, x;
17707   machine_mode vmode = d->vmode;
17708
17709   if (GET_MODE_UNIT_SIZE (vmode) > 8)
17710     return false;
17711
17712   /* Note that these are little-endian tests.
17713      We correct for big-endian later.  */
17714   poly_uint64 first = d->perm[0];
17715   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17716       || !d->perm.series_p (0, 2, first, 1)
17717       || !d->perm.series_p (1, 2, first + nelt, 1))
17718     return false;
17719   high = maybe_ne (first, 0U);
17720
17721   /* Success!  */
17722   if (d->testing_p)
17723     return true;
17724
17725   in0 = d->op0;
17726   in1 = d->op1;
17727   /* We don't need a big-endian lane correction for SVE; see the comment
17728      at the head of aarch64-sve.md for details.  */
17729   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17730     {
17731       x = in0, in0 = in1, in1 = x;
17732       high = !high;
17733     }
17734   out = d->target;
17735
17736   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17737                                       high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17738   return true;
17739 }
17740
17741 /* Recognize patterns for the EXT insn.  */
17742
17743 static bool
17744 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17745 {
17746   HOST_WIDE_INT location;
17747   rtx offset;
17748
17749   /* The first element always refers to the first vector.
17750      Check if the extracted indices are increasing by one.  */
17751   if (d->vec_flags == VEC_SVE_PRED
17752       || !d->perm[0].is_constant (&location)
17753       || !d->perm.series_p (0, 1, location, 1))
17754     return false;
17755
17756   /* Success! */
17757   if (d->testing_p)
17758     return true;
17759
17760   /* The case where (location == 0) is a no-op for both big- and little-endian,
17761      and is removed by the mid-end at optimization levels -O1 and higher.
17762
17763      We don't need a big-endian lane correction for SVE; see the comment
17764      at the head of aarch64-sve.md for details.  */
17765   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17766     {
17767       /* After setup, we want the high elements of the first vector (stored
17768          at the LSB end of the register), and the low elements of the second
17769          vector (stored at the MSB end of the register). So swap.  */
17770       std::swap (d->op0, d->op1);
17771       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17772          to_constant () is safe since this is restricted to Advanced SIMD
17773          vectors.  */
17774       location = d->perm.length ().to_constant () - location;
17775     }
17776
17777   offset = GEN_INT (location);
17778   emit_set_insn (d->target,
17779                  gen_rtx_UNSPEC (d->vmode,
17780                                  gen_rtvec (3, d->op0, d->op1, offset),
17781                                  UNSPEC_EXT));
17782   return true;
17783 }
17784
17785 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17786    within each 64-bit, 32-bit or 16-bit granule.  */
17787
17788 static bool
17789 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17790 {
17791   HOST_WIDE_INT diff;
17792   unsigned int i, size, unspec;
17793   machine_mode pred_mode;
17794
17795   if (d->vec_flags == VEC_SVE_PRED
17796       || !d->one_vector_p
17797       || !d->perm[0].is_constant (&diff))
17798     return false;
17799
17800   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17801   if (size == 8)
17802     {
17803       unspec = UNSPEC_REV64;
17804       pred_mode = VNx2BImode;
17805     }
17806   else if (size == 4)
17807     {
17808       unspec = UNSPEC_REV32;
17809       pred_mode = VNx4BImode;
17810     }
17811   else if (size == 2)
17812     {
17813       unspec = UNSPEC_REV16;
17814       pred_mode = VNx8BImode;
17815     }
17816   else
17817     return false;
17818
17819   unsigned int step = diff + 1;
17820   for (i = 0; i < step; ++i)
17821     if (!d->perm.series_p (i, step, diff - i, step))
17822       return false;
17823
17824   /* Success! */
17825   if (d->testing_p)
17826     return true;
17827
17828   if (d->vec_flags == VEC_SVE_DATA)
17829     {
17830       machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17831       rtx target = gen_reg_rtx (int_mode);
17832       if (BYTES_BIG_ENDIAN)
17833         /* The act of taking a subreg between INT_MODE and d->vmode
17834            is itself a reversing operation on big-endian targets;
17835            see the comment at the head of aarch64-sve.md for details.
17836            First reinterpret OP0 as INT_MODE without using a subreg
17837            and without changing the contents.  */
17838         emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17839       else
17840         {
17841           /* For SVE we use REV[BHW] unspecs derived from the element size
17842              of v->mode and vector modes whose elements have SIZE bytes.
17843              This ensures that the vector modes match the predicate modes.  */
17844           int unspec = aarch64_sve_rev_unspec (d->vmode);
17845           rtx pred = aarch64_ptrue_reg (pred_mode);
17846           emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17847                                        gen_lowpart (int_mode, d->op0)));
17848         }
17849       emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17850       return true;
17851     }
17852   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17853   emit_set_insn (d->target, src);
17854   return true;
17855 }
17856
17857 /* Recognize patterns for the REV insn, which reverses elements within
17858    a full vector.  */
17859
17860 static bool
17861 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17862 {
17863   poly_uint64 nelt = d->perm.length ();
17864
17865   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17866     return false;
17867
17868   if (!d->perm.series_p (0, 1, nelt - 1, -1))
17869     return false;
17870
17871   /* Success! */
17872   if (d->testing_p)
17873     return true;
17874
17875   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17876   emit_set_insn (d->target, src);
17877   return true;
17878 }
17879
17880 static bool
17881 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17882 {
17883   rtx out = d->target;
17884   rtx in0;
17885   HOST_WIDE_INT elt;
17886   machine_mode vmode = d->vmode;
17887   rtx lane;
17888
17889   if (d->vec_flags == VEC_SVE_PRED
17890       || d->perm.encoding ().encoded_nelts () != 1
17891       || !d->perm[0].is_constant (&elt))
17892     return false;
17893
17894   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17895     return false;
17896
17897   /* Success! */
17898   if (d->testing_p)
17899     return true;
17900
17901   /* The generic preparation in aarch64_expand_vec_perm_const_1
17902      swaps the operand order and the permute indices if it finds
17903      d->perm[0] to be in the second operand.  Thus, we can always
17904      use d->op0 and need not do any extra arithmetic to get the
17905      correct lane number.  */
17906   in0 = d->op0;
17907   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
17908
17909   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17910   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17911   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17912   return true;
17913 }
17914
17915 static bool
17916 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17917 {
17918   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17919   machine_mode vmode = d->vmode;
17920
17921   /* Make sure that the indices are constant.  */
17922   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17923   for (unsigned int i = 0; i < encoded_nelts; ++i)
17924     if (!d->perm[i].is_constant ())
17925       return false;
17926
17927   if (d->testing_p)
17928     return true;
17929
17930   /* Generic code will try constant permutation twice.  Once with the
17931      original mode and again with the elements lowered to QImode.
17932      So wait and don't do the selector expansion ourselves.  */
17933   if (vmode != V8QImode && vmode != V16QImode)
17934     return false;
17935
17936   /* to_constant is safe since this routine is specific to Advanced SIMD
17937      vectors.  */
17938   unsigned int nelt = d->perm.length ().to_constant ();
17939   for (unsigned int i = 0; i < nelt; ++i)
17940     /* If big-endian and two vectors we end up with a weird mixed-endian
17941        mode on NEON.  Reverse the index within each word but not the word
17942        itself.  to_constant is safe because we checked is_constant above.  */
17943     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17944                         ? d->perm[i].to_constant () ^ (nelt - 1)
17945                         : d->perm[i].to_constant ());
17946
17947   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17948   sel = force_reg (vmode, sel);
17949
17950   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17951   return true;
17952 }
17953
17954 /* Try to implement D using an SVE TBL instruction.  */
17955
17956 static bool
17957 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17958 {
17959   unsigned HOST_WIDE_INT nelt;
17960
17961   /* Permuting two variable-length vectors could overflow the
17962      index range.  */
17963   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17964     return false;
17965
17966   if (d->testing_p)
17967     return true;
17968
17969   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17970   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17971   if (d->one_vector_p)
17972     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17973   else
17974     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17975   return true;
17976 }
17977
17978 static bool
17979 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17980 {
17981   /* The pattern matching functions above are written to look for a small
17982      number to begin the sequence (0, 1, N/2).  If we begin with an index
17983      from the second operand, we can swap the operands.  */
17984   poly_int64 nelt = d->perm.length ();
17985   if (known_ge (d->perm[0], nelt))
17986     {
17987       d->perm.rotate_inputs (1);
17988       std::swap (d->op0, d->op1);
17989     }
17990
17991   if ((d->vec_flags == VEC_ADVSIMD
17992        || d->vec_flags == VEC_SVE_DATA
17993        || d->vec_flags == VEC_SVE_PRED)
17994       && known_gt (nelt, 1))
17995     {
17996       if (aarch64_evpc_rev_local (d))
17997         return true;
17998       else if (aarch64_evpc_rev_global (d))
17999         return true;
18000       else if (aarch64_evpc_ext (d))
18001         return true;
18002       else if (aarch64_evpc_dup (d))
18003         return true;
18004       else if (aarch64_evpc_zip (d))
18005         return true;
18006       else if (aarch64_evpc_uzp (d))
18007         return true;
18008       else if (aarch64_evpc_trn (d))
18009         return true;
18010       if (d->vec_flags == VEC_SVE_DATA)
18011         return aarch64_evpc_sve_tbl (d);
18012       else if (d->vec_flags == VEC_ADVSIMD)
18013         return aarch64_evpc_tbl (d);
18014     }
18015   return false;
18016 }
18017
18018 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
18019
18020 static bool
18021 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18022                                   rtx op1, const vec_perm_indices &sel)
18023 {
18024   struct expand_vec_perm_d d;
18025
18026   /* Check whether the mask can be applied to a single vector.  */
18027   if (sel.ninputs () == 1
18028       || (op0 && rtx_equal_p (op0, op1)))
18029     d.one_vector_p = true;
18030   else if (sel.all_from_input_p (0))
18031     {
18032       d.one_vector_p = true;
18033       op1 = op0;
18034     }
18035   else if (sel.all_from_input_p (1))
18036     {
18037       d.one_vector_p = true;
18038       op0 = op1;
18039     }
18040   else
18041     d.one_vector_p = false;
18042
18043   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18044                      sel.nelts_per_input ());
18045   d.vmode = vmode;
18046   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18047   d.target = target;
18048   d.op0 = op0;
18049   d.op1 = op1;
18050   d.testing_p = !target;
18051
18052   if (!d.testing_p)
18053     return aarch64_expand_vec_perm_const_1 (&d);
18054
18055   rtx_insn *last = get_last_insn ();
18056   bool ret = aarch64_expand_vec_perm_const_1 (&d);
18057   gcc_assert (last == get_last_insn ());
18058
18059   return ret;
18060 }
18061
18062 /* Generate a byte permute mask for a register of mode MODE,
18063    which has NUNITS units.  */
18064
18065 rtx
18066 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18067 {
18068   /* We have to reverse each vector because we dont have
18069      a permuted load that can reverse-load according to ABI rules.  */
18070   rtx mask;
18071   rtvec v = rtvec_alloc (16);
18072   unsigned int i, j;
18073   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18074
18075   gcc_assert (BYTES_BIG_ENDIAN);
18076   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18077
18078   for (i = 0; i < nunits; i++)
18079     for (j = 0; j < usize; j++)
18080       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18081   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18082   return force_reg (V16QImode, mask);
18083 }
18084
18085 /* Expand an SVE integer comparison using the SVE equivalent of:
18086
18087      (set TARGET (CODE OP0 OP1)).  */
18088
18089 void
18090 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18091 {
18092   machine_mode pred_mode = GET_MODE (target);
18093   machine_mode data_mode = GET_MODE (op0);
18094   rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18095                                       op0, op1);
18096   if (!rtx_equal_p (target, res))
18097     emit_move_insn (target, res);
18098 }
18099
18100 /* Return the UNSPEC_COND_* code for comparison CODE.  */
18101
18102 static unsigned int
18103 aarch64_unspec_cond_code (rtx_code code)
18104 {
18105   switch (code)
18106     {
18107     case NE:
18108       return UNSPEC_COND_FCMNE;
18109     case EQ:
18110       return UNSPEC_COND_FCMEQ;
18111     case LT:
18112       return UNSPEC_COND_FCMLT;
18113     case GT:
18114       return UNSPEC_COND_FCMGT;
18115     case LE:
18116       return UNSPEC_COND_FCMLE;
18117     case GE:
18118       return UNSPEC_COND_FCMGE;
18119     case UNORDERED:
18120       return UNSPEC_COND_FCMUO;
18121     default:
18122       gcc_unreachable ();
18123     }
18124 }
18125
18126 /* Emit:
18127
18128       (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18129
18130    where <X> is the operation associated with comparison CODE.
18131    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
18132
18133 static void
18134 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18135                           bool known_ptrue_p, rtx op0, rtx op1)
18136 {
18137   rtx flag = gen_int_mode (known_ptrue_p, SImode);
18138   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18139                                gen_rtvec (4, pred, flag, op0, op1),
18140                                aarch64_unspec_cond_code (code));
18141   emit_set_insn (target, unspec);
18142 }
18143
18144 /* Emit the SVE equivalent of:
18145
18146       (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18147       (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18148       (set TARGET (ior:PRED_MODE TMP1 TMP2))
18149
18150    where <Xi> is the operation associated with comparison CODEi.
18151    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
18152
18153 static void
18154 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18155                               rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18156 {
18157   machine_mode pred_mode = GET_MODE (pred);
18158   rtx tmp1 = gen_reg_rtx (pred_mode);
18159   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18160   rtx tmp2 = gen_reg_rtx (pred_mode);
18161   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18162   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18163 }
18164
18165 /* Emit the SVE equivalent of:
18166
18167       (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18168       (set TARGET (not TMP))
18169
18170    where <X> is the operation associated with comparison CODE.
18171    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
18172
18173 static void
18174 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18175                                  bool known_ptrue_p, rtx op0, rtx op1)
18176 {
18177   machine_mode pred_mode = GET_MODE (pred);
18178   rtx tmp = gen_reg_rtx (pred_mode);
18179   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18180   aarch64_emit_unop (target, one_cmpl_optab, tmp);
18181 }
18182
18183 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18184
18185      (set TARGET (CODE OP0 OP1))
18186
18187    If CAN_INVERT_P is true, the caller can also handle inverted results;
18188    return true if the result is in fact inverted.  */
18189
18190 bool
18191 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18192                                   rtx op0, rtx op1, bool can_invert_p)
18193 {
18194   machine_mode pred_mode = GET_MODE (target);
18195   machine_mode data_mode = GET_MODE (op0);
18196
18197   rtx ptrue = aarch64_ptrue_reg (pred_mode);
18198   switch (code)
18199     {
18200     case UNORDERED:
18201       /* UNORDERED has no immediate form.  */
18202       op1 = force_reg (data_mode, op1);
18203       /* fall through */
18204     case LT:
18205     case LE:
18206     case GT:
18207     case GE:
18208     case EQ:
18209     case NE:
18210       {
18211         /* There is native support for the comparison.  */
18212         aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18213         return false;
18214       }
18215
18216     case LTGT:
18217       /* This is a trapping operation (LT or GT).  */
18218       aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18219       return false;
18220
18221     case UNEQ:
18222       if (!flag_trapping_math)
18223         {
18224           /* This would trap for signaling NaNs.  */
18225           op1 = force_reg (data_mode, op1);
18226           aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18227                                         ptrue, true, op0, op1);
18228           return false;
18229         }
18230       /* fall through */
18231     case UNLT:
18232     case UNLE:
18233     case UNGT:
18234     case UNGE:
18235       if (flag_trapping_math)
18236         {
18237           /* Work out which elements are ordered.  */
18238           rtx ordered = gen_reg_rtx (pred_mode);
18239           op1 = force_reg (data_mode, op1);
18240           aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18241                                            ptrue, true, op0, op1);
18242
18243           /* Test the opposite condition for the ordered elements,
18244              then invert the result.  */
18245           if (code == UNEQ)
18246             code = NE;
18247           else
18248             code = reverse_condition_maybe_unordered (code);
18249           if (can_invert_p)
18250             {
18251               aarch64_emit_sve_fp_cond (target, code,
18252                                         ordered, false, op0, op1);
18253               return true;
18254             }
18255           aarch64_emit_sve_invert_fp_cond (target, code,
18256                                            ordered, false, op0, op1);
18257           return false;
18258         }
18259       break;
18260
18261     case ORDERED:
18262       /* ORDERED has no immediate form.  */
18263       op1 = force_reg (data_mode, op1);
18264       break;
18265
18266     default:
18267       gcc_unreachable ();
18268     }
18269
18270   /* There is native support for the inverse comparison.  */
18271   code = reverse_condition_maybe_unordered (code);
18272   if (can_invert_p)
18273     {
18274       aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18275       return true;
18276     }
18277   aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18278   return false;
18279 }
18280
18281 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
18282    of the data being selected and CMP_MODE is the mode of the values being
18283    compared.  */
18284
18285 void
18286 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18287                           rtx *ops)
18288 {
18289   machine_mode pred_mode
18290     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18291                              GET_MODE_SIZE (cmp_mode)).require ();
18292   rtx pred = gen_reg_rtx (pred_mode);
18293   if (FLOAT_MODE_P (cmp_mode))
18294     {
18295       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18296                                             ops[4], ops[5], true))
18297         std::swap (ops[1], ops[2]);
18298     }
18299   else
18300     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18301
18302   if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18303     ops[1] = force_reg (data_mode, ops[1]);
18304   /* The "false" value can only be zero if the "true" value is a constant.  */
18305   if (register_operand (ops[1], data_mode)
18306       || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18307     ops[2] = force_reg (data_mode, ops[2]);
18308
18309   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18310   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18311 }
18312
18313 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
18314    true.  However due to issues with register allocation it is preferable
18315    to avoid tieing integer scalar and FP scalar modes.  Executing integer
18316    operations in general registers is better than treating them as scalar
18317    vector operations.  This reduces latency and avoids redundant int<->FP
18318    moves.  So tie modes if they are either the same class, or vector modes
18319    with other vector modes, vector structs or any scalar mode.  */
18320
18321 static bool
18322 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18323 {
18324   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18325     return true;
18326
18327   /* We specifically want to allow elements of "structure" modes to
18328      be tieable to the structure.  This more general condition allows
18329      other rarer situations too.  The reason we don't extend this to
18330      predicate modes is that there are no predicate structure modes
18331      nor any specific instructions for extracting part of a predicate
18332      register.  */
18333   if (aarch64_vector_data_mode_p (mode1)
18334       && aarch64_vector_data_mode_p (mode2))
18335     return true;
18336
18337   /* Also allow any scalar modes with vectors.  */
18338   if (aarch64_vector_mode_supported_p (mode1)
18339       || aarch64_vector_mode_supported_p (mode2))
18340     return true;
18341
18342   return false;
18343 }
18344
18345 /* Return a new RTX holding the result of moving POINTER forward by
18346    AMOUNT bytes.  */
18347
18348 static rtx
18349 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18350 {
18351   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18352
18353   return adjust_automodify_address (pointer, GET_MODE (pointer),
18354                                     next, amount);
18355 }
18356
18357 /* Return a new RTX holding the result of moving POINTER forward by the
18358    size of the mode it points to.  */
18359
18360 static rtx
18361 aarch64_progress_pointer (rtx pointer)
18362 {
18363   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18364 }
18365
18366 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18367    MODE bytes.  */
18368
18369 static void
18370 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18371                                               machine_mode mode)
18372 {
18373   rtx reg = gen_reg_rtx (mode);
18374
18375   /* "Cast" the pointers to the correct mode.  */
18376   *src = adjust_address (*src, mode, 0);
18377   *dst = adjust_address (*dst, mode, 0);
18378   /* Emit the memcpy.  */
18379   emit_move_insn (reg, *src);
18380   emit_move_insn (*dst, reg);
18381   /* Move the pointers forward.  */
18382   *src = aarch64_progress_pointer (*src);
18383   *dst = aarch64_progress_pointer (*dst);
18384 }
18385
18386 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
18387    we succeed, otherwise return false.  */
18388
18389 bool
18390 aarch64_expand_cpymem (rtx *operands)
18391 {
18392   int n, mode_bits;
18393   rtx dst = operands[0];
18394   rtx src = operands[1];
18395   rtx base;
18396   machine_mode cur_mode = BLKmode, next_mode;
18397   bool speed_p = !optimize_function_for_size_p (cfun);
18398
18399   /* When optimizing for size, give a better estimate of the length of a
18400      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
18401      will always require an even number of instructions to do now.  And each
18402      operation requires both a load+store, so devide the max number by 2.  */
18403   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18404
18405   /* We can't do anything smart if the amount to copy is not constant.  */
18406   if (!CONST_INT_P (operands[2]))
18407     return false;
18408
18409   n = INTVAL (operands[2]);
18410
18411   /* Try to keep the number of instructions low.  For all cases we will do at
18412      most two moves for the residual amount, since we'll always overlap the
18413      remainder.  */
18414   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18415     return false;
18416
18417   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18418   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18419
18420   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18421   src = adjust_automodify_address (src, VOIDmode, base, 0);
18422
18423   /* Convert n to bits to make the rest of the code simpler.  */
18424   n = n * BITS_PER_UNIT;
18425
18426   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
18427      larger than TImode, but we should not use them for loads/stores here.  */
18428   const int copy_limit = GET_MODE_BITSIZE (TImode);
18429
18430   while (n > 0)
18431     {
18432       /* Find the largest mode in which to do the copy in without over reading
18433          or writing.  */
18434       opt_scalar_int_mode mode_iter;
18435       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18436         if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18437           cur_mode = mode_iter.require ();
18438
18439       gcc_assert (cur_mode != BLKmode);
18440
18441       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18442       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18443
18444       n -= mode_bits;
18445
18446       /* Do certain trailing copies as overlapping if it's going to be
18447          cheaper.  i.e. less instructions to do so.  For instance doing a 15
18448          byte copy it's more efficient to do two overlapping 8 byte copies than
18449          8 + 6 + 1.  */
18450       if (n > 0 && n <= 8 * BITS_PER_UNIT)
18451         {
18452           next_mode = smallest_mode_for_size (n, MODE_INT);
18453           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18454           src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18455           dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18456           n = n_bits;
18457         }
18458     }
18459
18460   return true;
18461 }
18462
18463 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18464    SImode stores.  Handle the case when the constant has identical
18465    bottom and top halves.  This is beneficial when the two stores can be
18466    merged into an STP and we avoid synthesising potentially expensive
18467    immediates twice.  Return true if such a split is possible.  */
18468
18469 bool
18470 aarch64_split_dimode_const_store (rtx dst, rtx src)
18471 {
18472   rtx lo = gen_lowpart (SImode, src);
18473   rtx hi = gen_highpart_mode (SImode, DImode, src);
18474
18475   bool size_p = optimize_function_for_size_p (cfun);
18476
18477   if (!rtx_equal_p (lo, hi))
18478     return false;
18479
18480   unsigned int orig_cost
18481     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18482   unsigned int lo_cost
18483     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18484
18485   /* We want to transform:
18486      MOV        x1, 49370
18487      MOVK       x1, 0x140, lsl 16
18488      MOVK       x1, 0xc0da, lsl 32
18489      MOVK       x1, 0x140, lsl 48
18490      STR        x1, [x0]
18491    into:
18492      MOV        w1, 49370
18493      MOVK       w1, 0x140, lsl 16
18494      STP        w1, w1, [x0]
18495    So we want to perform this only when we save two instructions
18496    or more.  When optimizing for size, however, accept any code size
18497    savings we can.  */
18498   if (size_p && orig_cost <= lo_cost)
18499     return false;
18500
18501   if (!size_p
18502       && (orig_cost <= lo_cost + 1))
18503     return false;
18504
18505   rtx mem_lo = adjust_address (dst, SImode, 0);
18506   if (!aarch64_mem_pair_operand (mem_lo, SImode))
18507     return false;
18508
18509   rtx tmp_reg = gen_reg_rtx (SImode);
18510   aarch64_expand_mov_immediate (tmp_reg, lo);
18511   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18512   /* Don't emit an explicit store pair as this may not be always profitable.
18513      Let the sched-fusion logic decide whether to merge them.  */
18514   emit_move_insn (mem_lo, tmp_reg);
18515   emit_move_insn (mem_hi, tmp_reg);
18516
18517   return true;
18518 }
18519
18520 /* Generate RTL for a conditional branch with rtx comparison CODE in
18521    mode CC_MODE.  The destination of the unlikely conditional branch
18522    is LABEL_REF.  */
18523
18524 void
18525 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18526                               rtx label_ref)
18527 {
18528   rtx x;
18529   x = gen_rtx_fmt_ee (code, VOIDmode,
18530                       gen_rtx_REG (cc_mode, CC_REGNUM),
18531                       const0_rtx);
18532
18533   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18534                             gen_rtx_LABEL_REF (VOIDmode, label_ref),
18535                             pc_rtx);
18536   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18537 }
18538
18539 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18540
18541    OP1 represents the TImode destination operand 1
18542    OP2 represents the TImode destination operand 2
18543    LOW_DEST represents the low half (DImode) of TImode operand 0
18544    LOW_IN1 represents the low half (DImode) of TImode operand 1
18545    LOW_IN2 represents the low half (DImode) of TImode operand 2
18546    HIGH_DEST represents the high half (DImode) of TImode operand 0
18547    HIGH_IN1 represents the high half (DImode) of TImode operand 1
18548    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
18549
18550 void
18551 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18552                             rtx *low_in1, rtx *low_in2,
18553                             rtx *high_dest, rtx *high_in1,
18554                             rtx *high_in2)
18555 {
18556   *low_dest = gen_reg_rtx (DImode);
18557   *low_in1 = gen_lowpart (DImode, op1);
18558   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18559                                   subreg_lowpart_offset (DImode, TImode));
18560   *high_dest = gen_reg_rtx (DImode);
18561   *high_in1 = gen_highpart (DImode, op1);
18562   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18563                                    subreg_highpart_offset (DImode, TImode));
18564 }
18565
18566 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18567
18568    This function differs from 'arch64_addti_scratch_regs' in that
18569    OP1 can be an immediate constant (zero). We must call
18570    subreg_highpart_offset with DImode and TImode arguments, otherwise
18571    VOIDmode will be used for the const_int which generates an internal
18572    error from subreg_size_highpart_offset which does not expect a size of zero.
18573
18574    OP1 represents the TImode destination operand 1
18575    OP2 represents the TImode destination operand 2
18576    LOW_DEST represents the low half (DImode) of TImode operand 0
18577    LOW_IN1 represents the low half (DImode) of TImode operand 1
18578    LOW_IN2 represents the low half (DImode) of TImode operand 2
18579    HIGH_DEST represents the high half (DImode) of TImode operand 0
18580    HIGH_IN1 represents the high half (DImode) of TImode operand 1
18581    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
18582
18583
18584 void
18585 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18586                              rtx *low_in1, rtx *low_in2,
18587                              rtx *high_dest, rtx *high_in1,
18588                              rtx *high_in2)
18589 {
18590   *low_dest = gen_reg_rtx (DImode);
18591   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18592                                   subreg_lowpart_offset (DImode, TImode));
18593
18594   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18595                                   subreg_lowpart_offset (DImode, TImode));
18596   *high_dest = gen_reg_rtx (DImode);
18597
18598   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18599                                    subreg_highpart_offset (DImode, TImode));
18600   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18601                                    subreg_highpart_offset (DImode, TImode));
18602 }
18603
18604 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18605
18606    OP0 represents the TImode destination operand 0
18607    LOW_DEST represents the low half (DImode) of TImode operand 0
18608    LOW_IN1 represents the low half (DImode) of TImode operand 1
18609    LOW_IN2 represents the low half (DImode) of TImode operand 2
18610    HIGH_DEST represents the high half (DImode) of TImode operand 0
18611    HIGH_IN1 represents the high half (DImode) of TImode operand 1
18612    HIGH_IN2 represents the high half (DImode) of TImode operand 2
18613    UNSIGNED_P is true if the operation is being performed on unsigned
18614    values.  */
18615 void
18616 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18617                        rtx low_in2, rtx high_dest, rtx high_in1,
18618                        rtx high_in2, bool unsigned_p)
18619 {
18620   if (low_in2 == const0_rtx)
18621     {
18622       low_dest = low_in1;
18623       high_in2 = force_reg (DImode, high_in2);
18624       if (unsigned_p)
18625         emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18626       else
18627         emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18628     }
18629   else
18630     {
18631       if (CONST_INT_P (low_in2))
18632         {
18633           high_in2 = force_reg (DImode, high_in2);
18634           emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18635                                               GEN_INT (-INTVAL (low_in2))));
18636         }
18637       else
18638         emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18639
18640       if (unsigned_p)
18641         emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18642       else
18643         emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18644     }
18645
18646   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18647   emit_move_insn (gen_highpart (DImode, op0), high_dest);
18648
18649 }
18650
18651 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
18652
18653 static unsigned HOST_WIDE_INT
18654 aarch64_asan_shadow_offset (void)
18655 {
18656   if (TARGET_ILP32)
18657     return (HOST_WIDE_INT_1 << 29);
18658   else
18659     return (HOST_WIDE_INT_1 << 36);
18660 }
18661
18662 static rtx
18663 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18664                         int code, tree treeop0, tree treeop1)
18665 {
18666   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18667   rtx op0, op1;
18668   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18669   insn_code icode;
18670   struct expand_operand ops[4];
18671
18672   start_sequence ();
18673   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18674
18675   op_mode = GET_MODE (op0);
18676   if (op_mode == VOIDmode)
18677     op_mode = GET_MODE (op1);
18678
18679   switch (op_mode)
18680     {
18681     case E_QImode:
18682     case E_HImode:
18683     case E_SImode:
18684       cmp_mode = SImode;
18685       icode = CODE_FOR_cmpsi;
18686       break;
18687
18688     case E_DImode:
18689       cmp_mode = DImode;
18690       icode = CODE_FOR_cmpdi;
18691       break;
18692
18693     case E_SFmode:
18694       cmp_mode = SFmode;
18695       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18696       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18697       break;
18698
18699     case E_DFmode:
18700       cmp_mode = DFmode;
18701       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18702       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18703       break;
18704
18705     default:
18706       end_sequence ();
18707       return NULL_RTX;
18708     }
18709
18710   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18711   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18712   if (!op0 || !op1)
18713     {
18714       end_sequence ();
18715       return NULL_RTX;
18716     }
18717   *prep_seq = get_insns ();
18718   end_sequence ();
18719
18720   create_fixed_operand (&ops[0], op0);
18721   create_fixed_operand (&ops[1], op1);
18722
18723   start_sequence ();
18724   if (!maybe_expand_insn (icode, 2, ops))
18725     {
18726       end_sequence ();
18727       return NULL_RTX;
18728     }
18729   *gen_seq = get_insns ();
18730   end_sequence ();
18731
18732   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18733                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18734 }
18735
18736 static rtx
18737 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18738                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
18739 {
18740   rtx op0, op1, target;
18741   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18742   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18743   insn_code icode;
18744   struct expand_operand ops[6];
18745   int aarch64_cond;
18746
18747   push_to_sequence (*prep_seq);
18748   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18749
18750   op_mode = GET_MODE (op0);
18751   if (op_mode == VOIDmode)
18752     op_mode = GET_MODE (op1);
18753
18754   switch (op_mode)
18755     {
18756     case E_QImode:
18757     case E_HImode:
18758     case E_SImode:
18759       cmp_mode = SImode;
18760       icode = CODE_FOR_ccmpsi;
18761       break;
18762
18763     case E_DImode:
18764       cmp_mode = DImode;
18765       icode = CODE_FOR_ccmpdi;
18766       break;
18767
18768     case E_SFmode:
18769       cmp_mode = SFmode;
18770       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18771       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18772       break;
18773
18774     case E_DFmode:
18775       cmp_mode = DFmode;
18776       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18777       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18778       break;
18779
18780     default:
18781       end_sequence ();
18782       return NULL_RTX;
18783     }
18784
18785   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18786   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18787   if (!op0 || !op1)
18788     {
18789       end_sequence ();
18790       return NULL_RTX;
18791     }
18792   *prep_seq = get_insns ();
18793   end_sequence ();
18794
18795   target = gen_rtx_REG (cc_mode, CC_REGNUM);
18796   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18797
18798   if (bit_code != AND)
18799     {
18800       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18801                                                 GET_MODE (XEXP (prev, 0))),
18802                              VOIDmode, XEXP (prev, 0), const0_rtx);
18803       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18804     }
18805
18806   create_fixed_operand (&ops[0], XEXP (prev, 0));
18807   create_fixed_operand (&ops[1], target);
18808   create_fixed_operand (&ops[2], op0);
18809   create_fixed_operand (&ops[3], op1);
18810   create_fixed_operand (&ops[4], prev);
18811   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18812
18813   push_to_sequence (*gen_seq);
18814   if (!maybe_expand_insn (icode, 6, ops))
18815     {
18816       end_sequence ();
18817       return NULL_RTX;
18818     }
18819
18820   *gen_seq = get_insns ();
18821   end_sequence ();
18822
18823   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18824 }
18825
18826 #undef TARGET_GEN_CCMP_FIRST
18827 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18828
18829 #undef TARGET_GEN_CCMP_NEXT
18830 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18831
18832 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
18833    instruction fusion of some sort.  */
18834
18835 static bool
18836 aarch64_macro_fusion_p (void)
18837 {
18838   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18839 }
18840
18841
18842 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
18843    should be kept together during scheduling.  */
18844
18845 static bool
18846 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18847 {
18848   rtx set_dest;
18849   rtx prev_set = single_set (prev);
18850   rtx curr_set = single_set (curr);
18851   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
18852   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18853
18854   if (!aarch64_macro_fusion_p ())
18855     return false;
18856
18857   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18858     {
18859       /* We are trying to match:
18860          prev (mov)  == (set (reg r0) (const_int imm16))
18861          curr (movk) == (set (zero_extract (reg r0)
18862                                            (const_int 16)
18863                                            (const_int 16))
18864                              (const_int imm16_1))  */
18865
18866       set_dest = SET_DEST (curr_set);
18867
18868       if (GET_CODE (set_dest) == ZERO_EXTRACT
18869           && CONST_INT_P (SET_SRC (curr_set))
18870           && CONST_INT_P (SET_SRC (prev_set))
18871           && CONST_INT_P (XEXP (set_dest, 2))
18872           && INTVAL (XEXP (set_dest, 2)) == 16
18873           && REG_P (XEXP (set_dest, 0))
18874           && REG_P (SET_DEST (prev_set))
18875           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18876         {
18877           return true;
18878         }
18879     }
18880
18881   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18882     {
18883
18884       /*  We're trying to match:
18885           prev (adrp) == (set (reg r1)
18886                               (high (symbol_ref ("SYM"))))
18887           curr (add) == (set (reg r0)
18888                              (lo_sum (reg r1)
18889                                      (symbol_ref ("SYM"))))
18890           Note that r0 need not necessarily be the same as r1, especially
18891           during pre-regalloc scheduling.  */
18892
18893       if (satisfies_constraint_Ush (SET_SRC (prev_set))
18894           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18895         {
18896           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18897               && REG_P (XEXP (SET_SRC (curr_set), 0))
18898               && REGNO (XEXP (SET_SRC (curr_set), 0))
18899                  == REGNO (SET_DEST (prev_set))
18900               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18901                               XEXP (SET_SRC (curr_set), 1)))
18902             return true;
18903         }
18904     }
18905
18906   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18907     {
18908
18909       /* We're trying to match:
18910          prev (movk) == (set (zero_extract (reg r0)
18911                                            (const_int 16)
18912                                            (const_int 32))
18913                              (const_int imm16_1))
18914          curr (movk) == (set (zero_extract (reg r0)
18915                                            (const_int 16)
18916                                            (const_int 48))
18917                              (const_int imm16_2))  */
18918
18919       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18920           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18921           && REG_P (XEXP (SET_DEST (prev_set), 0))
18922           && REG_P (XEXP (SET_DEST (curr_set), 0))
18923           && REGNO (XEXP (SET_DEST (prev_set), 0))
18924              == REGNO (XEXP (SET_DEST (curr_set), 0))
18925           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18926           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18927           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18928           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18929           && CONST_INT_P (SET_SRC (prev_set))
18930           && CONST_INT_P (SET_SRC (curr_set)))
18931         return true;
18932
18933     }
18934   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18935     {
18936       /* We're trying to match:
18937           prev (adrp) == (set (reg r0)
18938                               (high (symbol_ref ("SYM"))))
18939           curr (ldr) == (set (reg r1)
18940                              (mem (lo_sum (reg r0)
18941                                              (symbol_ref ("SYM")))))
18942                  or
18943           curr (ldr) == (set (reg r1)
18944                              (zero_extend (mem
18945                                            (lo_sum (reg r0)
18946                                                    (symbol_ref ("SYM"))))))  */
18947       if (satisfies_constraint_Ush (SET_SRC (prev_set))
18948           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18949         {
18950           rtx curr_src = SET_SRC (curr_set);
18951
18952           if (GET_CODE (curr_src) == ZERO_EXTEND)
18953             curr_src = XEXP (curr_src, 0);
18954
18955           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18956               && REG_P (XEXP (XEXP (curr_src, 0), 0))
18957               && REGNO (XEXP (XEXP (curr_src, 0), 0))
18958                  == REGNO (SET_DEST (prev_set))
18959               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18960                               XEXP (SET_SRC (prev_set), 0)))
18961               return true;
18962         }
18963     }
18964
18965   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18966       && any_condjump_p (curr))
18967     {
18968       unsigned int condreg1, condreg2;
18969       rtx cc_reg_1;
18970       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18971       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18972
18973       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18974           && prev
18975           && modified_in_p (cc_reg_1, prev))
18976         {
18977           enum attr_type prev_type = get_attr_type (prev);
18978
18979           /* FIXME: this misses some which is considered simple arthematic
18980              instructions for ThunderX.  Simple shifts are missed here.  */
18981           if (prev_type == TYPE_ALUS_SREG
18982               || prev_type == TYPE_ALUS_IMM
18983               || prev_type == TYPE_LOGICS_REG
18984               || prev_type == TYPE_LOGICS_IMM)
18985             return true;
18986         }
18987     }
18988
18989   if (prev_set
18990       && curr_set
18991       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18992       && any_condjump_p (curr))
18993     {
18994       /* We're trying to match:
18995           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18996           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
18997                                                          (const_int 0))
18998                                                  (label_ref ("SYM"))
18999                                                  (pc))  */
19000       if (SET_DEST (curr_set) == (pc_rtx)
19001           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19002           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19003           && REG_P (SET_DEST (prev_set))
19004           && REGNO (SET_DEST (prev_set))
19005              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19006         {
19007           /* Fuse ALU operations followed by conditional branch instruction.  */
19008           switch (get_attr_type (prev))
19009             {
19010             case TYPE_ALU_IMM:
19011             case TYPE_ALU_SREG:
19012             case TYPE_ADC_REG:
19013             case TYPE_ADC_IMM:
19014             case TYPE_ADCS_REG:
19015             case TYPE_ADCS_IMM:
19016             case TYPE_LOGIC_REG:
19017             case TYPE_LOGIC_IMM:
19018             case TYPE_CSEL:
19019             case TYPE_ADR:
19020             case TYPE_MOV_IMM:
19021             case TYPE_SHIFT_REG:
19022             case TYPE_SHIFT_IMM:
19023             case TYPE_BFM:
19024             case TYPE_RBIT:
19025             case TYPE_REV:
19026             case TYPE_EXTEND:
19027               return true;
19028
19029             default:;
19030             }
19031         }
19032     }
19033
19034   return false;
19035 }
19036
19037 /* Return true iff the instruction fusion described by OP is enabled.  */
19038
19039 bool
19040 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19041 {
19042   return (aarch64_tune_params.fusible_ops & op) != 0;
19043 }
19044
19045 /* If MEM is in the form of [base+offset], extract the two parts
19046    of address and set to BASE and OFFSET, otherwise return false
19047    after clearing BASE and OFFSET.  */
19048
19049 bool
19050 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19051 {
19052   rtx addr;
19053
19054   gcc_assert (MEM_P (mem));
19055
19056   addr = XEXP (mem, 0);
19057
19058   if (REG_P (addr))
19059     {
19060       *base = addr;
19061       *offset = const0_rtx;
19062       return true;
19063     }
19064
19065   if (GET_CODE (addr) == PLUS
19066       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19067     {
19068       *base = XEXP (addr, 0);
19069       *offset = XEXP (addr, 1);
19070       return true;
19071     }
19072
19073   *base = NULL_RTX;
19074   *offset = NULL_RTX;
19075
19076   return false;
19077 }
19078
19079 /* Types for scheduling fusion.  */
19080 enum sched_fusion_type
19081 {
19082   SCHED_FUSION_NONE = 0,
19083   SCHED_FUSION_LD_SIGN_EXTEND,
19084   SCHED_FUSION_LD_ZERO_EXTEND,
19085   SCHED_FUSION_LD,
19086   SCHED_FUSION_ST,
19087   SCHED_FUSION_NUM
19088 };
19089
19090 /* If INSN is a load or store of address in the form of [base+offset],
19091    extract the two parts and set to BASE and OFFSET.  Return scheduling
19092    fusion type this INSN is.  */
19093
19094 static enum sched_fusion_type
19095 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19096 {
19097   rtx x, dest, src;
19098   enum sched_fusion_type fusion = SCHED_FUSION_LD;
19099
19100   gcc_assert (INSN_P (insn));
19101   x = PATTERN (insn);
19102   if (GET_CODE (x) != SET)
19103     return SCHED_FUSION_NONE;
19104
19105   src = SET_SRC (x);
19106   dest = SET_DEST (x);
19107
19108   machine_mode dest_mode = GET_MODE (dest);
19109
19110   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19111     return SCHED_FUSION_NONE;
19112
19113   if (GET_CODE (src) == SIGN_EXTEND)
19114     {
19115       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19116       src = XEXP (src, 0);
19117       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19118         return SCHED_FUSION_NONE;
19119     }
19120   else if (GET_CODE (src) == ZERO_EXTEND)
19121     {
19122       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19123       src = XEXP (src, 0);
19124       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19125         return SCHED_FUSION_NONE;
19126     }
19127
19128   if (GET_CODE (src) == MEM && REG_P (dest))
19129     extract_base_offset_in_addr (src, base, offset);
19130   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19131     {
19132       fusion = SCHED_FUSION_ST;
19133       extract_base_offset_in_addr (dest, base, offset);
19134     }
19135   else
19136     return SCHED_FUSION_NONE;
19137
19138   if (*base == NULL_RTX || *offset == NULL_RTX)
19139     fusion = SCHED_FUSION_NONE;
19140
19141   return fusion;
19142 }
19143
19144 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19145
19146    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
19147    and PRI are only calculated for these instructions.  For other instruction,
19148    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
19149    type instruction fusion can be added by returning different priorities.
19150
19151    It's important that irrelevant instructions get the largest FUSION_PRI.  */
19152
19153 static void
19154 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19155                                int *fusion_pri, int *pri)
19156 {
19157   int tmp, off_val;
19158   rtx base, offset;
19159   enum sched_fusion_type fusion;
19160
19161   gcc_assert (INSN_P (insn));
19162
19163   tmp = max_pri - 1;
19164   fusion = fusion_load_store (insn, &base, &offset);
19165   if (fusion == SCHED_FUSION_NONE)
19166     {
19167       *pri = tmp;
19168       *fusion_pri = tmp;
19169       return;
19170     }
19171
19172   /* Set FUSION_PRI according to fusion type and base register.  */
19173   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19174
19175   /* Calculate PRI.  */
19176   tmp /= 2;
19177
19178   /* INSN with smaller offset goes first.  */
19179   off_val = (int)(INTVAL (offset));
19180   if (off_val >= 0)
19181     tmp -= (off_val & 0xfffff);
19182   else
19183     tmp += ((- off_val) & 0xfffff);
19184
19185   *pri = tmp;
19186   return;
19187 }
19188
19189 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19190    Adjust priority of sha1h instructions so they are scheduled before
19191    other SHA1 instructions.  */
19192
19193 static int
19194 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19195 {
19196   rtx x = PATTERN (insn);
19197
19198   if (GET_CODE (x) == SET)
19199     {
19200       x = SET_SRC (x);
19201
19202       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19203         return priority + 10;
19204     }
19205
19206   return priority;
19207 }
19208
19209 /* Given OPERANDS of consecutive load/store, check if we can merge
19210    them into ldp/stp.  LOAD is true if they are load instructions.
19211    MODE is the mode of memory operands.  */
19212
19213 bool
19214 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19215                                 machine_mode mode)
19216 {
19217   HOST_WIDE_INT offval_1, offval_2, msize;
19218   enum reg_class rclass_1, rclass_2;
19219   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19220
19221   if (load)
19222     {
19223       mem_1 = operands[1];
19224       mem_2 = operands[3];
19225       reg_1 = operands[0];
19226       reg_2 = operands[2];
19227       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19228       if (REGNO (reg_1) == REGNO (reg_2))
19229         return false;
19230     }
19231   else
19232     {
19233       mem_1 = operands[0];
19234       mem_2 = operands[2];
19235       reg_1 = operands[1];
19236       reg_2 = operands[3];
19237     }
19238
19239   /* The mems cannot be volatile.  */
19240   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19241     return false;
19242
19243   /* If we have SImode and slow unaligned ldp,
19244      check the alignment to be at least 8 byte. */
19245   if (mode == SImode
19246       && (aarch64_tune_params.extra_tuning_flags
19247           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19248       && !optimize_size
19249       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19250     return false;
19251
19252   /* Check if the addresses are in the form of [base+offset].  */
19253   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19254   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19255     return false;
19256   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19257   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19258     return false;
19259
19260   /* Check if the bases are same.  */
19261   if (!rtx_equal_p (base_1, base_2))
19262     return false;
19263
19264   /* The operands must be of the same size.  */
19265   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19266                          GET_MODE_SIZE (GET_MODE (mem_2))));
19267
19268   offval_1 = INTVAL (offset_1);
19269   offval_2 = INTVAL (offset_2);
19270   /* We should only be trying this for fixed-sized modes.  There is no
19271      SVE LDP/STP instruction.  */
19272   msize = GET_MODE_SIZE (mode).to_constant ();
19273   /* Check if the offsets are consecutive.  */
19274   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19275     return false;
19276
19277   /* Check if the addresses are clobbered by load.  */
19278   if (load)
19279     {
19280       if (reg_mentioned_p (reg_1, mem_1))
19281         return false;
19282
19283       /* In increasing order, the last load can clobber the address.  */
19284       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19285         return false;
19286     }
19287
19288   /* One of the memory accesses must be a mempair operand.
19289      If it is not the first one, they need to be swapped by the
19290      peephole.  */
19291   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19292        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19293     return false;
19294
19295   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19296     rclass_1 = FP_REGS;
19297   else
19298     rclass_1 = GENERAL_REGS;
19299
19300   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19301     rclass_2 = FP_REGS;
19302   else
19303     rclass_2 = GENERAL_REGS;
19304
19305   /* Check if the registers are of same class.  */
19306   if (rclass_1 != rclass_2)
19307     return false;
19308
19309   return true;
19310 }
19311
19312 /* Given OPERANDS of consecutive load/store that can be merged,
19313    swap them if they are not in ascending order.  */
19314 void
19315 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19316 {
19317   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19318   HOST_WIDE_INT offval_1, offval_2;
19319
19320   if (load)
19321     {
19322       mem_1 = operands[1];
19323       mem_2 = operands[3];
19324     }
19325   else
19326     {
19327       mem_1 = operands[0];
19328       mem_2 = operands[2];
19329     }
19330
19331   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19332   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19333
19334   offval_1 = INTVAL (offset_1);
19335   offval_2 = INTVAL (offset_2);
19336
19337   if (offval_1 > offval_2)
19338     {
19339       /* Irrespective of whether this is a load or a store,
19340          we do the same swap.  */
19341       std::swap (operands[0], operands[2]);
19342       std::swap (operands[1], operands[3]);
19343     }
19344 }
19345
19346 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19347    comparison between the two.  */
19348 int
19349 aarch64_host_wide_int_compare (const void *x, const void *y)
19350 {
19351   return wi::cmps (* ((const HOST_WIDE_INT *) x),
19352                    * ((const HOST_WIDE_INT *) y));
19353 }
19354
19355 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19356    other pointing to a REG rtx containing an offset, compare the offsets
19357    of the two pairs.
19358
19359    Return:
19360
19361         1 iff offset (X) > offset (Y)
19362         0 iff offset (X) == offset (Y)
19363         -1 iff offset (X) < offset (Y)  */
19364 int
19365 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19366 {
19367   const rtx * operands_1 = (const rtx *) x;
19368   const rtx * operands_2 = (const rtx *) y;
19369   rtx mem_1, mem_2, base, offset_1, offset_2;
19370
19371   if (MEM_P (operands_1[0]))
19372     mem_1 = operands_1[0];
19373   else
19374     mem_1 = operands_1[1];
19375
19376   if (MEM_P (operands_2[0]))
19377     mem_2 = operands_2[0];
19378   else
19379     mem_2 = operands_2[1];
19380
19381   /* Extract the offsets.  */
19382   extract_base_offset_in_addr (mem_1, &base, &offset_1);
19383   extract_base_offset_in_addr (mem_2, &base, &offset_2);
19384
19385   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19386
19387   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19388 }
19389
19390 /* Given OPERANDS of consecutive load/store, check if we can merge
19391    them into ldp/stp by adjusting the offset.  LOAD is true if they
19392    are load instructions.  MODE is the mode of memory operands.
19393
19394    Given below consecutive stores:
19395
19396      str  w1, [xb, 0x100]
19397      str  w1, [xb, 0x104]
19398      str  w1, [xb, 0x108]
19399      str  w1, [xb, 0x10c]
19400
19401    Though the offsets are out of the range supported by stp, we can
19402    still pair them after adjusting the offset, like:
19403
19404      add  scratch, xb, 0x100
19405      stp  w1, w1, [scratch]
19406      stp  w1, w1, [scratch, 0x8]
19407
19408    The peephole patterns detecting this opportunity should guarantee
19409    the scratch register is avaliable.  */
19410
19411 bool
19412 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19413                                        scalar_mode mode)
19414 {
19415   const int num_insns = 4;
19416   enum reg_class rclass;
19417   HOST_WIDE_INT offvals[num_insns], msize;
19418   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19419
19420   if (load)
19421     {
19422       for (int i = 0; i < num_insns; i++)
19423         {
19424           reg[i] = operands[2 * i];
19425           mem[i] = operands[2 * i + 1];
19426
19427           gcc_assert (REG_P (reg[i]));
19428         }
19429
19430       /* Do not attempt to merge the loads if the loads clobber each other.  */
19431       for (int i = 0; i < 8; i += 2)
19432         for (int j = i + 2; j < 8; j += 2)
19433           if (reg_overlap_mentioned_p (operands[i], operands[j]))
19434             return false;
19435     }
19436   else
19437     for (int i = 0; i < num_insns; i++)
19438       {
19439         mem[i] = operands[2 * i];
19440         reg[i] = operands[2 * i + 1];
19441       }
19442
19443   /* Skip if memory operand is by itself valid for ldp/stp.  */
19444   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19445     return false;
19446
19447   for (int i = 0; i < num_insns; i++)
19448     {
19449       /* The mems cannot be volatile.  */
19450       if (MEM_VOLATILE_P (mem[i]))
19451         return false;
19452
19453       /* Check if the addresses are in the form of [base+offset].  */
19454       extract_base_offset_in_addr (mem[i], base + i, offset + i);
19455       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19456         return false;
19457     }
19458
19459   /* Check if the registers are of same class.  */
19460   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19461     ? FP_REGS : GENERAL_REGS;
19462
19463   for (int i = 1; i < num_insns; i++)
19464     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19465       {
19466         if (rclass != FP_REGS)
19467           return false;
19468       }
19469     else
19470       {
19471         if (rclass != GENERAL_REGS)
19472           return false;
19473       }
19474
19475   /* Only the last register in the order in which they occur
19476      may be clobbered by the load.  */
19477   if (rclass == GENERAL_REGS && load)
19478     for (int i = 0; i < num_insns - 1; i++)
19479       if (reg_mentioned_p (reg[i], mem[i]))
19480         return false;
19481
19482   /* Check if the bases are same.  */
19483   for (int i = 0; i < num_insns - 1; i++)
19484     if (!rtx_equal_p (base[i], base[i + 1]))
19485       return false;
19486
19487   for (int i = 0; i < num_insns; i++)
19488     offvals[i] = INTVAL (offset[i]);
19489
19490   msize = GET_MODE_SIZE (mode);
19491
19492   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
19493   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19494          aarch64_host_wide_int_compare);
19495
19496   if (!(offvals[1] == offvals[0] + msize
19497         && offvals[3] == offvals[2] + msize))
19498     return false;
19499
19500   /* Check that offsets are within range of each other.  The ldp/stp
19501      instructions have 7 bit immediate offsets, so use 0x80.  */
19502   if (offvals[2] - offvals[0] >= msize * 0x80)
19503     return false;
19504
19505   /* The offsets must be aligned with respect to each other.  */
19506   if (offvals[0] % msize != offvals[2] % msize)
19507     return false;
19508
19509   /* If we have SImode and slow unaligned ldp,
19510      check the alignment to be at least 8 byte. */
19511   if (mode == SImode
19512       && (aarch64_tune_params.extra_tuning_flags
19513           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19514       && !optimize_size
19515       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19516     return false;
19517
19518   return true;
19519 }
19520
19521 /* Given OPERANDS of consecutive load/store, this function pairs them
19522    into LDP/STP after adjusting the offset.  It depends on the fact
19523    that the operands can be sorted so the offsets are correct for STP.
19524    MODE is the mode of memory operands.  CODE is the rtl operator
19525    which should be applied to all memory operands, it's SIGN_EXTEND,
19526    ZERO_EXTEND or UNKNOWN.  */
19527
19528 bool
19529 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19530                              scalar_mode mode, RTX_CODE code)
19531 {
19532   rtx base, offset_1, offset_3, t1, t2;
19533   rtx mem_1, mem_2, mem_3, mem_4;
19534   rtx temp_operands[8];
19535   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19536                 stp_off_upper_limit, stp_off_lower_limit, msize;
19537
19538   /* We make changes on a copy as we may still bail out.  */
19539   for (int i = 0; i < 8; i ++)
19540     temp_operands[i] = operands[i];
19541
19542   /* Sort the operands.  */
19543   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19544
19545   /* Copy the memory operands so that if we have to bail for some
19546      reason the original addresses are unchanged.  */
19547   if (load)
19548     {
19549       mem_1 = copy_rtx (temp_operands[1]);
19550       mem_2 = copy_rtx (temp_operands[3]);
19551       mem_3 = copy_rtx (temp_operands[5]);
19552       mem_4 = copy_rtx (temp_operands[7]);
19553     }
19554   else
19555     {
19556       mem_1 = copy_rtx (temp_operands[0]);
19557       mem_2 = copy_rtx (temp_operands[2]);
19558       mem_3 = copy_rtx (temp_operands[4]);
19559       mem_4 = copy_rtx (temp_operands[6]);
19560       gcc_assert (code == UNKNOWN);
19561     }
19562
19563   extract_base_offset_in_addr (mem_1, &base, &offset_1);
19564   extract_base_offset_in_addr (mem_3, &base, &offset_3);
19565   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19566               && offset_3 != NULL_RTX);
19567
19568   /* Adjust offset so it can fit in LDP/STP instruction.  */
19569   msize = GET_MODE_SIZE (mode);
19570   stp_off_upper_limit = msize * (0x40 - 1);
19571   stp_off_lower_limit = - msize * 0x40;
19572
19573   off_val_1 = INTVAL (offset_1);
19574   off_val_3 = INTVAL (offset_3);
19575
19576   /* The base offset is optimally half way between the two STP/LDP offsets.  */
19577   if (msize <= 4)
19578     base_off = (off_val_1 + off_val_3) / 2;
19579   else
19580     /* However, due to issues with negative LDP/STP offset generation for
19581        larger modes, for DF, DI and vector modes. we must not use negative
19582        addresses smaller than 9 signed unadjusted bits can store.  This
19583        provides the most range in this case.  */
19584     base_off = off_val_1;
19585
19586   /* Adjust the base so that it is aligned with the addresses but still
19587      optimal.  */
19588   if (base_off % msize != off_val_1 % msize)
19589     /* Fix the offset, bearing in mind we want to make it bigger not
19590        smaller.  */
19591     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19592   else if (msize <= 4)
19593     /* The negative range of LDP/STP is one larger than the positive range.  */
19594     base_off += msize;
19595
19596   /* Check if base offset is too big or too small.  We can attempt to resolve
19597      this issue by setting it to the maximum value and seeing if the offsets
19598      still fit.  */
19599   if (base_off >= 0x1000)
19600     {
19601       base_off = 0x1000 - 1;
19602       /* We must still make sure that the base offset is aligned with respect
19603          to the address.  But it may may not be made any bigger.  */
19604       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19605     }
19606
19607   /* Likewise for the case where the base is too small.  */
19608   if (base_off <= -0x1000)
19609     {
19610       base_off = -0x1000 + 1;
19611       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19612     }
19613
19614   /* Offset of the first STP/LDP.  */
19615   new_off_1 = off_val_1 - base_off;
19616
19617   /* Offset of the second STP/LDP.  */
19618   new_off_3 = off_val_3 - base_off;
19619
19620   /* The offsets must be within the range of the LDP/STP instructions.  */
19621   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19622       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19623     return false;
19624
19625   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19626                                                   new_off_1), true);
19627   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19628                                                   new_off_1 + msize), true);
19629   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19630                                                   new_off_3), true);
19631   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19632                                                   new_off_3 + msize), true);
19633
19634   if (!aarch64_mem_pair_operand (mem_1, mode)
19635       || !aarch64_mem_pair_operand (mem_3, mode))
19636     return false;
19637
19638   if (code == ZERO_EXTEND)
19639     {
19640       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19641       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19642       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19643       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19644     }
19645   else if (code == SIGN_EXTEND)
19646     {
19647       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19648       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19649       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19650       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19651     }
19652
19653   if (load)
19654     {
19655       operands[0] = temp_operands[0];
19656       operands[1] = mem_1;
19657       operands[2] = temp_operands[2];
19658       operands[3] = mem_2;
19659       operands[4] = temp_operands[4];
19660       operands[5] = mem_3;
19661       operands[6] = temp_operands[6];
19662       operands[7] = mem_4;
19663     }
19664   else
19665     {
19666       operands[0] = mem_1;
19667       operands[1] = temp_operands[1];
19668       operands[2] = mem_2;
19669       operands[3] = temp_operands[3];
19670       operands[4] = mem_3;
19671       operands[5] = temp_operands[5];
19672       operands[6] = mem_4;
19673       operands[7] = temp_operands[7];
19674     }
19675
19676   /* Emit adjusting instruction.  */
19677   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19678   /* Emit ldp/stp instructions.  */
19679   t1 = gen_rtx_SET (operands[0], operands[1]);
19680   t2 = gen_rtx_SET (operands[2], operands[3]);
19681   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19682   t1 = gen_rtx_SET (operands[4], operands[5]);
19683   t2 = gen_rtx_SET (operands[6], operands[7]);
19684   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19685   return true;
19686 }
19687
19688 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
19689    it isn't worth branching around empty masked ops (including masked
19690    stores).  */
19691
19692 static bool
19693 aarch64_empty_mask_is_expensive (unsigned)
19694 {
19695   return false;
19696 }
19697
19698 /* Return 1 if pseudo register should be created and used to hold
19699    GOT address for PIC code.  */
19700
19701 bool
19702 aarch64_use_pseudo_pic_reg (void)
19703 {
19704   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19705 }
19706
19707 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
19708
19709 static int
19710 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19711 {
19712   switch (XINT (x, 1))
19713     {
19714     case UNSPEC_GOTSMALLPIC:
19715     case UNSPEC_GOTSMALLPIC28K:
19716     case UNSPEC_GOTTINYPIC:
19717       return 0;
19718     default:
19719       break;
19720     }
19721
19722   return default_unspec_may_trap_p (x, flags);
19723 }
19724
19725
19726 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19727    return the log2 of that value.  Otherwise return -1.  */
19728
19729 int
19730 aarch64_fpconst_pow_of_2 (rtx x)
19731 {
19732   const REAL_VALUE_TYPE *r;
19733
19734   if (!CONST_DOUBLE_P (x))
19735     return -1;
19736
19737   r = CONST_DOUBLE_REAL_VALUE (x);
19738
19739   if (REAL_VALUE_NEGATIVE (*r)
19740       || REAL_VALUE_ISNAN (*r)
19741       || REAL_VALUE_ISINF (*r)
19742       || !real_isinteger (r, DFmode))
19743     return -1;
19744
19745   return exact_log2 (real_to_integer (r));
19746 }
19747
19748 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
19749    power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
19750    return n. Otherwise return -1.  */
19751
19752 int
19753 aarch64_fpconst_pow2_recip (rtx x)
19754 {
19755   REAL_VALUE_TYPE r0;
19756
19757   if (!CONST_DOUBLE_P (x))
19758     return -1;
19759
19760   r0 = *CONST_DOUBLE_REAL_VALUE (x);
19761   if (exact_real_inverse (DFmode, &r0)
19762       && !REAL_VALUE_NEGATIVE (r0))
19763     {
19764         int ret = exact_log2 (real_to_integer (&r0));
19765         if (ret >= 1 && ret <= 32)
19766             return ret;
19767     }
19768   return -1;
19769 }
19770
19771 /* If X is a vector of equal CONST_DOUBLE values and that value is
19772    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
19773
19774 int
19775 aarch64_vec_fpconst_pow_of_2 (rtx x)
19776 {
19777   int nelts;
19778   if (GET_CODE (x) != CONST_VECTOR
19779       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19780     return -1;
19781
19782   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19783     return -1;
19784
19785   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19786   if (firstval <= 0)
19787     return -1;
19788
19789   for (int i = 1; i < nelts; i++)
19790     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19791       return -1;
19792
19793   return firstval;
19794 }
19795
19796 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19797    to float.
19798
19799    __fp16 always promotes through this hook.
19800    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19801    through the generic excess precision logic rather than here.  */
19802
19803 static tree
19804 aarch64_promoted_type (const_tree t)
19805 {
19806   if (SCALAR_FLOAT_TYPE_P (t)
19807       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19808     return float_type_node;
19809
19810   return NULL_TREE;
19811 }
19812
19813 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
19814
19815 static bool
19816 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19817                            optimization_type opt_type)
19818 {
19819   switch (op)
19820     {
19821     case rsqrt_optab:
19822       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19823
19824     default:
19825       return true;
19826     }
19827 }
19828
19829 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
19830
19831 static unsigned int
19832 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19833                                         int *offset)
19834 {
19835   /* Polynomial invariant 1 == (VG / 2) - 1.  */
19836   gcc_assert (i == 1);
19837   *factor = 2;
19838   *offset = 1;
19839   return AARCH64_DWARF_VG;
19840 }
19841
19842 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19843    if MODE is HFmode, and punt to the generic implementation otherwise.  */
19844
19845 static bool
19846 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19847 {
19848   return (mode == HFmode
19849           ? true
19850           : default_libgcc_floating_mode_supported_p (mode));
19851 }
19852
19853 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19854    if MODE is HFmode, and punt to the generic implementation otherwise.  */
19855
19856 static bool
19857 aarch64_scalar_mode_supported_p (scalar_mode mode)
19858 {
19859   return (mode == HFmode
19860           ? true
19861           : default_scalar_mode_supported_p (mode));
19862 }
19863
19864 /* Set the value of FLT_EVAL_METHOD.
19865    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19866
19867     0: evaluate all operations and constants, whose semantic type has at
19868        most the range and precision of type float, to the range and
19869        precision of float; evaluate all other operations and constants to
19870        the range and precision of the semantic type;
19871
19872     N, where _FloatN is a supported interchange floating type
19873        evaluate all operations and constants, whose semantic type has at
19874        most the range and precision of _FloatN type, to the range and
19875        precision of the _FloatN type; evaluate all other operations and
19876        constants to the range and precision of the semantic type;
19877
19878    If we have the ARMv8.2-A extensions then we support _Float16 in native
19879    precision, so we should set this to 16.  Otherwise, we support the type,
19880    but want to evaluate expressions in float precision, so set this to
19881    0.  */
19882
19883 static enum flt_eval_method
19884 aarch64_excess_precision (enum excess_precision_type type)
19885 {
19886   switch (type)
19887     {
19888       case EXCESS_PRECISION_TYPE_FAST:
19889       case EXCESS_PRECISION_TYPE_STANDARD:
19890         /* We can calculate either in 16-bit range and precision or
19891            32-bit range and precision.  Make that decision based on whether
19892            we have native support for the ARMv8.2-A 16-bit floating-point
19893            instructions or not.  */
19894         return (TARGET_FP_F16INST
19895                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19896                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19897       case EXCESS_PRECISION_TYPE_IMPLICIT:
19898         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19899       default:
19900         gcc_unreachable ();
19901     }
19902   return FLT_EVAL_METHOD_UNPREDICTABLE;
19903 }
19904
19905 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
19906    scheduled for speculative execution.  Reject the long-running division
19907    and square-root instructions.  */
19908
19909 static bool
19910 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19911 {
19912   switch (get_attr_type (insn))
19913     {
19914       case TYPE_SDIV:
19915       case TYPE_UDIV:
19916       case TYPE_FDIVS:
19917       case TYPE_FDIVD:
19918       case TYPE_FSQRTS:
19919       case TYPE_FSQRTD:
19920       case TYPE_NEON_FP_SQRT_S:
19921       case TYPE_NEON_FP_SQRT_D:
19922       case TYPE_NEON_FP_SQRT_S_Q:
19923       case TYPE_NEON_FP_SQRT_D_Q:
19924       case TYPE_NEON_FP_DIV_S:
19925       case TYPE_NEON_FP_DIV_D:
19926       case TYPE_NEON_FP_DIV_S_Q:
19927       case TYPE_NEON_FP_DIV_D_Q:
19928         return false;
19929       default:
19930         return true;
19931     }
19932 }
19933
19934 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
19935
19936 static int
19937 aarch64_compute_pressure_classes (reg_class *classes)
19938 {
19939   int i = 0;
19940   classes[i++] = GENERAL_REGS;
19941   classes[i++] = FP_REGS;
19942   /* PR_REGS isn't a useful pressure class because many predicate pseudo
19943      registers need to go in PR_LO_REGS at some point during their
19944      lifetime.  Splitting it into two halves has the effect of making
19945      all predicates count against PR_LO_REGS, so that we try whenever
19946      possible to restrict the number of live predicates to 8.  This
19947      greatly reduces the amount of spilling in certain loops.  */
19948   classes[i++] = PR_LO_REGS;
19949   classes[i++] = PR_HI_REGS;
19950   return i;
19951 }
19952
19953 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
19954
19955 static bool
19956 aarch64_can_change_mode_class (machine_mode from,
19957                                machine_mode to, reg_class_t)
19958 {
19959   if (BYTES_BIG_ENDIAN)
19960     {
19961       bool from_sve_p = aarch64_sve_data_mode_p (from);
19962       bool to_sve_p = aarch64_sve_data_mode_p (to);
19963
19964       /* Don't allow changes between SVE data modes and non-SVE modes.
19965          See the comment at the head of aarch64-sve.md for details.  */
19966       if (from_sve_p != to_sve_p)
19967         return false;
19968
19969       /* Don't allow changes in element size: lane 0 of the new vector
19970          would not then be lane 0 of the old vector.  See the comment
19971          above aarch64_maybe_expand_sve_subreg_move for a more detailed
19972          description.
19973
19974          In the worst case, this forces a register to be spilled in
19975          one mode and reloaded in the other, which handles the
19976          endianness correctly.  */
19977       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19978         return false;
19979     }
19980   return true;
19981 }
19982
19983 /* Implement TARGET_EARLY_REMAT_MODES.  */
19984
19985 static void
19986 aarch64_select_early_remat_modes (sbitmap modes)
19987 {
19988   /* SVE values are not normally live across a call, so it should be
19989      worth doing early rematerialization even in VL-specific mode.  */
19990   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19991     if (aarch64_sve_mode_p ((machine_mode) i))
19992       bitmap_set_bit (modes, i);
19993 }
19994
19995 /* Override the default target speculation_safe_value.  */
19996 static rtx
19997 aarch64_speculation_safe_value (machine_mode mode,
19998                                 rtx result, rtx val, rtx failval)
19999 {
20000   /* Maybe we should warn if falling back to hard barriers.  They are
20001      likely to be noticably more expensive than the alternative below.  */
20002   if (!aarch64_track_speculation)
20003     return default_speculation_safe_value (mode, result, val, failval);
20004
20005   if (!REG_P (val))
20006     val = copy_to_mode_reg (mode, val);
20007
20008   if (!aarch64_reg_or_zero (failval, mode))
20009     failval = copy_to_mode_reg (mode, failval);
20010
20011   emit_insn (gen_despeculate_copy (mode, result, val, failval));
20012   return result;
20013 }
20014
20015 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20016    Look into the tuning structure for an estimate.
20017    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20018    Advanced SIMD 128 bits.  */
20019
20020 static HOST_WIDE_INT
20021 aarch64_estimated_poly_value (poly_int64 val)
20022 {
20023   enum aarch64_sve_vector_bits_enum width_source
20024     = aarch64_tune_params.sve_width;
20025
20026   /* If we still don't have an estimate, use the default.  */
20027   if (width_source == SVE_SCALABLE)
20028     return default_estimated_poly_value (val);
20029
20030   HOST_WIDE_INT over_128 = width_source - 128;
20031   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20032 }
20033
20034
20035 /* Return true for types that could be supported as SIMD return or
20036    argument types.  */
20037
20038 static bool
20039 supported_simd_type (tree t)
20040 {
20041   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20042     {
20043       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20044       return s == 1 || s == 2 || s == 4 || s == 8;
20045     }
20046   return false;
20047 }
20048
20049 /* Return true for types that currently are supported as SIMD return
20050    or argument types.  */
20051
20052 static bool
20053 currently_supported_simd_type (tree t, tree b)
20054 {
20055   if (COMPLEX_FLOAT_TYPE_P (t))
20056     return false;
20057
20058   if (TYPE_SIZE (t) != TYPE_SIZE (b))
20059     return false;
20060
20061   return supported_simd_type (t);
20062 }
20063
20064 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
20065
20066 static int
20067 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20068                                         struct cgraph_simd_clone *clonei,
20069                                         tree base_type, int num)
20070 {
20071   tree t, ret_type, arg_type;
20072   unsigned int elt_bits, vec_bits, count;
20073
20074   if (!TARGET_SIMD)
20075     return 0;
20076
20077   if (clonei->simdlen
20078       && (clonei->simdlen < 2
20079           || clonei->simdlen > 1024
20080           || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20081     {
20082       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20083                   "unsupported simdlen %d", clonei->simdlen);
20084       return 0;
20085     }
20086
20087   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20088   if (TREE_CODE (ret_type) != VOID_TYPE
20089       && !currently_supported_simd_type (ret_type, base_type))
20090     {
20091       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20092         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20093                     "GCC does not currently support mixed size types "
20094                     "for %<simd%> functions");
20095       else if (supported_simd_type (ret_type))
20096         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20097                     "GCC does not currently support return type %qT "
20098                     "for %<simd%> functions", ret_type);
20099       else
20100         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20101                     "unsupported return type %qT for %<simd%> functions",
20102                     ret_type);
20103       return 0;
20104     }
20105
20106   for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20107     {
20108       arg_type = TREE_TYPE (t);
20109
20110       if (!currently_supported_simd_type (arg_type, base_type))
20111         {
20112           if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20113             warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20114                         "GCC does not currently support mixed size types "
20115                         "for %<simd%> functions");
20116           else
20117             warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20118                         "GCC does not currently support argument type %qT "
20119                         "for %<simd%> functions", arg_type);
20120           return 0;
20121         }
20122     }
20123
20124   clonei->vecsize_mangle = 'n';
20125   clonei->mask_mode = VOIDmode;
20126   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20127   if (clonei->simdlen == 0)
20128     {
20129       count = 2;
20130       vec_bits = (num == 0 ? 64 : 128);
20131       clonei->simdlen = vec_bits / elt_bits;
20132     }
20133   else
20134     {
20135       count = 1;
20136       vec_bits = clonei->simdlen * elt_bits;
20137       if (vec_bits != 64 && vec_bits != 128)
20138         {
20139           warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20140                       "GCC does not currently support simdlen %d for type %qT",
20141                       clonei->simdlen, base_type);
20142           return 0;
20143         }
20144     }
20145   clonei->vecsize_int = vec_bits;
20146   clonei->vecsize_float = vec_bits;
20147   return count;
20148 }
20149
20150 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
20151
20152 static void
20153 aarch64_simd_clone_adjust (struct cgraph_node *node)
20154 {
20155   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20156      use the correct ABI.  */
20157
20158   tree t = TREE_TYPE (node->decl);
20159   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20160                                         TYPE_ATTRIBUTES (t));
20161 }
20162
20163 /* Implement TARGET_SIMD_CLONE_USABLE.  */
20164
20165 static int
20166 aarch64_simd_clone_usable (struct cgraph_node *node)
20167 {
20168   switch (node->simdclone->vecsize_mangle)
20169     {
20170     case 'n':
20171       if (!TARGET_SIMD)
20172         return -1;
20173       return 0;
20174     default:
20175       gcc_unreachable ();
20176     }
20177 }
20178
20179 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20180
20181 static int
20182 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20183 {
20184   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20185       != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20186     return 0;
20187   return 1;
20188 }
20189
20190 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20191
20192 static const char *
20193 aarch64_get_multilib_abi_name (void)
20194 {
20195   if (TARGET_BIG_END)
20196     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20197   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20198 }
20199
20200 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
20201    global variable based guard use the default else
20202    return a null tree.  */
20203 static tree
20204 aarch64_stack_protect_guard (void)
20205 {
20206   if (aarch64_stack_protector_guard == SSP_GLOBAL)
20207     return default_stack_protect_guard ();
20208
20209   return NULL_TREE;
20210 }
20211
20212 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
20213    section at the end if needed.  */
20214 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND      0xc0000000
20215 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI      (1U << 0)
20216 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC      (1U << 1)
20217 void
20218 aarch64_file_end_indicate_exec_stack ()
20219 {
20220   file_end_indicate_exec_stack ();
20221
20222   unsigned feature_1_and = 0;
20223   if (aarch64_bti_enabled ())
20224     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20225
20226   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20227     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20228
20229   if (feature_1_and)
20230     {
20231       /* Generate .note.gnu.property section.  */
20232       switch_to_section (get_section (".note.gnu.property",
20233                                       SECTION_NOTYPE, NULL));
20234
20235       /* PT_NOTE header: namesz, descsz, type.
20236          namesz = 4 ("GNU\0")
20237          descsz = 16 (Size of the program property array)
20238                   [(12 + padding) * Number of array elements]
20239          type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
20240       assemble_align (POINTER_SIZE);
20241       assemble_integer (GEN_INT (4), 4, 32, 1);
20242       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20243       assemble_integer (GEN_INT (5), 4, 32, 1);
20244
20245       /* PT_NOTE name.  */
20246       assemble_string ("GNU", 4);
20247
20248       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20249          type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20250          datasz = 4
20251          data   = feature_1_and.  */
20252       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20253       assemble_integer (GEN_INT (4), 4, 32, 1);
20254       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20255
20256       /* Pad the size of the note to the required alignment.  */
20257       assemble_align (POINTER_SIZE);
20258     }
20259 }
20260 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20261 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20262 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20263
20264 /* Target-specific selftests.  */
20265
20266 #if CHECKING_P
20267
20268 namespace selftest {
20269
20270 /* Selftest for the RTL loader.
20271    Verify that the RTL loader copes with a dump from
20272    print_rtx_function.  This is essentially just a test that class
20273    function_reader can handle a real dump, but it also verifies
20274    that lookup_reg_by_dump_name correctly handles hard regs.
20275    The presence of hard reg names in the dump means that the test is
20276    target-specific, hence it is in this file.  */
20277
20278 static void
20279 aarch64_test_loading_full_dump ()
20280 {
20281   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20282
20283   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20284
20285   rtx_insn *insn_1 = get_insn_by_uid (1);
20286   ASSERT_EQ (NOTE, GET_CODE (insn_1));
20287
20288   rtx_insn *insn_15 = get_insn_by_uid (15);
20289   ASSERT_EQ (INSN, GET_CODE (insn_15));
20290   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20291
20292   /* Verify crtl->return_rtx.  */
20293   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20294   ASSERT_EQ (0, REGNO (crtl->return_rtx));
20295   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20296 }
20297
20298 /* Run all target-specific selftests.  */
20299
20300 static void
20301 aarch64_run_selftests (void)
20302 {
20303   aarch64_test_loading_full_dump ();
20304 }
20305
20306 } // namespace selftest
20307
20308 #endif /* #if CHECKING_P */
20309
20310 #undef TARGET_STACK_PROTECT_GUARD
20311 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20312
20313 #undef TARGET_ADDRESS_COST
20314 #define TARGET_ADDRESS_COST aarch64_address_cost
20315
20316 /* This hook will determines whether unnamed bitfields affect the alignment
20317    of the containing structure.  The hook returns true if the structure
20318    should inherit the alignment requirements of an unnamed bitfield's
20319    type.  */
20320 #undef TARGET_ALIGN_ANON_BITFIELD
20321 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20322
20323 #undef TARGET_ASM_ALIGNED_DI_OP
20324 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20325
20326 #undef TARGET_ASM_ALIGNED_HI_OP
20327 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20328
20329 #undef TARGET_ASM_ALIGNED_SI_OP
20330 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20331
20332 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20333 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20334   hook_bool_const_tree_hwi_hwi_const_tree_true
20335
20336 #undef TARGET_ASM_FILE_START
20337 #define TARGET_ASM_FILE_START aarch64_start_file
20338
20339 #undef TARGET_ASM_OUTPUT_MI_THUNK
20340 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20341
20342 #undef TARGET_ASM_SELECT_RTX_SECTION
20343 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20344
20345 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20346 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20347
20348 #undef TARGET_BUILD_BUILTIN_VA_LIST
20349 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20350
20351 #undef TARGET_CALLEE_COPIES
20352 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20353
20354 #undef TARGET_CAN_ELIMINATE
20355 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20356
20357 #undef TARGET_CAN_INLINE_P
20358 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20359
20360 #undef TARGET_CANNOT_FORCE_CONST_MEM
20361 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20362
20363 #undef TARGET_CASE_VALUES_THRESHOLD
20364 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20365
20366 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20367 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20368
20369 /* Only the least significant bit is used for initialization guard
20370    variables.  */
20371 #undef TARGET_CXX_GUARD_MASK_BIT
20372 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20373
20374 #undef TARGET_C_MODE_FOR_SUFFIX
20375 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20376
20377 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20378 #undef  TARGET_DEFAULT_TARGET_FLAGS
20379 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20380 #endif
20381
20382 #undef TARGET_CLASS_MAX_NREGS
20383 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20384
20385 #undef TARGET_BUILTIN_DECL
20386 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20387
20388 #undef TARGET_BUILTIN_RECIPROCAL
20389 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20390
20391 #undef TARGET_C_EXCESS_PRECISION
20392 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20393
20394 #undef  TARGET_EXPAND_BUILTIN
20395 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20396
20397 #undef TARGET_EXPAND_BUILTIN_VA_START
20398 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20399
20400 #undef TARGET_FOLD_BUILTIN
20401 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20402
20403 #undef TARGET_FUNCTION_ARG
20404 #define TARGET_FUNCTION_ARG aarch64_function_arg
20405
20406 #undef TARGET_FUNCTION_ARG_ADVANCE
20407 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20408
20409 #undef TARGET_FUNCTION_ARG_BOUNDARY
20410 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20411
20412 #undef TARGET_FUNCTION_ARG_PADDING
20413 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20414
20415 #undef TARGET_GET_RAW_RESULT_MODE
20416 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20417 #undef TARGET_GET_RAW_ARG_MODE
20418 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20419
20420 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20421 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20422
20423 #undef TARGET_FUNCTION_VALUE
20424 #define TARGET_FUNCTION_VALUE aarch64_function_value
20425
20426 #undef TARGET_FUNCTION_VALUE_REGNO_P
20427 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20428
20429 #undef TARGET_GIMPLE_FOLD_BUILTIN
20430 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20431
20432 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20433 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20434
20435 #undef  TARGET_INIT_BUILTINS
20436 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
20437
20438 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20439 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20440   aarch64_ira_change_pseudo_allocno_class
20441
20442 #undef TARGET_LEGITIMATE_ADDRESS_P
20443 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20444
20445 #undef TARGET_LEGITIMATE_CONSTANT_P
20446 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20447
20448 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20449 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20450   aarch64_legitimize_address_displacement
20451
20452 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20453 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20454
20455 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20456 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20457 aarch64_libgcc_floating_mode_supported_p
20458
20459 #undef TARGET_MANGLE_TYPE
20460 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20461
20462 #undef TARGET_MEMORY_MOVE_COST
20463 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20464
20465 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20466 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20467
20468 #undef TARGET_MUST_PASS_IN_STACK
20469 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20470
20471 /* This target hook should return true if accesses to volatile bitfields
20472    should use the narrowest mode possible.  It should return false if these
20473    accesses should use the bitfield container type.  */
20474 #undef TARGET_NARROW_VOLATILE_BITFIELD
20475 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20476
20477 #undef  TARGET_OPTION_OVERRIDE
20478 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20479
20480 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20481 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20482   aarch64_override_options_after_change
20483
20484 #undef TARGET_OPTION_SAVE
20485 #define TARGET_OPTION_SAVE aarch64_option_save
20486
20487 #undef TARGET_OPTION_RESTORE
20488 #define TARGET_OPTION_RESTORE aarch64_option_restore
20489
20490 #undef TARGET_OPTION_PRINT
20491 #define TARGET_OPTION_PRINT aarch64_option_print
20492
20493 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20494 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20495
20496 #undef TARGET_SET_CURRENT_FUNCTION
20497 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20498
20499 #undef TARGET_PASS_BY_REFERENCE
20500 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20501
20502 #undef TARGET_PREFERRED_RELOAD_CLASS
20503 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20504
20505 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20506 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20507
20508 #undef TARGET_PROMOTED_TYPE
20509 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20510
20511 #undef TARGET_SECONDARY_RELOAD
20512 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20513
20514 #undef TARGET_SHIFT_TRUNCATION_MASK
20515 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20516
20517 #undef TARGET_SETUP_INCOMING_VARARGS
20518 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20519
20520 #undef TARGET_STRUCT_VALUE_RTX
20521 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
20522
20523 #undef TARGET_REGISTER_MOVE_COST
20524 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20525
20526 #undef TARGET_RETURN_IN_MEMORY
20527 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20528
20529 #undef TARGET_RETURN_IN_MSB
20530 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20531
20532 #undef TARGET_RTX_COSTS
20533 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20534
20535 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20536 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20537
20538 #undef TARGET_SCHED_ISSUE_RATE
20539 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20540
20541 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20542 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20543   aarch64_sched_first_cycle_multipass_dfa_lookahead
20544
20545 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20546 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20547   aarch64_first_cycle_multipass_dfa_lookahead_guard
20548
20549 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20550 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20551   aarch64_get_separate_components
20552
20553 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20554 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20555   aarch64_components_for_bb
20556
20557 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20558 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20559   aarch64_disqualify_components
20560
20561 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20562 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20563   aarch64_emit_prologue_components
20564
20565 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20566 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20567   aarch64_emit_epilogue_components
20568
20569 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20570 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20571   aarch64_set_handled_components
20572
20573 #undef TARGET_TRAMPOLINE_INIT
20574 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20575
20576 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20577 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20578
20579 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20580 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20581
20582 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20583 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20584   aarch64_builtin_support_vector_misalignment
20585
20586 #undef TARGET_ARRAY_MODE
20587 #define TARGET_ARRAY_MODE aarch64_array_mode
20588
20589 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20590 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20591
20592 #undef TARGET_VECTORIZE_ADD_STMT_COST
20593 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20594
20595 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20596 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20597   aarch64_builtin_vectorization_cost
20598
20599 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20600 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20601
20602 #undef TARGET_VECTORIZE_BUILTINS
20603 #define TARGET_VECTORIZE_BUILTINS
20604
20605 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20606 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20607   aarch64_builtin_vectorized_function
20608
20609 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20610 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20611   aarch64_autovectorize_vector_sizes
20612
20613 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20614 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20615   aarch64_atomic_assign_expand_fenv
20616
20617 /* Section anchor support.  */
20618
20619 #undef TARGET_MIN_ANCHOR_OFFSET
20620 #define TARGET_MIN_ANCHOR_OFFSET -256
20621
20622 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20623    byte offset; we can do much more for larger data types, but have no way
20624    to determine the size of the access.  We assume accesses are aligned.  */
20625 #undef TARGET_MAX_ANCHOR_OFFSET
20626 #define TARGET_MAX_ANCHOR_OFFSET 4095
20627
20628 #undef TARGET_VECTOR_ALIGNMENT
20629 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20630
20631 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20632 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20633   aarch64_vectorize_preferred_vector_alignment
20634 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20635 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20636   aarch64_simd_vector_alignment_reachable
20637
20638 /* vec_perm support.  */
20639
20640 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20641 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20642   aarch64_vectorize_vec_perm_const
20643
20644 #undef TARGET_VECTORIZE_GET_MASK_MODE
20645 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20646 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20647 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20648   aarch64_empty_mask_is_expensive
20649 #undef TARGET_PREFERRED_ELSE_VALUE
20650 #define TARGET_PREFERRED_ELSE_VALUE \
20651   aarch64_preferred_else_value
20652
20653 #undef TARGET_INIT_LIBFUNCS
20654 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20655
20656 #undef TARGET_FIXED_CONDITION_CODE_REGS
20657 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20658
20659 #undef TARGET_FLAGS_REGNUM
20660 #define TARGET_FLAGS_REGNUM CC_REGNUM
20661
20662 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20663 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20664
20665 #undef TARGET_ASAN_SHADOW_OFFSET
20666 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20667
20668 #undef TARGET_LEGITIMIZE_ADDRESS
20669 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20670
20671 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20672 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20673
20674 #undef TARGET_CAN_USE_DOLOOP_P
20675 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20676
20677 #undef TARGET_SCHED_ADJUST_PRIORITY
20678 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20679
20680 #undef TARGET_SCHED_MACRO_FUSION_P
20681 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20682
20683 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20684 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20685
20686 #undef TARGET_SCHED_FUSION_PRIORITY
20687 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20688
20689 #undef TARGET_UNSPEC_MAY_TRAP_P
20690 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20691
20692 #undef TARGET_USE_PSEUDO_PIC_REG
20693 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20694
20695 #undef TARGET_PRINT_OPERAND
20696 #define TARGET_PRINT_OPERAND aarch64_print_operand
20697
20698 #undef TARGET_PRINT_OPERAND_ADDRESS
20699 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20700
20701 #undef TARGET_OPTAB_SUPPORTED_P
20702 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20703
20704 #undef TARGET_OMIT_STRUCT_RETURN_REG
20705 #define TARGET_OMIT_STRUCT_RETURN_REG true
20706
20707 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20708 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20709   aarch64_dwarf_poly_indeterminate_value
20710
20711 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
20712 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20713 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20714
20715 #undef TARGET_HARD_REGNO_NREGS
20716 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20717 #undef TARGET_HARD_REGNO_MODE_OK
20718 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20719
20720 #undef TARGET_MODES_TIEABLE_P
20721 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20722
20723 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20724 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20725   aarch64_hard_regno_call_part_clobbered
20726
20727 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20728 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20729   aarch64_remove_extra_call_preserved_regs
20730
20731 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20732 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20733   aarch64_return_call_with_max_clobbers
20734
20735 #undef TARGET_CONSTANT_ALIGNMENT
20736 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20737
20738 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20739 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20740   aarch64_stack_clash_protection_alloca_probe_range
20741
20742 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20743 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20744
20745 #undef TARGET_CAN_CHANGE_MODE_CLASS
20746 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20747
20748 #undef TARGET_SELECT_EARLY_REMAT_MODES
20749 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20750
20751 #undef TARGET_SPECULATION_SAFE_VALUE
20752 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20753
20754 #undef TARGET_ESTIMATED_POLY_VALUE
20755 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20756
20757 #undef TARGET_ATTRIBUTE_TABLE
20758 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20759
20760 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20761 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20762   aarch64_simd_clone_compute_vecsize_and_simdlen
20763
20764 #undef TARGET_SIMD_CLONE_ADJUST
20765 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20766
20767 #undef TARGET_SIMD_CLONE_USABLE
20768 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20769
20770 #undef TARGET_COMP_TYPE_ATTRIBUTES
20771 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20772
20773 #undef TARGET_GET_MULTILIB_ABI_NAME
20774 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20775
20776 #if CHECKING_P
20777 #undef TARGET_RUN_TARGET_SELFTESTS
20778 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20779 #endif /* #if CHECKING_P */
20780
20781 #undef TARGET_ASM_POST_CFI_STARTPROC
20782 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20783
20784 struct gcc_target targetm = TARGET_INITIALIZER;
20785
20786 #include "gt-aarch64.h"