gcc/config/aarch64/aarch64.c (blob at commit 23f72160fbbfd34167f577409dad98214f357d57)
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
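/* Illustrative sketch (not part of the original source): both tables end with
   a null sentinel, so a parser can walk them with a simple loop.  The helper
   below is hypothetical; the real parsing is done by aarch64_parse_fuse_string
   and aarch64_parse_tune_string later in this file.

     static unsigned int
     example_lookup_flag (const struct aarch64_flag_desc *table,
                          const char *name)
     {
       for (; table->name != NULL; table++)
         if (strcmp (name, table->name) == 0)
           return table->flag;
       return 0;
     }

     // example_lookup_flag (aarch64_tuning_flags, "none")
     //   == AARCH64_EXTRA_TUNE_NONE  */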
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual costs: 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1220 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1227 {
1228 const char *const name;
1229 const unsigned long flags_on;
1230 const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
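/* Illustrative note (not part of the original source): the condition codes
   above are laid out in complementary pairs, so flipping the low bit yields
   the inverse condition, e.g.:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */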
1242
1243 struct aarch64_branch_protect_type
1244 {
1245 /* The type's name that the user passes to the branch-protection option
1246 string. */
1247 const char* name;
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding to this type and the
1250 second argument is the next token in the option string.
1251 Return values:
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and
1254 the caller should print an error.
1255 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1256 prints its own error. */
1257 enum aarch64_parse_opt_result (*handler)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type* subtypes;
1260 unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267 aarch64_enable_bti = 0;
1268 if (rest)
1269 {
1270 error ("unexpected %<%s%> after %<%s%>", rest, str);
1271 return AARCH64_PARSE_INVALID_FEATURE;
1272 }
1273 return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 aarch64_enable_bti = 1;
1282 if (rest)
1283 {
1284 error ("unexpected %<%s%> after %<%s%>", rest, str);
1285 return AARCH64_PARSE_INVALID_FEATURE;
1286 }
1287 return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292 char* rest ATTRIBUTE_UNUSED)
1293 {
1294 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295 aarch64_ra_sign_key = AARCH64_KEY_A;
1296 return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301 char* rest ATTRIBUTE_UNUSED)
1302 {
1303 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304 return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309 char* rest ATTRIBUTE_UNUSED)
1310 {
1311 aarch64_ra_sign_key = AARCH64_KEY_B;
1312 return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317 char* rest ATTRIBUTE_UNUSED)
1318 {
1319 aarch64_enable_bti = 1;
1320 return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326 { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335 { NULL, NULL, NULL, 0 }
1336 };
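/* Illustrative walk-through (not part of the original source): given
   -mbranch-protection=pac-ret+leaf+b-key, the parser would match "pac-ret"
   in aarch64_branch_protect_types (setting aarch64_ra_sign_scope to
   AARCH64_FUNCTION_NON_LEAF and the signing key to AARCH64_KEY_A) and then
   the "leaf" and "b-key" subtypes (widening the scope to AARCH64_FUNCTION_ALL
   and switching to AARCH64_KEY_B).  The token splitting itself is handled by
   aarch64_parse_branch_protection, declared earlier in this file.  */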
1337
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
1351
1352 /* Return the assembly token for svpattern value VALUE. */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357 switch (pattern)
1358 {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362 case AARCH64_NUM_SVPATTERNS:
1363 break;
1364 }
1365 gcc_unreachable ();
1366 }
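/* Illustrative examples (not part of the original source), assuming the
   usual AARCH64_FOR_SVPATTERN entries:

     svpattern_token (AARCH64_SV_ALL) == "all"
     svpattern_token (AARCH64_SV_VL8) == "vl8"  */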
1367
1368 /* Generate code to enable conditional branches in functions over 1 MiB. */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371 const char * branch_format)
1372 {
1373 rtx_code_label * tmp_label = gen_label_rtx ();
1374 char label_buf[256];
1375 char buffer[128];
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377 CODE_LABEL_NUMBER (tmp_label));
1378 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379 rtx dest_label = operands[pos_label];
1380 operands[pos_label] = tmp_label;
1381
1382 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383 output_asm_insn (buffer, operands);
1384
1385 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386 operands[pos_label] = dest_label;
1387 output_asm_insn (buffer, operands);
1388 return "";
1389 }
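/* Illustrative sketch (not part of the original source) of the sequence this
   emits when a caller passes an inverted conditional branch as BRANCH_FORMAT,
   e.g. "cbz\t%0, " in order to implement a far "cbnz":

        cbz     x0, .Ltmp       // short-range branch around...
        b       <far target>    // ...an unconditional branch that can reach
     .Ltmp:

   where .Ltmp stands for the internal label generated above.  */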
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394 if (TARGET_GENERAL_REGS_ONLY)
1395 if (FLOAT_MODE_P (mode))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1398 else
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1401 else
1402 if (FLOAT_MODE_P (mode))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1405 else
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428 reg_class_t best_class)
1429 {
1430 machine_mode mode;
1431
1432 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433 || !reg_class_subset_p (FP_REGS, allocno_class))
1434 return allocno_class;
1435
1436 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437 || !reg_class_subset_p (FP_REGS, best_class))
1438 return best_class;
1439
1440 mode = PSEUDO_REGNO_MODE (regno);
1441 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
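/* Worked example (not part of the original source): if both ALLOCNO_CLASS and
   BEST_CLASS are POINTER_AND_FP_REGS, a DFmode or V4SImode pseudo is narrowed
   to FP_REGS while a DImode pseudo is narrowed to GENERAL_REGS, so the
   allocator never costs the combined class.  */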
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447 if (GET_MODE_UNIT_SIZE (mode) == 4)
1448 return aarch64_tune_params.min_div_recip_mul_sf;
1449 return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456 if (VECTOR_MODE_P (mode))
1457 return aarch64_tune_params.vec_reassoc_width;
1458 if (INTEGRAL_MODE_P (mode))
1459 return aarch64_tune_params.int_reassoc_width;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462 return aarch64_tune_params.fp_reassoc_width;
1463 return 1;
1464 }
1465
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470 if (GP_REGNUM_P (regno))
1471 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472 else if (regno == SP_REGNUM)
1473 return AARCH64_DWARF_SP;
1474 else if (FP_REGNUM_P (regno))
1475 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476 else if (PR_REGNUM_P (regno))
1477 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478 else if (regno == VG_REGNUM)
1479 return AARCH64_DWARF_VG;
1480
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS;
1484 }
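/* Illustrative mappings (not part of the original source), assuming the
   DWARF numbering from the AArch64 ABI (R0..R30 -> 0..30, SP -> 31,
   V0..V31 -> 64..95):

     aarch64_dbx_register_number (R0_REGNUM + 5) == 5    // x5
     aarch64_dbx_register_number (V0_REGNUM + 3) == 67   // v3  */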
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491 if (CONST_DOUBLE_P (x))
1492 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493 return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500 return (TARGET_SIMD
1501 && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode. */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508 return (TARGET_SVE
1509 && (mode == VNx16BImode
1510 || mode == VNx8BImode
1511 || mode == VNx4BImode
1512 || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT = 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531 if (aarch64_advsimd_struct_mode_p (mode))
1532 return VEC_ADVSIMD | VEC_STRUCT;
1533
1534 if (aarch64_sve_pred_mode_p (mode))
1535 return VEC_SVE_PRED;
1536
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1540 switch (mode)
1541 {
1542 /* Single SVE vectors. */
1543 case E_VNx16QImode:
1544 case E_VNx8HImode:
1545 case E_VNx4SImode:
1546 case E_VNx2DImode:
1547 case E_VNx8HFmode:
1548 case E_VNx4SFmode:
1549 case E_VNx2DFmode:
1550 return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552 /* x2 SVE vectors. */
1553 case E_VNx32QImode:
1554 case E_VNx16HImode:
1555 case E_VNx8SImode:
1556 case E_VNx4DImode:
1557 case E_VNx16HFmode:
1558 case E_VNx8SFmode:
1559 case E_VNx4DFmode:
1560 /* x3 SVE vectors. */
1561 case E_VNx48QImode:
1562 case E_VNx24HImode:
1563 case E_VNx12SImode:
1564 case E_VNx6DImode:
1565 case E_VNx24HFmode:
1566 case E_VNx12SFmode:
1567 case E_VNx6DFmode:
1568 /* x4 SVE vectors. */
1569 case E_VNx64QImode:
1570 case E_VNx32HImode:
1571 case E_VNx16SImode:
1572 case E_VNx8DImode:
1573 case E_VNx32HFmode:
1574 case E_VNx16SFmode:
1575 case E_VNx8DFmode:
1576 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578 /* 64-bit Advanced SIMD vectors. */
1579 case E_V8QImode:
1580 case E_V4HImode:
1581 case E_V2SImode:
1582 /* ...E_V1DImode doesn't exist. */
1583 case E_V4HFmode:
1584 case E_V2SFmode:
1585 case E_V1DFmode:
1586 /* 128-bit Advanced SIMD vectors. */
1587 case E_V16QImode:
1588 case E_V8HImode:
1589 case E_V4SImode:
1590 case E_V2DImode:
1591 case E_V8HFmode:
1592 case E_V4SFmode:
1593 case E_V2DFmode:
1594 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596 default:
1597 return 0;
1598 }
1599 }
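/* Illustrative classifications (not part of the original source), assuming
   the corresponding target features are enabled:

     V2DImode    -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT   (Advanced SIMD structure)
     VNx4SImode  -> VEC_SVE_DATA               (single SVE vector)
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT  (x2 SVE tuple)
     VNx4BImode  -> VEC_SVE_PRED               (SVE predicate)
     DImode      -> 0                          (not a vector mode)  */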
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602 structure modes. */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610 vectors and structures. */
1611 bool
1612 aarch64_sve_mode_p (machine_mode mode)
1613 {
1614 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1615 }
1616
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618 or a structure of vectors. */
1619 static bool
1620 aarch64_sve_data_mode_p (machine_mode mode)
1621 {
1622 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1623 }
1624
1625 /* Implement target hook TARGET_ARRAY_MODE. */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1628 {
1629 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1630 && IN_RANGE (nelems, 2, 4))
1631 return mode_for_vector (GET_MODE_INNER (mode),
1632 GET_MODE_NUNITS (mode) * nelems);
1633
1634 return opt_machine_mode ();
1635 }
1636
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1638 static bool
1639 aarch64_array_mode_supported_p (machine_mode mode,
1640 unsigned HOST_WIDE_INT nelems)
1641 {
1642 if (TARGET_SIMD
1643 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1644 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1645 && (nelems >= 2 && nelems <= 4))
1646 return true;
1647
1648 return false;
1649 }
1650
1651 /* Return the SVE predicate mode to use for elements that have
1652 ELEM_NBYTES bytes, if such a mode exists. */
1653
1654 opt_machine_mode
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1656 {
1657 if (TARGET_SVE)
1658 {
1659 if (elem_nbytes == 1)
1660 return VNx16BImode;
1661 if (elem_nbytes == 2)
1662 return VNx8BImode;
1663 if (elem_nbytes == 4)
1664 return VNx4BImode;
1665 if (elem_nbytes == 8)
1666 return VNx2BImode;
1667 }
1668 return opt_machine_mode ();
1669 }
1670
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1672
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1675 {
1676 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1677 {
1678 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1679 machine_mode pred_mode;
1680 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1681 return pred_mode;
1682 }
1683
1684 return default_get_mask_mode (nunits, nbytes);
1685 }
1686
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1688
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1691 {
1692 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1693 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1694 machine_mode mode;
1695 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1696 if (inner_mode == GET_MODE_INNER (mode)
1697 && known_eq (nunits, GET_MODE_NUNITS (mode))
1698 && aarch64_sve_data_mode_p (mode))
1699 return mode;
1700 return opt_machine_mode ();
1701 }
1702
1703 /* Return the integer element mode associated with SVE mode MODE. */
1704
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode)
1707 {
1708 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1709 GET_MODE_NUNITS (mode));
1710 return int_mode_for_size (elt_bits, 0).require ();
1711 }
1712
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714 Unlike mode_for_int_vector, this can handle the case in which
1715 MODE is a predicate (and thus has a different total size). */
1716
1717 static machine_mode
1718 aarch64_sve_int_mode (machine_mode mode)
1719 {
1720 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1721 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1722 }
1723
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1725 prefer to use the first arithmetic operand as the else value if
1726 the else value doesn't matter, since that exactly matches the SVE
1727 destructive merging form. For ternary operations we could either
1728 pick the first operand and use FMAD-like instructions or the last
1729 operand and use FMLA-like instructions; the latter seems more
1730 natural. */
1731
1732 static tree
1733 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1734 {
1735 return nops == 3 ? ops[2] : ops[0];
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_NREGS. */
1739
1740 static unsigned int
1741 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1742 {
1743 /* ??? Logically we should only need to provide a value when
1744 HARD_REGNO_MODE_OK says that the combination is valid,
1745 but at the moment we need to handle all modes. Just ignore
1746 any runtime parts for registers that can't store them. */
1747 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1748 switch (aarch64_regno_regclass (regno))
1749 {
1750 case FP_REGS:
1751 case FP_LO_REGS:
1752 case FP_LO8_REGS:
1753 if (aarch64_sve_data_mode_p (mode))
1754 return exact_div (GET_MODE_SIZE (mode),
1755 BYTES_PER_SVE_VECTOR).to_constant ();
1756 return CEIL (lowest_size, UNITS_PER_VREG);
1757 case PR_REGS:
1758 case PR_LO_REGS:
1759 case PR_HI_REGS:
1760 return 1;
1761 default:
1762 return CEIL (lowest_size, UNITS_PER_WORD);
1763 }
1764 gcc_unreachable ();
1765 }
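/* Worked examples (not part of the original source), assuming 64-bit GP
   registers and 128-bit minimum vector registers:

     TImode in a GP register       -> 2 registers (16 bytes / UNITS_PER_WORD)
     V4SImode in an FP register    -> 1 register  (16 bytes / UNITS_PER_VREG)
     VNx16QImode in an FP register -> 1 register  (one SVE data vector)
     VNx4BImode in a P register    -> 1 register  */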
1766
1767 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1768
1769 static bool
1770 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1771 {
1772 if (GET_MODE_CLASS (mode) == MODE_CC)
1773 return regno == CC_REGNUM;
1774
1775 if (regno == VG_REGNUM)
1776 /* This must have the same size as _Unwind_Word. */
1777 return mode == DImode;
1778
1779 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1780 if (vec_flags & VEC_SVE_PRED)
1781 return PR_REGNUM_P (regno);
1782
1783 if (PR_REGNUM_P (regno))
1784 return 0;
1785
1786 if (regno == SP_REGNUM)
1787 /* The purpose of comparing with ptr_mode is to support the
1788 global register variable associated with the stack pointer
1789 register via the syntax of asm ("wsp") in ILP32. */
1790 return mode == Pmode || mode == ptr_mode;
1791
1792 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1793 return mode == Pmode;
1794
1795 if (GP_REGNUM_P (regno))
1796 {
1797 if (known_le (GET_MODE_SIZE (mode), 8))
1798 return true;
1799 else if (known_le (GET_MODE_SIZE (mode), 16))
1800 return (regno & 1) == 0;
1801 }
1802 else if (FP_REGNUM_P (regno))
1803 {
1804 if (vec_flags & VEC_STRUCT)
1805 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1806 else
1807 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1808 }
1809
1810 return false;
1811 }
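/* Worked examples (not part of the original source): CCmode is accepted only
   in CC_REGNUM; TImode is accepted in an even-numbered GP register such as x0
   (occupying the pair x0/x1) but not starting at x1; SVE predicate modes such
   as VNx16BImode are accepted only in the P registers.  */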
1812
1813 /* Return true if this is a definition of a vectorized simd function. */
1814
1815 static bool
1816 aarch64_simd_decl_p (tree fndecl)
1817 {
1818 tree fntype;
1819
1820 if (fndecl == NULL)
1821 return false;
1822 fntype = TREE_TYPE (fndecl);
1823 if (fntype == NULL)
1824 return false;
1825
1826 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1827 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1828 return true;
1829
1830 return false;
1831 }
1832
1833 /* Return the mode a register save/restore should use. DImode for integer
1834 registers, DFmode for FP registers in non-SIMD functions (they only save
1835 the bottom half of a 128-bit register), or TFmode for FP registers in
1836 SIMD functions. */
1837
1838 static machine_mode
1839 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1840 {
1841 return GP_REGNUM_P (regno)
1842 ? E_DImode
1843 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1844 }
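
/* For instance, under this scheme x19 is saved in DImode, v8 in an
   ordinary function is saved in DFmode (only d8 is preserved by the
   base PCS), and v8 in an aarch64_vector_pcs function is saved in
   TFmode so that the full 128-bit q8 is preserved.  (Illustrative
   register choices.)  */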
1845
1846 /* Return true if the instruction is a call to a SIMD function, false
1847 if it is not a SIMD function or if we do not know anything about
1848 the function. */
1849
1850 static bool
1851 aarch64_simd_call_p (rtx_insn *insn)
1852 {
1853 rtx symbol;
1854 rtx call;
1855 tree fndecl;
1856
1857 gcc_assert (CALL_P (insn));
1858 call = get_call_rtx_from (insn);
1859 symbol = XEXP (XEXP (call, 0), 0);
1860 if (GET_CODE (symbol) != SYMBOL_REF)
1861 return false;
1862 fndecl = SYMBOL_REF_DECL (symbol);
1863 if (!fndecl)
1864 return false;
1865
1866 return aarch64_simd_decl_p (fndecl);
1867 }
1868
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1870 a function that uses the SIMD ABI, take advantage of the extra
1871 call-preserved registers that the ABI provides. */
1872
1873 void
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1875 HARD_REG_SET *return_set)
1876 {
1877 if (aarch64_simd_call_p (insn))
1878 {
1879 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1880 if (FP_SIMD_SAVED_REGNUM_P (regno))
1881 CLEAR_HARD_REG_BIT (*return_set, regno);
1882 }
1883 }
1884
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1886 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1887 clobbers the top 64 bits when restoring the bottom 64 bits. */
1888
1889 static bool
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1891 machine_mode mode)
1892 {
1893 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1894 return FP_REGNUM_P (regno)
1895 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1896 }
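
/* For example (illustrative register and modes): a V4SImode value live
   in v8 across an ordinary call is part-clobbered, because only the low
   64 bits of v8 survive; across a call to an aarch64_vector_pcs
   function it is not, since such calls preserve the full 128 bits.  A
   DFmode value is never part-clobbered here, while an SVE mode such as
   VNx4SImode is part-clobbered even across SIMD calls, because its size
   may exceed 128 bits.  */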
1897
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1899
1900 rtx_insn *
1901 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1902 {
1903 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1904
1905 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1906 return call_1;
1907 else
1908 return call_2;
1909 }
1910
1911 /* Implement REGMODE_NATURAL_SIZE. */
1912 poly_uint64
1913 aarch64_regmode_natural_size (machine_mode mode)
1914 {
1915 /* The natural size for SVE data modes is one SVE data vector,
1916 and similarly for predicates. We can't independently modify
1917 anything smaller than that. */
1918 /* ??? For now, only do this for variable-width SVE registers.
1919 Doing it for constant-sized registers breaks lower-subreg.c. */
1920 /* ??? And once that's fixed, we should probably have similar
1921 code for Advanced SIMD. */
1922 if (!aarch64_sve_vg.is_constant ())
1923 {
1924 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1925 if (vec_flags & VEC_SVE_PRED)
1926 return BYTES_PER_SVE_PRED;
1927 if (vec_flags & VEC_SVE_DATA)
1928 return BYTES_PER_SVE_VECTOR;
1929 }
1930 return UNITS_PER_WORD;
1931 }
1932
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1934 machine_mode
1935 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1936 machine_mode mode)
1937 {
1938 /* The predicate mode determines which bits are significant and
1939 which are "don't care". Decreasing the number of lanes would
1940 lose data while increasing the number of lanes would make bits
1941 unnecessarily significant. */
1942 if (PR_REGNUM_P (regno))
1943 return mode;
1944 if (known_ge (GET_MODE_SIZE (mode), 4))
1945 return mode;
1946 else
1947 return SImode;
1948 }
1949
1950 /* Return true if I's bits are consecutive ones from the MSB. */
1951 bool
1952 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1953 {
1954 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1955 }
1956
1957 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1958 that strcpy from constants will be faster. */
1959
1960 static HOST_WIDE_INT
1961 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1962 {
1963 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1964 return MAX (align, BITS_PER_WORD);
1965 return align;
1966 }
1967
1968 /* Return true if calls to DECL should be treated as
1969 long-calls (i.e. called via a register). */
1970 static bool
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1972 {
1973 return false;
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977 long-calls (i.e. called via a register). */
1978 bool
1979 aarch64_is_long_call_p (rtx sym)
1980 {
1981 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1982 }
1983
1984 /* Return true if calls to symbol-ref SYM should not go through
1985 plt stubs. */
1986
1987 bool
1988 aarch64_is_noplt_call_p (rtx sym)
1989 {
1990 const_tree decl = SYMBOL_REF_DECL (sym);
1991
1992 if (flag_pic
1993 && decl
1994 && (!flag_plt
1995 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1996 && !targetm.binds_local_p (decl))
1997 return true;
1998
1999 return false;
2000 }
2001
2002 /* Return true if the offsets to a zero/sign-extract operation
2003 represent an expression that matches an extend operation. The
2004 operands represent the parameters from
2005
2006 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2007 bool
2008 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2009 rtx extract_imm)
2010 {
2011 HOST_WIDE_INT mult_val, extract_val;
2012
2013 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2014 return false;
2015
2016 mult_val = INTVAL (mult_imm);
2017 extract_val = INTVAL (extract_imm);
2018
2019 if (extract_val > 8
2020 && extract_val < GET_MODE_BITSIZE (mode)
2021 && exact_log2 (extract_val & ~7) > 0
2022 && (extract_val & 7) <= 4
2023 && mult_val == (1 << (extract_val & 7)))
2024 return true;
2025
2026 return false;
2027 }
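
/* A worked example of the test above (illustrative values): MODE == DImode,
   MULT_IMM == 4 and EXTRACT_IMM == 34 satisfy every condition (34 > 8,
   34 < 64, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4 and
   4 == 1 << 2), so the extract describes "extend the low 32 bits and
   shift left by 2", matching an extended-register operand such as
   UXTW #2 (or SXTW #2 for a sign_extract).  */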
2028
2029 /* Emit an insn that's a simple single-set. Both the operands must be
2030 known to be valid. */
2031 inline static rtx_insn *
2032 emit_set_insn (rtx x, rtx y)
2033 {
2034 return emit_insn (gen_rtx_SET (x, y));
2035 }
2036
2037 /* X and Y are two things to compare using CODE. Emit the compare insn and
2038 return the rtx for the CC register in the proper mode. */
2039 rtx
2040 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2041 {
2042 machine_mode mode = SELECT_CC_MODE (code, x, y);
2043 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2044
2045 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2046 return cc_reg;
2047 }
2048
2049 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2050
2051 static rtx
2052 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2053 machine_mode y_mode)
2054 {
2055 if (y_mode == E_QImode || y_mode == E_HImode)
2056 {
2057 if (CONST_INT_P (y))
2058 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2059 else
2060 {
2061 rtx t, cc_reg;
2062 machine_mode cc_mode;
2063
2064 t = gen_rtx_ZERO_EXTEND (SImode, y);
2065 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2066 cc_mode = CC_SWPmode;
2067 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2068 emit_set_insn (cc_reg, t);
2069 return cc_reg;
2070 }
2071 }
2072
2073 if (!aarch64_plus_operand (y, y_mode))
2074 y = force_reg (y_mode, y);
2075
2076 return aarch64_gen_compare_reg (code, x, y);
2077 }
2078
2079 /* Build the SYMBOL_REF for __tls_get_addr. */
2080
2081 static GTY(()) rtx tls_get_addr_libfunc;
2082
2083 rtx
2084 aarch64_tls_get_addr (void)
2085 {
2086 if (!tls_get_addr_libfunc)
2087 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2088 return tls_get_addr_libfunc;
2089 }
2090
2091 /* Return the TLS model to use for ADDR. */
2092
2093 static enum tls_model
2094 tls_symbolic_operand_type (rtx addr)
2095 {
2096 enum tls_model tls_kind = TLS_MODEL_NONE;
2097 if (GET_CODE (addr) == CONST)
2098 {
2099 poly_int64 addend;
2100 rtx sym = strip_offset (addr, &addend);
2101 if (GET_CODE (sym) == SYMBOL_REF)
2102 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2103 }
2104 else if (GET_CODE (addr) == SYMBOL_REF)
2105 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2106
2107 return tls_kind;
2108 }
2109
2110 /* We allow LO_SUMs in our legitimate addresses, so that combine can
2111 take care of combining addresses where necessary; but for generation
2112 purposes, we generate the address
2113 as:
2114 RTL Absolute
2115 tmp = hi (symbol_ref); adrp x1, foo
2116 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2117 nop
2118
2119 PIC TLS
2120 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2121 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2122 bl __tls_get_addr
2123 nop
2124
2125 Load TLS symbol, depending on TLS mechanism and TLS access model.
2126
2127 Global Dynamic - Traditional TLS:
2128 adrp tmp, :tlsgd:imm
2129 add dest, tmp, #:tlsgd_lo12:imm
2130 bl __tls_get_addr
2131
2132 Global Dynamic - TLS Descriptors:
2133 adrp dest, :tlsdesc:imm
2134 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2135 add dest, dest, #:tlsdesc_lo12:imm
2136 blr tmp
2137 mrs tp, tpidr_el0
2138 add dest, dest, tp
2139
2140 Initial Exec:
2141 mrs tp, tpidr_el0
2142 adrp tmp, :gottprel:imm
2143 ldr dest, [tmp, #:gottprel_lo12:imm]
2144 add dest, dest, tp
2145
2146 Local Exec:
2147 mrs tp, tpidr_el0
2148 add t0, tp, #:tprel_hi12:imm, lsl #12
2149 add t0, t0, #:tprel_lo12_nc:imm
2150 */
2151
2152 static void
2153 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2154 enum aarch64_symbol_type type)
2155 {
2156 switch (type)
2157 {
2158 case SYMBOL_SMALL_ABSOLUTE:
2159 {
2160 /* In ILP32, the mode of dest can be either SImode or DImode. */
2161 rtx tmp_reg = dest;
2162 machine_mode mode = GET_MODE (dest);
2163
2164 gcc_assert (mode == Pmode || mode == ptr_mode);
2165
2166 if (can_create_pseudo_p ())
2167 tmp_reg = gen_reg_rtx (mode);
2168
2169 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2170 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2171 return;
2172 }
2173
2174 case SYMBOL_TINY_ABSOLUTE:
2175 emit_insn (gen_rtx_SET (dest, imm));
2176 return;
2177
2178 case SYMBOL_SMALL_GOT_28K:
2179 {
2180 machine_mode mode = GET_MODE (dest);
2181 rtx gp_rtx = pic_offset_table_rtx;
2182 rtx insn;
2183 rtx mem;
2184
2185 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2186 here before RTL expansion. The tree IVOPTS pass will generate RTL
2187 patterns to decide rtx costs, in which case pic_offset_table_rtx is
2188 not initialized. In that case there is no need to generate the first
2189 adrp instruction, as the final cost for a global variable access is
2190 one instruction. */
2191 if (gp_rtx != NULL)
2192 {
2193 /* -fpic with -mcmodel=small allows a 32K GOT table size (but because
2194 we use the page base as the GOT base, the first page may be wasted;
2195 in the worst case only 28K of space is left for the GOT).
2196
2197 The generated instruction sequence for accessing a global variable
2198 is:
2199
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2201
2202 Only one instruction is needed, but we must initialize
2203 pic_offset_table_rtx properly. We generate an initialization insn
2204 for every global access and allow CSE to remove the redundant ones.
2205
2206 The final instruction sequence will look like the following
2207 for multiple global variable accesses.
2208
2209 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2210
2211 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2212 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2213 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2214 ... */
2215
2216 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2217 crtl->uses_pic_offset_table = 1;
2218 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2219
2220 if (mode != GET_MODE (gp_rtx))
2221 gp_rtx = gen_lowpart (mode, gp_rtx);
2222
2223 }
2224
2225 if (mode == ptr_mode)
2226 {
2227 if (mode == DImode)
2228 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2229 else
2230 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2231
2232 mem = XVECEXP (SET_SRC (insn), 0, 0);
2233 }
2234 else
2235 {
2236 gcc_assert (mode == Pmode);
2237
2238 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2239 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2240 }
2241
2242 /* The operand is expected to be a MEM. Whenever the related insn
2243 pattern changes, the code above that computes MEM should be
2244 updated. */
2245 gcc_assert (GET_CODE (mem) == MEM);
2246 MEM_READONLY_P (mem) = 1;
2247 MEM_NOTRAP_P (mem) = 1;
2248 emit_insn (insn);
2249 return;
2250 }
2251
2252 case SYMBOL_SMALL_GOT_4G:
2253 {
2254 /* In ILP32, the mode of dest can be either SImode or DImode,
2255 while the got entry is always of SImode size. The mode of
2256 dest depends on how dest is used: if dest is assigned to a
2257 pointer (e.g. in the memory), it has SImode; it may have
2258 DImode if dest is dereferenced to access the memory.
2259 This is why we have to handle three different ldr_got_small
2260 patterns here (two patterns for ILP32). */
2261
2262 rtx insn;
2263 rtx mem;
2264 rtx tmp_reg = dest;
2265 machine_mode mode = GET_MODE (dest);
2266
2267 if (can_create_pseudo_p ())
2268 tmp_reg = gen_reg_rtx (mode);
2269
2270 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2271 if (mode == ptr_mode)
2272 {
2273 if (mode == DImode)
2274 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2275 else
2276 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2277
2278 mem = XVECEXP (SET_SRC (insn), 0, 0);
2279 }
2280 else
2281 {
2282 gcc_assert (mode == Pmode);
2283
2284 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2285 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2286 }
2287
2288 gcc_assert (GET_CODE (mem) == MEM);
2289 MEM_READONLY_P (mem) = 1;
2290 MEM_NOTRAP_P (mem) = 1;
2291 emit_insn (insn);
2292 return;
2293 }
2294
2295 case SYMBOL_SMALL_TLSGD:
2296 {
2297 rtx_insn *insns;
2298 machine_mode mode = GET_MODE (dest);
2299 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2300
2301 start_sequence ();
2302 if (TARGET_ILP32)
2303 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2304 else
2305 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2306 insns = get_insns ();
2307 end_sequence ();
2308
2309 RTL_CONST_CALL_P (insns) = 1;
2310 emit_libcall_block (insns, dest, result, imm);
2311 return;
2312 }
2313
2314 case SYMBOL_SMALL_TLSDESC:
2315 {
2316 machine_mode mode = GET_MODE (dest);
2317 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2318 rtx tp;
2319
2320 gcc_assert (mode == Pmode || mode == ptr_mode);
2321
2322 /* In ILP32, the got entry is always of SImode size. Unlike
2323 small GOT, the dest is fixed at reg 0. */
2324 if (TARGET_ILP32)
2325 emit_insn (gen_tlsdesc_small_si (imm));
2326 else
2327 emit_insn (gen_tlsdesc_small_di (imm));
2328 tp = aarch64_load_tp (NULL);
2329
2330 if (mode != Pmode)
2331 tp = gen_lowpart (mode, tp);
2332
2333 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2334 if (REG_P (dest))
2335 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2336 return;
2337 }
2338
2339 case SYMBOL_SMALL_TLSIE:
2340 {
2341 /* In ILP32, the mode of dest can be either SImode or DImode,
2342 while the got entry is always of SImode size. The mode of
2343 dest depends on how dest is used: if dest is assigned to a
2344 pointer (e.g. in the memory), it has SImode; it may have
2345 DImode if dest is dereferenced to access the memory.
2346 This is why we have to handle three different tlsie_small
2347 patterns here (two patterns for ILP32). */
2348 machine_mode mode = GET_MODE (dest);
2349 rtx tmp_reg = gen_reg_rtx (mode);
2350 rtx tp = aarch64_load_tp (NULL);
2351
2352 if (mode == ptr_mode)
2353 {
2354 if (mode == DImode)
2355 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2356 else
2357 {
2358 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2359 tp = gen_lowpart (mode, tp);
2360 }
2361 }
2362 else
2363 {
2364 gcc_assert (mode == Pmode);
2365 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2366 }
2367
2368 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2369 if (REG_P (dest))
2370 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2371 return;
2372 }
2373
2374 case SYMBOL_TLSLE12:
2375 case SYMBOL_TLSLE24:
2376 case SYMBOL_TLSLE32:
2377 case SYMBOL_TLSLE48:
2378 {
2379 machine_mode mode = GET_MODE (dest);
2380 rtx tp = aarch64_load_tp (NULL);
2381
2382 if (mode != Pmode)
2383 tp = gen_lowpart (mode, tp);
2384
2385 switch (type)
2386 {
2387 case SYMBOL_TLSLE12:
2388 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2389 (dest, tp, imm));
2390 break;
2391 case SYMBOL_TLSLE24:
2392 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2393 (dest, tp, imm));
2394 break;
2395 case SYMBOL_TLSLE32:
2396 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2397 (dest, imm));
2398 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2399 (dest, dest, tp));
2400 break;
2401 case SYMBOL_TLSLE48:
2402 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2403 (dest, imm));
2404 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2405 (dest, dest, tp));
2406 break;
2407 default:
2408 gcc_unreachable ();
2409 }
2410
2411 if (REG_P (dest))
2412 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2413 return;
2414 }
2415
2416 case SYMBOL_TINY_GOT:
2417 emit_insn (gen_ldr_got_tiny (dest, imm));
2418 return;
2419
2420 case SYMBOL_TINY_TLSIE:
2421 {
2422 machine_mode mode = GET_MODE (dest);
2423 rtx tp = aarch64_load_tp (NULL);
2424
2425 if (mode == ptr_mode)
2426 {
2427 if (mode == DImode)
2428 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2429 else
2430 {
2431 tp = gen_lowpart (mode, tp);
2432 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2433 }
2434 }
2435 else
2436 {
2437 gcc_assert (mode == Pmode);
2438 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2439 }
2440
2441 if (REG_P (dest))
2442 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2443 return;
2444 }
2445
2446 default:
2447 gcc_unreachable ();
2448 }
2449 }
2450
2451 /* Emit a move from SRC to DEST. Assume that the move expanders can
2452 handle all moves if !can_create_pseudo_p (). The distinction is
2453 important because, unlike emit_move_insn, the move expanders know
2454 how to force Pmode objects into the constant pool even when the
2455 constant pool address is not itself legitimate. */
2456 static rtx
2457 aarch64_emit_move (rtx dest, rtx src)
2458 {
2459 return (can_create_pseudo_p ()
2460 ? emit_move_insn (dest, src)
2461 : emit_move_insn_1 (dest, src));
2462 }
2463
2464 /* Apply UNOPTAB to OP and store the result in DEST. */
2465
2466 static void
2467 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2468 {
2469 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2470 if (dest != tmp)
2471 emit_move_insn (dest, tmp);
2472 }
2473
2474 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2475
2476 static void
2477 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2478 {
2479 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2480 OPTAB_DIRECT);
2481 if (dest != tmp)
2482 emit_move_insn (dest, tmp);
2483 }
2484
2485 /* Split a 128-bit move operation into two 64-bit move operations,
2486 taking care to handle partial overlap of register to register
2487 copies. Special cases are needed when moving between GP regs and
2488 FP regs. SRC can be a register, constant or memory; DST a register
2489 or memory. If either operand is memory it must not have any side
2490 effects. */
2491 void
2492 aarch64_split_128bit_move (rtx dst, rtx src)
2493 {
2494 rtx dst_lo, dst_hi;
2495 rtx src_lo, src_hi;
2496
2497 machine_mode mode = GET_MODE (dst);
2498
2499 gcc_assert (mode == TImode || mode == TFmode);
2500 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2501 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2502
2503 if (REG_P (dst) && REG_P (src))
2504 {
2505 int src_regno = REGNO (src);
2506 int dst_regno = REGNO (dst);
2507
2508 /* Handle FP <-> GP regs. */
2509 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2510 {
2511 src_lo = gen_lowpart (word_mode, src);
2512 src_hi = gen_highpart (word_mode, src);
2513
2514 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2515 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2516 return;
2517 }
2518 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2519 {
2520 dst_lo = gen_lowpart (word_mode, dst);
2521 dst_hi = gen_highpart (word_mode, dst);
2522
2523 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2524 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2525 return;
2526 }
2527 }
2528
2529 dst_lo = gen_lowpart (word_mode, dst);
2530 dst_hi = gen_highpart (word_mode, dst);
2531 src_lo = gen_lowpart (word_mode, src);
2532 src_hi = gen_highpart_mode (word_mode, mode, src);
2533
2534 /* At most one pairing may overlap. */
2535 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2536 {
2537 aarch64_emit_move (dst_hi, src_hi);
2538 aarch64_emit_move (dst_lo, src_lo);
2539 }
2540 else
2541 {
2542 aarch64_emit_move (dst_lo, src_lo);
2543 aarch64_emit_move (dst_hi, src_hi);
2544 }
2545 }
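
/* To illustrate the overlap handling above with assumed registers:
   copying a TImode value from (x0, x1) to (x1, x2) has dst_lo == src_hi
   == x1, so the high halves are moved first (x2 = x1, then x1 = x0);
   copying from (x1, x2) to (x0, x1) has no such overlap, so the low
   halves are moved first.  */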
2546
2547 bool
2548 aarch64_split_128bit_move_p (rtx dst, rtx src)
2549 {
2550 return (! REG_P (src)
2551 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2552 }
2553
2554 /* Split a complex SIMD combine. */
2555
2556 void
2557 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2558 {
2559 machine_mode src_mode = GET_MODE (src1);
2560 machine_mode dst_mode = GET_MODE (dst);
2561
2562 gcc_assert (VECTOR_MODE_P (dst_mode));
2563 gcc_assert (register_operand (dst, dst_mode)
2564 && register_operand (src1, src_mode)
2565 && register_operand (src2, src_mode));
2566
2567 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2568 return;
2569 }
2570
2571 /* Split a complex SIMD move. */
2572
2573 void
2574 aarch64_split_simd_move (rtx dst, rtx src)
2575 {
2576 machine_mode src_mode = GET_MODE (src);
2577 machine_mode dst_mode = GET_MODE (dst);
2578
2579 gcc_assert (VECTOR_MODE_P (dst_mode));
2580
2581 if (REG_P (dst) && REG_P (src))
2582 {
2583 gcc_assert (VECTOR_MODE_P (src_mode));
2584 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2585 }
2586 }
2587
2588 bool
2589 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2590 machine_mode ymode, rtx y)
2591 {
2592 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2593 gcc_assert (r != NULL);
2594 return rtx_equal_p (x, r);
2595 }
2596
2597
2598 /* Return TARGET if it is nonnull and a register of mode MODE.
2599 Otherwise, return a fresh register of mode MODE if we can,
2600 or TARGET reinterpreted as MODE if we can't. */
2601
2602 static rtx
2603 aarch64_target_reg (rtx target, machine_mode mode)
2604 {
2605 if (target && REG_P (target) && GET_MODE (target) == mode)
2606 return target;
2607 if (!can_create_pseudo_p ())
2608 {
2609 gcc_assert (target);
2610 return gen_lowpart (mode, target);
2611 }
2612 return gen_reg_rtx (mode);
2613 }
2614
2615 /* Return a register that contains the constant in BUILDER, given that
2616 the constant is a legitimate move operand. Use TARGET as the register
2617 if it is nonnull and convenient. */
2618
2619 static rtx
2620 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2621 {
2622 rtx src = builder.build ();
2623 target = aarch64_target_reg (target, GET_MODE (src));
2624 emit_insn (gen_rtx_SET (target, src));
2625 return target;
2626 }
2627
2628 static rtx
2629 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2630 {
2631 if (can_create_pseudo_p ())
2632 return force_reg (mode, value);
2633 else
2634 {
2635 gcc_assert (x);
2636 aarch64_emit_move (x, value);
2637 return x;
2638 }
2639 }
2640
2641 /* Return true if predicate value X is a constant in which every element
2642 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2643 value, i.e. as a predicate in which all bits are significant. */
2644
2645 static bool
2646 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2647 {
2648 if (GET_CODE (x) != CONST_VECTOR)
2649 return false;
2650
2651 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2652 GET_MODE_NUNITS (GET_MODE (x)));
2653 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2654 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2655 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2656
2657 unsigned int nelts = const_vector_encoded_nelts (x);
2658 for (unsigned int i = 0; i < nelts; ++i)
2659 {
2660 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2661 if (!CONST_INT_P (elt))
2662 return false;
2663
2664 builder.quick_push (elt);
2665 for (unsigned int j = 1; j < factor; ++j)
2666 builder.quick_push (const0_rtx);
2667 }
2668 builder.finalize ();
2669 return true;
2670 }
2671
2672 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2673 widest predicate element size it can have (that is, the largest size
2674 for which each element would still be 0 or 1). */
2675
2676 unsigned int
2677 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2678 {
2679 /* Start with the most optimistic assumption: that we only need
2680 one bit per pattern. This is what we will use if only the first
2681 bit in each pattern is ever set. */
2682 unsigned int mask = GET_MODE_SIZE (DImode);
2683 mask |= builder.npatterns ();
2684
2685 /* Look for set bits. */
2686 unsigned int nelts = builder.encoded_nelts ();
2687 for (unsigned int i = 1; i < nelts; ++i)
2688 if (INTVAL (builder.elt (i)) != 0)
2689 {
2690 if (i & 1)
2691 return 1;
2692 mask |= i;
2693 }
2694 return mask & -mask;
2695 }
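
/* For example (assumed encoding): a constant built from four patterns
   whose encoded elements are { 1, 0, 0, 0 } never sets a bit at an odd
   index, so the loop leaves mask == 8 | 4 == 12 and the result is
   mask & -mask == 4, i.e. the predicate can be viewed as having 4-byte
   (.s) elements.  If index 2 were also set, mask would become 14 and
   the widest element size would be 2 bytes.  */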
2696
2697 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2698 that the constant would have with predicate element size ELT_SIZE
2699 (ignoring the upper bits in each element) and return:
2700
2701 * -1 if all bits are set
2702 * N if the predicate has N leading set bits followed by all clear bits
2703 * 0 if the predicate does not have any of these forms. */
2704
2705 int
2706 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2707 unsigned int elt_size)
2708 {
2709 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2710 followed by set bits. */
2711 if (builder.nelts_per_pattern () == 3)
2712 return 0;
2713
2714 /* Skip over leading set bits. */
2715 unsigned int nelts = builder.encoded_nelts ();
2716 unsigned int i = 0;
2717 for (; i < nelts; i += elt_size)
2718 if (INTVAL (builder.elt (i)) == 0)
2719 break;
2720 unsigned int vl = i / elt_size;
2721
2722 /* Check for the all-true case. */
2723 if (i == nelts)
2724 return -1;
2725
2726 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2727 repeating pattern of set bits followed by clear bits. */
2728 if (builder.nelts_per_pattern () != 2)
2729 return 0;
2730
2731 /* We have a "foreground" value and a duplicated "background" value.
2732 If the background might repeat and the last set bit belongs to it,
2733 we might have set bits followed by clear bits followed by set bits. */
2734 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2735 return 0;
2736
2737 /* Make sure that the rest are all clear. */
2738 for (; i < nelts; i += elt_size)
2739 if (INTVAL (builder.elt (i)) != 0)
2740 return 0;
2741
2742 return vl;
2743 }
2744
2745 /* See if there is an svpattern that encodes an SVE predicate of mode
2746 PRED_MODE in which the first VL bits are set and the rest are clear.
2747 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2748 A VL of -1 indicates an all-true vector. */
2749
2750 aarch64_svpattern
2751 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2752 {
2753 if (vl < 0)
2754 return AARCH64_SV_ALL;
2755
2756 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2757 return AARCH64_NUM_SVPATTERNS;
2758
2759 if (vl >= 1 && vl <= 8)
2760 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2761
2762 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2763 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2764
2765 int max_vl;
2766 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2767 {
2768 if (vl == (max_vl / 3) * 3)
2769 return AARCH64_SV_MUL3;
2770 /* These would only trigger for non-power-of-2 lengths. */
2771 if (vl == (max_vl & -4))
2772 return AARCH64_SV_MUL4;
2773 if (vl == (1 << floor_log2 (max_vl)))
2774 return AARCH64_SV_POW2;
2775 if (vl == max_vl)
2776 return AARCH64_SV_ALL;
2777 }
2778 return AARCH64_NUM_SVPATTERNS;
2779 }
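
/* For instance, with PRED_MODE == VNx16BImode a VL of 7 maps to
   AARCH64_SV_VL7, a VL of 16 maps to AARCH64_SV_VL16 and -1 maps to
   AARCH64_SV_ALL.  A VL of 32 is accepted only if the mode is known to
   have at least 32 elements; otherwise, and for VLs that match none of
   the patterns, the result is AARCH64_NUM_SVPATTERNS.  */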
2780
2781 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2782 bits has the lowest bit set and the upper bits clear. This is the
2783 VNx16BImode equivalent of a PTRUE for controlling elements of
2784 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2785 all bits are significant, even the upper zeros. */
2786
2787 rtx
2788 aarch64_ptrue_all (unsigned int elt_size)
2789 {
2790 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2791 builder.quick_push (const1_rtx);
2792 for (unsigned int i = 1; i < elt_size; ++i)
2793 builder.quick_push (const0_rtx);
2794 return builder.build ();
2795 }
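
/* For example, aarch64_ptrue_all (2) builds the VNx16BImode constant
   { 1, 0, 1, 0, ... }, which as a predicate controls .h elements, while
   aarch64_ptrue_all (4) builds { 1, 0, 0, 0, 1, 0, 0, 0, ... }, which
   controls .s elements.  */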
2796
2797 /* Return an all-true predicate register of mode MODE. */
2798
2799 rtx
2800 aarch64_ptrue_reg (machine_mode mode)
2801 {
2802 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2803 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2804 return gen_lowpart (mode, reg);
2805 }
2806
2807 /* Return an all-false predicate register of mode MODE. */
2808
2809 rtx
2810 aarch64_pfalse_reg (machine_mode mode)
2811 {
2812 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2813 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2814 return gen_lowpart (mode, reg);
2815 }
2816
2817 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2818 true, or alternatively if we know that the operation predicated by
2819 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2820 aarch64_sve_gp_strictness operand that describes the operation
2821 predicated by PRED1[0]. */
2822
2823 bool
2824 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2825 {
2826 machine_mode mode = GET_MODE (pred2);
2827 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2828 && mode == GET_MODE (pred1[0])
2829 && aarch64_sve_gp_strictness (pred1[1], SImode));
2830 return (pred1[0] == CONSTM1_RTX (mode)
2831 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2832 || rtx_equal_p (pred1[0], pred2));
2833 }
2834
2835 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2836 for it. PRED2[0] is the predicate for the instruction whose result
2837 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2838 for it. Return true if we can prove that the two predicates are
2839 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2840 with PRED1[0] without changing behavior. */
2841
2842 bool
2843 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2844 {
2845 machine_mode mode = GET_MODE (pred1[0]);
2846 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2847 && mode == GET_MODE (pred2[0])
2848 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2849 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2850
2851 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2852 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2853 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2854 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2855 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2856 }
2857
2858 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
2859 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2860 Use TARGET as the target register if nonnull and convenient. */
2861
2862 static rtx
2863 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2864 machine_mode data_mode, rtx op1, rtx op2)
2865 {
2866 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2867 expand_operand ops[5];
2868 create_output_operand (&ops[0], target, pred_mode);
2869 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2870 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2871 create_input_operand (&ops[3], op1, data_mode);
2872 create_input_operand (&ops[4], op2, data_mode);
2873 expand_insn (icode, 5, ops);
2874 return ops[0].value;
2875 }
2876
2877 /* Use a comparison to convert integer vector SRC into MODE, which is
2878 the corresponding SVE predicate mode. Use TARGET for the result
2879 if it's nonnull and convenient. */
2880
2881 static rtx
2882 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2883 {
2884 machine_mode src_mode = GET_MODE (src);
2885 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2886 src, CONST0_RTX (src_mode));
2887 }
2888
2889 /* Return true if we can move VALUE into a register using a single
2890 CNT[BHWD] instruction. */
2891
2892 static bool
2893 aarch64_sve_cnt_immediate_p (poly_int64 value)
2894 {
2895 HOST_WIDE_INT factor = value.coeffs[0];
2896 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2897 return (value.coeffs[1] == factor
2898 && IN_RANGE (factor, 2, 16 * 16)
2899 && (factor & 1) == 0
2900 && factor <= 16 * (factor & -factor));
2901 }
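
/* To illustrate the test above: an SVE vector holds (16, 16) bytes, so a
   value of (32, 32) is accepted (factor 32 == 16 * 2, i.e. CNTB with
   MUL #2), whereas (34, 34) is rejected, since 34 == 2 * 17 would need a
   multiplier of 17 on CNTD, outside the [1, 16] range.  */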
2902
2903 /* Likewise for rtx X. */
2904
2905 bool
2906 aarch64_sve_cnt_immediate_p (rtx x)
2907 {
2908 poly_int64 value;
2909 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2910 }
2911
2912 /* Return the asm string for an instruction with a CNT-like vector size
2913 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2914 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2915 first part of the operands template (the part that comes before the
2916 vector size itself). PATTERN is the pattern to use. FACTOR is the
2917 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2918 in each quadword. If it is zero, we can use any element size. */
2919
2920 static char *
2921 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2922 aarch64_svpattern pattern,
2923 unsigned int factor,
2924 unsigned int nelts_per_vq)
2925 {
2926 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2927
2928 if (nelts_per_vq == 0)
2929 /* There is some overlap in the ranges of the four CNT instructions.
2930 Here we always use the smallest possible element size, so that the
2931 multiplier is 1 wherever possible. */
2932 nelts_per_vq = factor & -factor;
2933 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2934 gcc_assert (IN_RANGE (shift, 1, 4));
2935 char suffix = "dwhb"[shift - 1];
2936
2937 factor >>= shift;
2938 unsigned int written;
2939 if (pattern == AARCH64_SV_ALL && factor == 1)
2940 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2941 prefix, suffix, operands);
2942 else if (factor == 1)
2943 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2944 prefix, suffix, operands, svpattern_token (pattern));
2945 else
2946 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2947 prefix, suffix, operands, svpattern_token (pattern),
2948 factor);
2949 gcc_assert (written < sizeof (buffer));
2950 return buffer;
2951 }
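
/* Example outputs of the above (assuming svpattern_token (AARCH64_SV_ALL)
   is "all"): with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 2 prints
   as "cntd\t%x0", while a FACTOR of 32 prints as
   "cntb\t%x0, all, mul #2", using the byte form so that the multiplier
   stays within range.  */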
2952
2953 /* Return the asm string for an instruction with a CNT-like vector size
2954 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2955 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2956 first part of the operands template (the part that comes before the
2957 vector size itself). X is the value of the vector size operand,
2958 as a polynomial integer rtx; we need to convert this into an "all"
2959 pattern with a multiplier. */
2960
2961 char *
2962 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2963 rtx x)
2964 {
2965 poly_int64 value = rtx_to_poly_int64 (x);
2966 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2967 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2968 value.coeffs[1], 0);
2969 }
2970
2971 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2972
2973 bool
2974 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2975 {
2976 poly_int64 value;
2977 return (poly_int_rtx_p (x, &value)
2978 && (aarch64_sve_cnt_immediate_p (value)
2979 || aarch64_sve_cnt_immediate_p (-value)));
2980 }
2981
2982 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
2983 operand 0. */
2984
2985 char *
2986 aarch64_output_sve_scalar_inc_dec (rtx offset)
2987 {
2988 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2989 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
2990 if (offset_value.coeffs[1] > 0)
2991 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
2992 offset_value.coeffs[1], 0);
2993 else
2994 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
2995 -offset_value.coeffs[1], 0);
2996 }
2997
2998 /* Return true if we can add VALUE to a register using a single ADDVL
2999 or ADDPL instruction. */
3000
3001 static bool
3002 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3003 {
3004 HOST_WIDE_INT factor = value.coeffs[0];
3005 if (factor == 0 || value.coeffs[1] != factor)
3006 return false;
3007 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3008 and a value of 16 is one vector width. */
3009 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3010 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3011 }
3012
3013 /* Likewise for rtx X. */
3014
3015 bool
3016 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3017 {
3018 poly_int64 value;
3019 return (poly_int_rtx_p (x, &value)
3020 && aarch64_sve_addvl_addpl_immediate_p (value));
3021 }
3022
3023 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3024 to operand 1 and storing the result in operand 0. */
3025
3026 char *
3027 aarch64_output_sve_addvl_addpl (rtx offset)
3028 {
3029 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3030 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3031 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3032
3033 int factor = offset_value.coeffs[1];
3034 if ((factor & 15) == 0)
3035 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3036 else
3037 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3038 return buffer;
3039 }
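
/* For example, an OFFSET of (16, 16) (one full vector) prints as
   "addvl\t%x0, %x1, #1", while an OFFSET of (-2, -2) (minus one
   predicate width) prints as "addpl\t%x0, %x1, #-1".  */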
3040
3041 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3042 instruction. If it is, store the number of elements in each vector
3043 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3044 factor in *FACTOR_OUT (if nonnull). */
3045
3046 bool
3047 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3048 unsigned int *nelts_per_vq_out)
3049 {
3050 rtx elt;
3051 poly_int64 value;
3052
3053 if (!const_vec_duplicate_p (x, &elt)
3054 || !poly_int_rtx_p (elt, &value))
3055 return false;
3056
3057 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3058 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3059 /* There's no vector INCB. */
3060 return false;
3061
3062 HOST_WIDE_INT factor = value.coeffs[0];
3063 if (value.coeffs[1] != factor)
3064 return false;
3065
3066 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3067 if ((factor % nelts_per_vq) != 0
3068 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3069 return false;
3070
3071 if (factor_out)
3072 *factor_out = factor;
3073 if (nelts_per_vq_out)
3074 *nelts_per_vq_out = nelts_per_vq;
3075 return true;
3076 }
3077
3078 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3079 instruction. */
3080
3081 bool
3082 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3083 {
3084 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3085 }
3086
3087 /* Return the asm template for an SVE vector INC or DEC instruction.
3088 OPERANDS gives the operands before the vector count and X is the
3089 value of the vector count operand itself. */
3090
3091 char *
3092 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3093 {
3094 int factor;
3095 unsigned int nelts_per_vq;
3096 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3097 gcc_unreachable ();
3098 if (factor < 0)
3099 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3100 -factor, nelts_per_vq);
3101 else
3102 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3103 factor, nelts_per_vq);
3104 }
3105
3106 static int
3107 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3108 scalar_int_mode mode)
3109 {
3110 int i;
3111 unsigned HOST_WIDE_INT val, val2, mask;
3112 int one_match, zero_match;
3113 int num_insns;
3114
3115 val = INTVAL (imm);
3116
3117 if (aarch64_move_imm (val, mode))
3118 {
3119 if (generate)
3120 emit_insn (gen_rtx_SET (dest, imm));
3121 return 1;
3122 }
3123
3124 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3125 (with XXXX non-zero). In that case check to see if the move can be done in
3126 a smaller mode. */
3127 val2 = val & 0xffffffff;
3128 if (mode == DImode
3129 && aarch64_move_imm (val2, SImode)
3130 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3131 {
3132 if (generate)
3133 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3134
3135 /* Check if we have to emit a second instruction by checking to see
3136 if any of the upper 32 bits of the original DI mode value is set. */
3137 if (val == val2)
3138 return 1;
3139
3140 i = (val >> 48) ? 48 : 32;
3141
3142 if (generate)
3143 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3144 GEN_INT ((val >> i) & 0xffff)));
3145
3146 return 2;
3147 }
3148
3149 if ((val >> 32) == 0 || mode == SImode)
3150 {
3151 if (generate)
3152 {
3153 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3154 if (mode == SImode)
3155 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3156 GEN_INT ((val >> 16) & 0xffff)));
3157 else
3158 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3159 GEN_INT ((val >> 16) & 0xffff)));
3160 }
3161 return 2;
3162 }
3163
3164 /* Remaining cases are all for DImode. */
3165
3166 mask = 0xffff;
3167 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3168 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3169 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3170 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3171
3172 if (zero_match != 2 && one_match != 2)
3173 {
3174 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3175 For a 64-bit bitmask try whether changing 16 bits to all ones or
3176 zeroes creates a valid bitmask. To check any repeated bitmask,
3177 try using 16 bits from the other 32-bit half of val. */
3178
3179 for (i = 0; i < 64; i += 16, mask <<= 16)
3180 {
3181 val2 = val & ~mask;
3182 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3183 break;
3184 val2 = val | mask;
3185 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3186 break;
3187 val2 = val2 & ~mask;
3188 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3189 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3190 break;
3191 }
3192 if (i != 64)
3193 {
3194 if (generate)
3195 {
3196 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3197 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3198 GEN_INT ((val >> i) & 0xffff)));
3199 }
3200 return 2;
3201 }
3202 }
3203
3204 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3205 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3206 otherwise skip zero bits. */
3207
3208 num_insns = 1;
3209 mask = 0xffff;
3210 val2 = one_match > zero_match ? ~val : val;
3211 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3212
3213 if (generate)
3214 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3215 ? (val | ~(mask << i))
3216 : (val & (mask << i)))));
3217 for (i += 16; i < 64; i += 16)
3218 {
3219 if ((val2 & (mask << i)) == 0)
3220 continue;
3221 if (generate)
3222 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3223 GEN_INT ((val >> i) & 0xffff)));
3224 num_insns ++;
3225 }
3226
3227 return num_insns;
3228 }
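
/* Two worked examples of the above (illustrative constants):

     0xffff000000001234 has a 32-bit low half that is a valid SImode
     immediate and only one nonzero 16-bit chunk in its high half, so it
     takes the early two-instruction path:
	mov	dest, #0x1234
	movk	dest, #0xffff, lsl #48

     0x1234567890abcdef matches none of the special cases and falls
     through to the generic mov/movk sequence (4 instructions):
	mov	dest, #0xcdef
	movk	dest, #0x90ab, lsl #16
	movk	dest, #0x5678, lsl #32
	movk	dest, #0x1234, lsl #48  */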
3229
3230 /* Return whether imm is a 128-bit immediate which is simple enough to
3231 expand inline. */
3232 bool
3233 aarch64_mov128_immediate (rtx imm)
3234 {
3235 if (GET_CODE (imm) == CONST_INT)
3236 return true;
3237
3238 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3239
3240 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3241 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3242
3243 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3244 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3245 }
3246
3247
3248 /* Return the number of temporary registers that aarch64_add_offset_1
3249 would need to add OFFSET to a register. */
3250
3251 static unsigned int
3252 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3253 {
3254 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3255 }
3256
3257 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3258 a non-polynomial OFFSET. MODE is the mode of the addition.
3259 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3260 be set and CFA adjustments added to the generated instructions.
3261
3262 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3263 temporary if register allocation is already complete. This temporary
3264 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3265 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3266 the immediate again.
3267
3268 Since this function may be used to adjust the stack pointer, we must
3269 ensure that it cannot cause transient stack deallocation (for example
3270 by first incrementing SP and then decrementing when adjusting by a
3271 large immediate). */
3272
3273 static void
3274 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3275 rtx src, HOST_WIDE_INT offset, rtx temp1,
3276 bool frame_related_p, bool emit_move_imm)
3277 {
3278 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3279 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3280
3281 HOST_WIDE_INT moffset = abs_hwi (offset);
3282 rtx_insn *insn;
3283
3284 if (!moffset)
3285 {
3286 if (!rtx_equal_p (dest, src))
3287 {
3288 insn = emit_insn (gen_rtx_SET (dest, src));
3289 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3290 }
3291 return;
3292 }
3293
3294 /* Single instruction adjustment. */
3295 if (aarch64_uimm12_shift (moffset))
3296 {
3297 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3298 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3299 return;
3300 }
3301
3302 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3303 and either:
3304
3305 a) the offset cannot be loaded by a 16-bit move or
3306 b) there is no spare register into which we can move it. */
3307 if (moffset < 0x1000000
3308 && ((!temp1 && !can_create_pseudo_p ())
3309 || !aarch64_move_imm (moffset, mode)))
3310 {
3311 HOST_WIDE_INT low_off = moffset & 0xfff;
3312
3313 low_off = offset < 0 ? -low_off : low_off;
3314 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3315 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3316 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3317 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3318 return;
3319 }
3320
3321 /* Emit a move immediate if required and an addition/subtraction. */
3322 if (emit_move_imm)
3323 {
3324 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3325 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3326 }
3327 insn = emit_insn (offset < 0
3328 ? gen_sub3_insn (dest, src, temp1)
3329 : gen_add3_insn (dest, src, temp1));
3330 if (frame_related_p)
3331 {
3332 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3333 rtx adj = plus_constant (mode, src, offset);
3334 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3335 }
3336 }
3337
3338 /* Return the number of temporary registers that aarch64_add_offset
3339 would need to move OFFSET into a register or add OFFSET to a register;
3340 ADD_P is true if we want the latter rather than the former. */
3341
3342 static unsigned int
3343 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3344 {
3345 /* This follows the same structure as aarch64_add_offset. */
3346 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3347 return 0;
3348
3349 unsigned int count = 0;
3350 HOST_WIDE_INT factor = offset.coeffs[1];
3351 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3352 poly_int64 poly_offset (factor, factor);
3353 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3354 /* Need one register for the ADDVL/ADDPL result. */
3355 count += 1;
3356 else if (factor != 0)
3357 {
3358 factor = abs (factor);
3359 if (factor > 16 * (factor & -factor))
3360 /* Need one register for the CNT result and one for the multiplication
3361 factor. If necessary, the second temporary can be reused for the
3362 constant part of the offset. */
3363 return 2;
3364 /* Need one register for the CNT result (which might then
3365 be shifted). */
3366 count += 1;
3367 }
3368 return count + aarch64_add_offset_1_temporaries (constant);
3369 }
3370
3371 /* If X can be represented as a poly_int64, return the number
3372 of temporaries that are required to add it to a register.
3373 Return -1 otherwise. */
3374
3375 int
3376 aarch64_add_offset_temporaries (rtx x)
3377 {
3378 poly_int64 offset;
3379 if (!poly_int_rtx_p (x, &offset))
3380 return -1;
3381 return aarch64_offset_temporaries (true, offset);
3382 }
3383
3384 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3385 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3386 be set and CFA adjustments added to the generated instructions.
3387
3388 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3389 temporary if register allocation is already complete. This temporary
3390 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3391 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3392 false to avoid emitting the immediate again.
3393
3394 TEMP2, if nonnull, is a second temporary register that doesn't
3395 overlap either DEST or SRC.
3396
3397 Since this function may be used to adjust the stack pointer, we must
3398 ensure that it cannot cause transient stack deallocation (for example
3399 by first incrementing SP and then decrementing when adjusting by a
3400 large immediate). */
3401
3402 static void
3403 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3404 poly_int64 offset, rtx temp1, rtx temp2,
3405 bool frame_related_p, bool emit_move_imm = true)
3406 {
3407 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3408 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3409 gcc_assert (temp1 == NULL_RTX
3410 || !frame_related_p
3411 || !reg_overlap_mentioned_p (temp1, dest));
3412 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3413
3414 /* Try using ADDVL or ADDPL to add the whole value. */
3415 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3416 {
3417 rtx offset_rtx = gen_int_mode (offset, mode);
3418 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3419 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3420 return;
3421 }
3422
3423 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3424 SVE vector register, over and above the minimum size of 128 bits.
3425 This is equivalent to half the value returned by CNTD with a
3426 vector shape of ALL. */
3427 HOST_WIDE_INT factor = offset.coeffs[1];
3428 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3429
3430 /* Try using ADDVL or ADDPL to add the VG-based part. */
3431 poly_int64 poly_offset (factor, factor);
3432 if (src != const0_rtx
3433 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3434 {
3435 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3436 if (frame_related_p)
3437 {
3438 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3439 RTX_FRAME_RELATED_P (insn) = true;
3440 src = dest;
3441 }
3442 else
3443 {
3444 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3445 src = aarch64_force_temporary (mode, temp1, addr);
3446 temp1 = temp2;
3447 temp2 = NULL_RTX;
3448 }
3449 }
3450 /* Otherwise use a CNT-based sequence. */
3451 else if (factor != 0)
3452 {
3453 /* Use a subtraction if we have a negative factor. */
3454 rtx_code code = PLUS;
3455 if (factor < 0)
3456 {
3457 factor = -factor;
3458 code = MINUS;
3459 }
3460
3461 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3462 into the multiplication. */
3463 rtx val;
3464 int shift = 0;
3465 if (factor & 1)
3466 /* Use a right shift by 1. */
3467 shift = -1;
3468 else
3469 factor /= 2;
3470 HOST_WIDE_INT low_bit = factor & -factor;
3471 if (factor <= 16 * low_bit)
3472 {
3473 if (factor > 16 * 8)
3474 {
3475 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3476 the value with the minimum multiplier and shift it into
3477 position. */
3478 int extra_shift = exact_log2 (low_bit);
3479 shift += extra_shift;
3480 factor >>= extra_shift;
3481 }
3482 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3483 }
3484 else
3485 {
3486 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3487 directly, since that should increase the chances of being
3488 able to use a shift and add sequence. If LOW_BIT itself
3489 is out of range, just use CNTD. */
3490 if (low_bit <= 16 * 8)
3491 factor /= low_bit;
3492 else
3493 low_bit = 1;
3494
3495 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3496 val = aarch64_force_temporary (mode, temp1, val);
3497
3498 if (can_create_pseudo_p ())
3499 {
3500 rtx coeff1 = gen_int_mode (factor, mode);
3501 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3502 }
3503 else
3504 {
3505 /* Go back to using a negative multiplication factor if we have
3506 no register from which to subtract. */
3507 if (code == MINUS && src == const0_rtx)
3508 {
3509 factor = -factor;
3510 code = PLUS;
3511 }
3512 rtx coeff1 = gen_int_mode (factor, mode);
3513 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3514 val = gen_rtx_MULT (mode, val, coeff1);
3515 }
3516 }
3517
3518 if (shift > 0)
3519 {
3520 /* Multiply by 1 << SHIFT. */
3521 val = aarch64_force_temporary (mode, temp1, val);
3522 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3523 }
3524 else if (shift == -1)
3525 {
3526 /* Divide by 2. */
3527 val = aarch64_force_temporary (mode, temp1, val);
3528 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3529 }
3530
3531 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3532 if (src != const0_rtx)
3533 {
3534 val = aarch64_force_temporary (mode, temp1, val);
3535 val = gen_rtx_fmt_ee (code, mode, src, val);
3536 }
3537 else if (code == MINUS)
3538 {
3539 val = aarch64_force_temporary (mode, temp1, val);
3540 val = gen_rtx_NEG (mode, val);
3541 }
3542
3543 if (constant == 0 || frame_related_p)
3544 {
3545 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3546 if (frame_related_p)
3547 {
3548 RTX_FRAME_RELATED_P (insn) = true;
3549 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3550 gen_rtx_SET (dest, plus_constant (Pmode, src,
3551 poly_offset)));
3552 }
3553 src = dest;
3554 if (constant == 0)
3555 return;
3556 }
3557 else
3558 {
3559 src = aarch64_force_temporary (mode, temp1, val);
3560 temp1 = temp2;
3561 temp2 = NULL_RTX;
3562 }
3563
3564 emit_move_imm = true;
3565 }
3566
3567 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3568 frame_related_p, emit_move_imm);
3569 }
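
/* A small example of the SVE handling above: an OFFSET of (48, 16),
   i.e. one SVE vector plus 32 bytes, has factor 16 and constant 32, so
   a frame-related stack adjustment expands to roughly:
	addvl	sp, sp, #1
	add	sp, sp, #32
   with both instructions marked frame-related.  */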
3570
3571 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3572 than a poly_int64. */
3573
3574 void
3575 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3576 rtx offset_rtx, rtx temp1, rtx temp2)
3577 {
3578 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3579 temp1, temp2, false);
3580 }
3581
3582 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3583 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3584 if TEMP1 already contains abs (DELTA). */
3585
3586 static inline void
3587 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3588 {
3589 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3590 temp1, temp2, true, emit_move_imm);
3591 }
3592
3593 /* Subtract DELTA from the stack pointer, marking the instructions
3594 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3595 if nonnull. */
3596
3597 static inline void
3598 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3599 bool emit_move_imm = true)
3600 {
3601 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3602 temp1, temp2, frame_related_p, emit_move_imm);
3603 }
3604
3605 /* Set DEST to (vec_series BASE STEP). */
3606
3607 static void
3608 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3609 {
3610 machine_mode mode = GET_MODE (dest);
3611 scalar_mode inner = GET_MODE_INNER (mode);
3612
3613 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3614 if (!aarch64_sve_index_immediate_p (base))
3615 base = force_reg (inner, base);
3616 if (!aarch64_sve_index_immediate_p (step))
3617 step = force_reg (inner, step);
3618
3619 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3620 }
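/* As an illustration, a (vec_series 0 1) in VNx4SImode gives the vector
   { 0, 1, 2, 3, ... } and is typically emitted as a single SVE INDEX
   instruction (e.g. "index z0.s, #0, #1"); the exact operand forms depend
   on whether BASE and STEP fit the [-16, 15] immediate range noted above.  */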
3621
3622 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3623 register of mode MODE. Use TARGET for the result if it's nonnull
3624 and convenient.
3625
3626 The two vector modes must have the same element mode. The behavior
3627 is to duplicate architectural lane N of SRC into architectural lanes
3628 N + I * STEP of the result. On big-endian targets, architectural
3629 lane 0 of an Advanced SIMD vector is the last element of the vector
3630 in memory layout, so for big-endian targets this operation has the
3631 effect of reversing SRC before duplicating it. Callers need to
3632 account for this. */
3633
3634 rtx
3635 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3636 {
3637 machine_mode src_mode = GET_MODE (src);
3638 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3639 insn_code icode = (BYTES_BIG_ENDIAN
3640 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3641 : code_for_aarch64_vec_duplicate_vq_le (mode));
3642
3643 unsigned int i = 0;
3644 expand_operand ops[3];
3645 create_output_operand (&ops[i++], target, mode);
3646 create_output_operand (&ops[i++], src, src_mode);
3647 if (BYTES_BIG_ENDIAN)
3648 {
3649 /* Create a PARALLEL describing the reversal of SRC. */
3650 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3651 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3652 nelts_per_vq - 1, -1);
3653 create_fixed_operand (&ops[i++], sel);
3654 }
3655 expand_insn (icode, i, ops);
3656 return ops[0].value;
3657 }
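/* For example, duplicating the V4SI vector { a, b, c, d } into a VNx4SI
   register yields { a, b, c, d, a, b, c, d, ... }, with each 128-bit
   quadword of the SVE register holding a copy of SRC; on big-endian
   targets the extra PARALLEL above selects the lanes in reverse, as
   described in the function comment.  */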
3658
3659 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3660 the memory image into DEST. Return true on success. */
3661
3662 static bool
3663 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3664 {
3665 src = force_const_mem (GET_MODE (src), src);
3666 if (!src)
3667 return false;
3668
3669 /* Make sure that the address is legitimate. */
3670 if (!aarch64_sve_ld1rq_operand_p (src))
3671 {
3672 rtx addr = force_reg (Pmode, XEXP (src, 0));
3673 src = replace_equiv_address (src, addr);
3674 }
3675
3676 machine_mode mode = GET_MODE (dest);
3677 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3678 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3679 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3680 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3681 return true;
3682 }
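/* Illustrative effect: for a VNx4SI destination this typically emits
   something like "ld1rqw { z0.s }, p0/z, [x0]", which loads one 128-bit
   block from memory and replicates it across every quadword of the
   destination vector under the all-true predicate built above.  */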
3683
3684 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3685 SVE data mode and isn't a legitimate constant. Use TARGET for the
3686 result if convenient.
3687
3688 The returned register can have whatever mode seems most natural
3689 given the contents of SRC. */
3690
3691 static rtx
3692 aarch64_expand_sve_const_vector (rtx target, rtx src)
3693 {
3694 machine_mode mode = GET_MODE (src);
3695 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3696 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3697 scalar_mode elt_mode = GET_MODE_INNER (mode);
3698 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3699 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3700
3701 if (nelts_per_pattern == 1 && encoded_bits == 128)
3702 {
3703 /* The constant is a duplicated quadword but can't be narrowed
3704 beyond a quadword. Get the memory image of the first quadword
3705 as a 128-bit vector and try using LD1RQ to load it from memory.
3706
3707 The effect for both endiannesses is to load memory lane N into
3708 architectural lanes N + I * STEP of the result. On big-endian
3709 targets, the layout of the 128-bit vector in an Advanced SIMD
3710 register would be different from its layout in an SVE register,
3711 but this 128-bit vector is a memory value only. */
3712 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3713 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3714 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3715 return target;
3716 }
3717
3718 if (nelts_per_pattern == 1 && encoded_bits < 128)
3719 {
3720 /* The vector is a repeating sequence of 64 bits or fewer.
3721 See if we can load them using an Advanced SIMD move and then
3722 duplicate it to fill a vector. This is better than using a GPR
3723 move because it keeps everything in the same register file. */
3724 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3725 rtx_vector_builder builder (vq_mode, npatterns, 1);
3726 for (unsigned int i = 0; i < npatterns; ++i)
3727 {
3728 /* We want memory lane N to go into architectural lane N,
3729 so reverse for big-endian targets. The DUP .Q pattern
3730 has a compensating reverse built-in. */
3731 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3732 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3733 }
3734 rtx vq_src = builder.build ();
3735 if (aarch64_simd_valid_immediate (vq_src, NULL))
3736 {
3737 vq_src = force_reg (vq_mode, vq_src);
3738 return aarch64_expand_sve_dupq (target, mode, vq_src);
3739 }
3740
3741 /* Get an integer representation of the repeating part of Advanced
3742 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3743 which for big-endian targets is lane-swapped wrt a normal
3744 Advanced SIMD vector. This means that for both endiannesses,
3745 memory lane N of SVE vector SRC corresponds to architectural
3746 lane N of a register holding VQ_SRC. This in turn means that
3747 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3748 as a single 128-bit value) and thus that memory lane 0 of SRC is
3749 in the lsb of the integer. Duplicating the integer therefore
3750 ensures that memory lane N of SRC goes into architectural lane
3751 N + I * STEP of the SVE register. */
3752 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3753 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3754 if (elt_value)
3755 {
3756 /* Pretend that we had a vector of INT_MODE to start with. */
3757 elt_mode = int_mode;
3758 mode = aarch64_full_sve_mode (int_mode).require ();
3759
3760 /* If the integer can be moved into a general register by a
3761 single instruction, do that and duplicate the result. */
3762 if (CONST_INT_P (elt_value)
3763 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3764 {
3765 elt_value = force_reg (elt_mode, elt_value);
3766 return expand_vector_broadcast (mode, elt_value);
3767 }
3768 }
3769 else if (npatterns == 1)
3770 /* We're duplicating a single value, but can't do better than
3771 force it to memory and load from there. This handles things
3772 like symbolic constants. */
3773 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3774
3775 if (elt_value)
3776 {
3777 /* Load the element from memory if we can, otherwise move it into
3778 a register and use a DUP. */
3779 rtx op = force_const_mem (elt_mode, elt_value);
3780 if (!op)
3781 op = force_reg (elt_mode, elt_value);
3782 return expand_vector_broadcast (mode, op);
3783 }
3784 }
3785
3786 /* Try using INDEX. */
3787 rtx base, step;
3788 if (const_vec_series_p (src, &base, &step))
3789 {
3790 aarch64_expand_vec_series (target, base, step);
3791 return target;
3792 }
3793
3794 /* From here on, it's better to force the whole constant to memory
3795 if we can. */
3796 if (GET_MODE_NUNITS (mode).is_constant ())
3797 return NULL_RTX;
3798
3799 /* Expand each pattern individually. */
3800 gcc_assert (npatterns > 1);
3801 rtx_vector_builder builder;
3802 auto_vec<rtx, 16> vectors (npatterns);
3803 for (unsigned int i = 0; i < npatterns; ++i)
3804 {
3805 builder.new_vector (mode, 1, nelts_per_pattern);
3806 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3807 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3808 vectors.quick_push (force_reg (mode, builder.build ()));
3809 }
3810
3811 /* Use permutes to interleave the separate vectors. */
3812 while (npatterns > 1)
3813 {
3814 npatterns /= 2;
3815 for (unsigned int i = 0; i < npatterns; ++i)
3816 {
3817 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3818 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3819 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3820 vectors[i] = tmp;
3821 }
3822 }
3823 gcc_assert (vectors[0] == target);
3824 return target;
3825 }
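/* Worked example of the interleaving step above: with four patterns
   P0, P1, P2 and P3 (each broadcast into its own register), the first
   round produces ZIP1 (P0, P2) = { p0[0], p2[0], p0[1], p2[1], ... }
   and ZIP1 (P1, P3) = { p1[0], p3[0], p1[1], p3[1], ... }, and the
   final round ZIP1s those two results into
   { p0[0], p1[0], p2[0], p3[0], p0[1], ... }, which is the original
   element order of SRC.  */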
3826
3827 /* Use WHILE to set a predicate register of mode MODE in which the first
3828 VL bits are set and the rest are clear. Use TARGET for the register
3829 if it's nonnull and convenient. */
3830
3831 static rtx
3832 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3833 unsigned int vl)
3834 {
3835 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3836 target = aarch64_target_reg (target, mode);
3837 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3838 return target;
3839 }
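/* For example, VL == 3 with a .S predicate mode typically becomes
   "whilelo p0.s, xzr, x<n>" with x<n> holding 3, which sets the first
   three predicate elements and clears the rest.  Here x<n> is just an
   illustrative name for the register holding LIMIT above.  */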
3840
3841 static rtx
3842 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3843
3844 /* BUILDER is a constant predicate in which the index of every set bit
3845 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3846 by inverting every element at a multiple of ELT_SIZE and EORing the
3847 result with an ELT_SIZE PTRUE.
3848
3849 Return a register that contains the constant on success, otherwise
3850 return null. Use TARGET as the register if it is nonnull and
3851 convenient. */
3852
3853 static rtx
3854 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3855 unsigned int elt_size)
3856 {
3857 /* Invert every element at a multiple of ELT_SIZE, keeping the
3858 other bits zero. */
3859 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3860 builder.nelts_per_pattern ());
3861 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3862 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3863 inv_builder.quick_push (const1_rtx);
3864 else
3865 inv_builder.quick_push (const0_rtx);
3866 inv_builder.finalize ();
3867
3868 /* See if we can load the constant cheaply. */
3869 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3870 if (!inv)
3871 return NULL_RTX;
3872
3873 /* EOR the result with an ELT_SIZE PTRUE. */
3874 rtx mask = aarch64_ptrue_all (elt_size);
3875 mask = force_reg (VNx16BImode, mask);
3876 target = aarch64_target_reg (target, VNx16BImode);
3877 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3878 return target;
3879 }
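/* Worked example: for the byte predicate { 0, 1, 1, 1, ... } (every
   element set except the first), the inverted constant built above is
   { 1, 0, 0, 0, ... }, which can be loaded cheaply (e.g. as a VL1
   PTRUE); EORing that with an all-true .B predicate then recreates
   the original constant.  */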
3880
3881 /* BUILDER is a constant predicate in which the index of every set bit
3882 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3883 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3884 register on success, otherwise return null. Use TARGET as the register
3885 if nonnull and convenient. */
3886
3887 static rtx
3888 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3889 unsigned int elt_size,
3890 unsigned int permute_size)
3891 {
3892 /* We're going to split the constant into two new constants A and B,
3893 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3894 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3895
3896 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3897 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3898
3899 where _ indicates elements that will be discarded by the permute.
3900
3901 First calculate the ELT_SIZEs for A and B. */
3902 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3903 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3904 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3905 if (INTVAL (builder.elt (i)) != 0)
3906 {
3907 if (i & permute_size)
3908 b_elt_size |= i - permute_size;
3909 else
3910 a_elt_size |= i;
3911 }
3912 a_elt_size &= -a_elt_size;
3913 b_elt_size &= -b_elt_size;
3914
3915 /* Now construct the vectors themselves. */
3916 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3917 builder.nelts_per_pattern ());
3918 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3919 builder.nelts_per_pattern ());
3920 unsigned int nelts = builder.encoded_nelts ();
3921 for (unsigned int i = 0; i < nelts; ++i)
3922 if (i & (elt_size - 1))
3923 {
3924 a_builder.quick_push (const0_rtx);
3925 b_builder.quick_push (const0_rtx);
3926 }
3927 else if ((i & permute_size) == 0)
3928 {
3929 /* The A and B elements are significant. */
3930 a_builder.quick_push (builder.elt (i));
3931 b_builder.quick_push (builder.elt (i + permute_size));
3932 }
3933 else
3934 {
3935 /* The A and B elements are going to be discarded, so pick whatever
3936 is likely to give a nice constant. We are targeting element
3937 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3938 with the aim of each being a sequence of ones followed by
3939 a sequence of zeros. So:
3940
3941 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3942 duplicate the last X_ELT_SIZE element, to extend the
3943 current sequence of ones or zeros.
3944
3945 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3946 zero, so that the constant really does have X_ELT_SIZE and
3947 not a smaller size. */
3948 if (a_elt_size > permute_size)
3949 a_builder.quick_push (const0_rtx);
3950 else
3951 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3952 if (b_elt_size > permute_size)
3953 b_builder.quick_push (const0_rtx);
3954 else
3955 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3956 }
3957 a_builder.finalize ();
3958 b_builder.finalize ();
3959
3960 /* Try loading A into a register. */
3961 rtx_insn *last = get_last_insn ();
3962 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3963 if (!a)
3964 return NULL_RTX;
3965
3966 /* Try loading B into a register. */
3967 rtx b = a;
3968 if (a_builder != b_builder)
3969 {
3970 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3971 if (!b)
3972 {
3973 delete_insns_since (last);
3974 return NULL_RTX;
3975 }
3976 }
3977
3978 /* Emit the TRN1 itself. */
3979 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3980 target = aarch64_target_reg (target, mode);
3981 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3982 gen_lowpart (mode, a),
3983 gen_lowpart (mode, b)));
3984 return target;
3985 }
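/* Continuing the example in the function comment (PERMUTE_SIZE == 4,
   ELT_SIZE == 1): a TRN1 with .S granularity takes the even-numbered
   4-byte groups of A and B alternately, giving
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   i.e. the original constant that A and B were split from.  */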
3986
3987 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3988 constant in BUILDER into an SVE predicate register. Return the register
3989 on success, otherwise return null. Use TARGET for the register if
3990 nonnull and convenient.
3991
3992 ALLOW_RECURSE_P is true if we can use methods that would call this
3993 function recursively. */
3994
3995 static rtx
3996 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3997 bool allow_recurse_p)
3998 {
3999 if (builder.encoded_nelts () == 1)
4000 /* A PFALSE or a PTRUE .B ALL. */
4001 return aarch64_emit_set_immediate (target, builder);
4002
4003 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4004 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4005 {
4006 /* If we can load the constant using PTRUE, use it as-is. */
4007 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4008 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4009 return aarch64_emit_set_immediate (target, builder);
4010
4011 /* Otherwise use WHILE to set the first VL bits. */
4012 return aarch64_sve_move_pred_via_while (target, mode, vl);
4013 }
4014
4015 if (!allow_recurse_p)
4016 return NULL_RTX;
4017
4018 /* Try inverting the vector in element size ELT_SIZE and then EORing
4019 the result with an ELT_SIZE PTRUE. */
4020 if (INTVAL (builder.elt (0)) == 0)
4021 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4022 elt_size))
4023 return res;
4024
4025 /* Try using TRN1 to permute two simpler constants. */
4026 for (unsigned int i = elt_size; i <= 8; i *= 2)
4027 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4028 elt_size, i))
4029 return res;
4030
4031 return NULL_RTX;
4032 }
4033
4034 /* Return an SVE predicate register that contains the VNx16BImode
4035 constant in BUILDER, without going through the move expanders.
4036
4037 The returned register can have whatever mode seems most natural
4038 given the contents of BUILDER. Use TARGET for the result if
4039 convenient. */
4040
4041 static rtx
4042 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4043 {
4044 /* Try loading the constant using pure predicate operations. */
4045 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4046 return res;
4047
4048 /* Try forcing the constant to memory. */
4049 if (builder.full_nelts ().is_constant ())
4050 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4051 {
4052 target = aarch64_target_reg (target, VNx16BImode);
4053 emit_move_insn (target, mem);
4054 return target;
4055 }
4056
4057 /* The last resort is to load the constant as an integer and then
4058 compare it against zero. Use -1 for set bits in order to increase
4059 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4060 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4061 builder.nelts_per_pattern ());
4062 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4063 int_builder.quick_push (INTVAL (builder.elt (i))
4064 ? constm1_rtx : const0_rtx);
4065 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4066 int_builder.build ());
4067 }
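/* As an illustration of the last-resort path: a byte predicate such as
   { 1, 0, 1, 0, ... } is rebuilt as the VNx16QI data vector
   { -1, 0, -1, 0, ... } (hence the use of -1 for set bits above) and
   then converted back to a predicate by comparing that vector against
   zero.  */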
4068
4069 /* Set DEST to immediate IMM. */
4070
4071 void
4072 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4073 {
4074 machine_mode mode = GET_MODE (dest);
4075
4076 /* Check on what type of symbol it is. */
4077 scalar_int_mode int_mode;
4078 if ((GET_CODE (imm) == SYMBOL_REF
4079 || GET_CODE (imm) == LABEL_REF
4080 || GET_CODE (imm) == CONST
4081 || GET_CODE (imm) == CONST_POLY_INT)
4082 && is_a <scalar_int_mode> (mode, &int_mode))
4083 {
4084 rtx mem;
4085 poly_int64 offset;
4086 HOST_WIDE_INT const_offset;
4087 enum aarch64_symbol_type sty;
4088
4089 /* If we have (const (plus symbol offset)), separate out the offset
4090 before we start classifying the symbol. */
4091 rtx base = strip_offset (imm, &offset);
4092
4093 /* We must always add an offset involving VL separately, rather than
4094 folding it into the relocation. */
4095 if (!offset.is_constant (&const_offset))
4096 {
4097 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4098 emit_insn (gen_rtx_SET (dest, imm));
4099 else
4100 {
4101 /* Do arithmetic on 32-bit values if the result is smaller
4102 than that. */
4103 if (partial_subreg_p (int_mode, SImode))
4104 {
4105 /* It is invalid to do symbol calculations in modes
4106 narrower than SImode. */
4107 gcc_assert (base == const0_rtx);
4108 dest = gen_lowpart (SImode, dest);
4109 int_mode = SImode;
4110 }
4111 if (base != const0_rtx)
4112 {
4113 base = aarch64_force_temporary (int_mode, dest, base);
4114 aarch64_add_offset (int_mode, dest, base, offset,
4115 NULL_RTX, NULL_RTX, false);
4116 }
4117 else
4118 aarch64_add_offset (int_mode, dest, base, offset,
4119 dest, NULL_RTX, false);
4120 }
4121 return;
4122 }
4123
4124 sty = aarch64_classify_symbol (base, const_offset);
4125 switch (sty)
4126 {
4127 case SYMBOL_FORCE_TO_MEM:
4128 if (const_offset != 0
4129 && targetm.cannot_force_const_mem (int_mode, imm))
4130 {
4131 gcc_assert (can_create_pseudo_p ());
4132 base = aarch64_force_temporary (int_mode, dest, base);
4133 aarch64_add_offset (int_mode, dest, base, const_offset,
4134 NULL_RTX, NULL_RTX, false);
4135 return;
4136 }
4137
4138 mem = force_const_mem (ptr_mode, imm);
4139 gcc_assert (mem);
4140
4141 /* If we aren't generating PC relative literals, then
4142 we need to expand the literal pool access carefully.
4143 This is something that needs to be done in a number
4144 of places, so could well live as a separate function. */
4145 if (!aarch64_pcrelative_literal_loads)
4146 {
4147 gcc_assert (can_create_pseudo_p ());
4148 base = gen_reg_rtx (ptr_mode);
4149 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4150 if (ptr_mode != Pmode)
4151 base = convert_memory_address (Pmode, base);
4152 mem = gen_rtx_MEM (ptr_mode, base);
4153 }
4154
4155 if (int_mode != ptr_mode)
4156 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4157
4158 emit_insn (gen_rtx_SET (dest, mem));
4159
4160 return;
4161
4162 case SYMBOL_SMALL_TLSGD:
4163 case SYMBOL_SMALL_TLSDESC:
4164 case SYMBOL_SMALL_TLSIE:
4165 case SYMBOL_SMALL_GOT_28K:
4166 case SYMBOL_SMALL_GOT_4G:
4167 case SYMBOL_TINY_GOT:
4168 case SYMBOL_TINY_TLSIE:
4169 if (const_offset != 0)
4170 {
4171 gcc_assert (can_create_pseudo_p ());
4172 base = aarch64_force_temporary (int_mode, dest, base);
4173 aarch64_add_offset (int_mode, dest, base, const_offset,
4174 NULL_RTX, NULL_RTX, false);
4175 return;
4176 }
4177 /* FALLTHRU */
4178
4179 case SYMBOL_SMALL_ABSOLUTE:
4180 case SYMBOL_TINY_ABSOLUTE:
4181 case SYMBOL_TLSLE12:
4182 case SYMBOL_TLSLE24:
4183 case SYMBOL_TLSLE32:
4184 case SYMBOL_TLSLE48:
4185 aarch64_load_symref_appropriately (dest, imm, sty);
4186 return;
4187
4188 default:
4189 gcc_unreachable ();
4190 }
4191 }
4192
4193 if (!CONST_INT_P (imm))
4194 {
4195 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4196 {
4197 /* Only the low bit of each .H, .S and .D element is defined,
4198 so we can set the upper bits to whatever we like. If the
4199 predicate is all-true in MODE, prefer to set all the undefined
4200 bits as well, so that we can share a single .B predicate for
4201 all modes. */
4202 if (imm == CONSTM1_RTX (mode))
4203 imm = CONSTM1_RTX (VNx16BImode);
4204
4205 /* All methods for constructing predicate modes wider than VNx16BI
4206 will set the upper bits of each element to zero. Expose this
4207 by moving such constants as a VNx16BI, so that all bits are
4208 significant and so that constants for different modes can be
4209 shared. The wider constant will still be available as a
4210 REG_EQUAL note. */
4211 rtx_vector_builder builder;
4212 if (aarch64_get_sve_pred_bits (builder, imm))
4213 {
4214 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4215 if (dest != res)
4216 emit_move_insn (dest, gen_lowpart (mode, res));
4217 return;
4218 }
4219 }
4220
4221 if (GET_CODE (imm) == HIGH
4222 || aarch64_simd_valid_immediate (imm, NULL))
4223 {
4224 emit_insn (gen_rtx_SET (dest, imm));
4225 return;
4226 }
4227
4228 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4229 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4230 {
4231 if (dest != res)
4232 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4233 return;
4234 }
4235
4236 rtx mem = force_const_mem (mode, imm);
4237 gcc_assert (mem);
4238 emit_move_insn (dest, mem);
4239 return;
4240 }
4241
4242 aarch64_internal_mov_immediate (dest, imm, true,
4243 as_a <scalar_int_mode> (mode));
4244 }
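/* For example, moving the address of a symbol plus a VL-dependent
   offset such as "sym + 2 * VL" cannot use a single relocation, so the
   code above loads the symbol first and then adds the poly_int64 part
   via aarch64_add_offset; purely constant offsets instead go through
   aarch64_classify_symbol and the relocation-specific cases.  "sym"
   here is just a placeholder name for illustration.  */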
4245
4246 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4247 that is known to contain PTRUE. */
4248
4249 void
4250 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4251 {
4252 expand_operand ops[3];
4253 machine_mode mode = GET_MODE (dest);
4254 create_output_operand (&ops[0], dest, mode);
4255 create_input_operand (&ops[1], pred, GET_MODE (pred));
4256 create_input_operand (&ops[2], src, mode);
4257 temporary_volatile_ok v (true);
4258 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4259 }
4260
4261 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4262 operand is in memory. In this case we need to use the predicated LD1
4263 and ST1 instead of LDR and STR, both for correctness on big-endian
4264 targets and because LD1 and ST1 support a wider range of addressing modes.
4265 PRED_MODE is the mode of the predicate.
4266
4267 See the comment at the head of aarch64-sve.md for details about the
4268 big-endian handling. */
4269
4270 void
4271 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4272 {
4273 machine_mode mode = GET_MODE (dest);
4274 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4275 if (!register_operand (src, mode)
4276 && !register_operand (dest, mode))
4277 {
4278 rtx tmp = gen_reg_rtx (mode);
4279 if (MEM_P (src))
4280 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4281 else
4282 emit_move_insn (tmp, src);
4283 src = tmp;
4284 }
4285 aarch64_emit_sve_pred_move (dest, ptrue, src);
4286 }
4287
4288 /* Called only on big-endian targets. See whether an SVE vector move
4289 from SRC to DEST is effectively a REV[BHW] instruction, because at
4290 least one operand is a subreg of an SVE vector that has wider or
4291 narrower elements. Return true and emit the instruction if so.
4292
4293 For example:
4294
4295 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4296
4297 represents a VIEW_CONVERT between the following vectors, viewed
4298 in memory order:
4299
4300 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4301 R1: { [0], [1], [2], [3], ... }
4302
4303 The high part of lane X in R2 should therefore correspond to lane X*2
4304 of R1, but the register representations are:
4305
4306 msb lsb
4307 R2: ...... [1].high [1].low [0].high [0].low
4308 R1: ...... [3] [2] [1] [0]
4309
4310 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4311 We therefore need a reverse operation to swap the high and low values
4312 around.
4313
4314 This is purely an optimization. Without it we would spill the
4315 subreg operand to the stack in one mode and reload it in the
4316 other mode, which has the same effect as the REV. */
4317
4318 bool
4319 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4320 {
4321 gcc_assert (BYTES_BIG_ENDIAN);
4322 if (GET_CODE (dest) == SUBREG)
4323 dest = SUBREG_REG (dest);
4324 if (GET_CODE (src) == SUBREG)
4325 src = SUBREG_REG (src);
4326
4327 /* The optimization handles two single SVE REGs with different element
4328 sizes. */
4329 if (!REG_P (dest)
4330 || !REG_P (src)
4331 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4332 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4333 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4334 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4335 return false;
4336
4337 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4338 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4339 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4340 UNSPEC_REV_SUBREG);
4341 emit_insn (gen_rtx_SET (dest, unspec));
4342 return true;
4343 }
4344
4345 /* Return a copy of X with mode MODE, without changing its other
4346 attributes. Unlike gen_lowpart, this doesn't care whether the
4347 mode change is valid. */
4348
4349 static rtx
4350 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4351 {
4352 if (GET_MODE (x) == mode)
4353 return x;
4354
4355 x = shallow_copy_rtx (x);
4356 set_mode_and_regno (x, mode, REGNO (x));
4357 return x;
4358 }
4359
4360 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4361 stored in wider integer containers. */
4362
4363 static unsigned int
4364 aarch64_sve_rev_unspec (machine_mode mode)
4365 {
4366 switch (GET_MODE_UNIT_SIZE (mode))
4367 {
4368 case 1: return UNSPEC_REVB;
4369 case 2: return UNSPEC_REVH;
4370 case 4: return UNSPEC_REVW;
4371 }
4372 gcc_unreachable ();
4373 }
4374
4375 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4376 operands. */
4377
4378 void
4379 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4380 {
4381 /* Decide which REV operation we need. The mode with wider elements
4382 determines the mode of the operands and the mode with the narrower
4383 elements determines the reverse width. */
4384 machine_mode mode_with_wider_elts = GET_MODE (dest);
4385 machine_mode mode_with_narrower_elts = GET_MODE (src);
4386 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4387 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4388 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4389
4390 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4391 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4392 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4393
4394 /* Get the operands in the appropriate modes and emit the instruction. */
4395 ptrue = gen_lowpart (pred_mode, ptrue);
4396 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4397 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4398 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4399 dest, ptrue, src));
4400 }
4401
4402 static bool
4403 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4404 tree exp ATTRIBUTE_UNUSED)
4405 {
4406 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4407 return false;
4408
4409 return true;
4410 }
4411
4412 /* Implement TARGET_PASS_BY_REFERENCE. */
4413
4414 static bool
4415 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4416 {
4417 HOST_WIDE_INT size;
4418 machine_mode dummymode;
4419 int nregs;
4420
4421 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4422 if (arg.mode == BLKmode && arg.type)
4423 size = int_size_in_bytes (arg.type);
4424 else
4425 /* No frontends can create types with variable-sized modes, so we
4426 shouldn't be asked to pass or return them. */
4427 size = GET_MODE_SIZE (arg.mode).to_constant ();
4428
4429 /* Aggregates are passed by reference based on their size. */
4430 if (arg.aggregate_type_p ())
4431 size = int_size_in_bytes (arg.type);
4432
4433 /* Variable-sized arguments are always passed by reference. */
4434 if (size < 0)
4435 return true;
4436
4437 /* Can this be a candidate to be passed in fp/simd register(s)? */
4438 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4439 &dummymode, &nregs,
4440 NULL))
4441 return false;
4442
4443 /* Arguments which are variable sized or larger than 2 registers are
4444 passed by reference unless they are a homogeneous floating-point
4445 aggregate. */
4446 return size > 2 * UNITS_PER_WORD;
4447 }
4448
4449 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4450 static bool
4451 aarch64_return_in_msb (const_tree valtype)
4452 {
4453 machine_mode dummy_mode;
4454 int dummy_int;
4455
4456 /* Never happens in little-endian mode. */
4457 if (!BYTES_BIG_ENDIAN)
4458 return false;
4459
4460 /* Only composite types smaller than or equal to 16 bytes can
4461 be potentially returned in registers. */
4462 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4463 || int_size_in_bytes (valtype) <= 0
4464 || int_size_in_bytes (valtype) > 16)
4465 return false;
4466
4467 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4468 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4469 is always passed/returned in the least significant bits of fp/simd
4470 register(s). */
4471 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4472 &dummy_mode, &dummy_int, NULL))
4473 return false;
4474
4475 return true;
4476 }
4477
4478 /* Implement TARGET_FUNCTION_VALUE.
4479 Define how to find the value returned by a function. */
4480
4481 static rtx
4482 aarch64_function_value (const_tree type, const_tree func,
4483 bool outgoing ATTRIBUTE_UNUSED)
4484 {
4485 machine_mode mode;
4486 int unsignedp;
4487 int count;
4488 machine_mode ag_mode;
4489
4490 mode = TYPE_MODE (type);
4491 if (INTEGRAL_TYPE_P (type))
4492 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4493
4494 if (aarch64_return_in_msb (type))
4495 {
4496 HOST_WIDE_INT size = int_size_in_bytes (type);
4497
4498 if (size % UNITS_PER_WORD != 0)
4499 {
4500 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4501 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4502 }
4503 }
4504
4505 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4506 &ag_mode, &count, NULL))
4507 {
4508 if (!aarch64_composite_type_p (type, mode))
4509 {
4510 gcc_assert (count == 1 && mode == ag_mode);
4511 return gen_rtx_REG (mode, V0_REGNUM);
4512 }
4513 else
4514 {
4515 int i;
4516 rtx par;
4517
4518 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4519 for (i = 0; i < count; i++)
4520 {
4521 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4522 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4523 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4524 XVECEXP (par, 0, i) = tmp;
4525 }
4526 return par;
4527 }
4528 }
4529 else
4530 return gen_rtx_REG (mode, R0_REGNUM);
4531 }
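/* For example, a homogeneous floating-point aggregate of four floats is
   returned in s0-s3: COUNT is 4, AG_MODE is SFmode, and the PARALLEL
   built above lists V0_REGNUM..V0_REGNUM + 3 with byte offsets 0, 4, 8
   and 12 within the value.  */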
4532
4533 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4534 Return true if REGNO is the number of a hard register in which the values
4535 of called function may come back. */
4536
4537 static bool
4538 aarch64_function_value_regno_p (const unsigned int regno)
4539 {
4540 /* Maximum of 16 bytes can be returned in the general registers. Examples
4541 of 16-byte return values are: 128-bit integers and 16-byte small
4542 structures (excluding homogeneous floating-point aggregates). */
4543 if (regno == R0_REGNUM || regno == R1_REGNUM)
4544 return true;
4545
4546 /* Up to four fp/simd registers can return a function value, e.g. a
4547 homogeneous floating-point aggregate having four members. */
4548 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4549 return TARGET_FLOAT;
4550
4551 return false;
4552 }
4553
4554 /* Implement TARGET_RETURN_IN_MEMORY.
4555
4556 If the type T of the result of a function is such that
4557 void func (T arg)
4558 would require that arg be passed as a value in a register (or set of
4559 registers) according to the parameter passing rules, then the result
4560 is returned in the same registers as would be used for such an
4561 argument. */
4562
4563 static bool
4564 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4565 {
4566 HOST_WIDE_INT size;
4567 machine_mode ag_mode;
4568 int count;
4569
4570 if (!AGGREGATE_TYPE_P (type)
4571 && TREE_CODE (type) != COMPLEX_TYPE
4572 && TREE_CODE (type) != VECTOR_TYPE)
4573 /* Simple scalar types are always returned in registers. */
4574 return false;
4575
4576 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4577 type,
4578 &ag_mode,
4579 &count,
4580 NULL))
4581 return false;
4582
4583 /* Types larger than 2 registers are returned in memory. */
4584 size = int_size_in_bytes (type);
4585 return (size < 0 || size > 2 * UNITS_PER_WORD);
4586 }
4587
4588 static bool
4589 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4590 const_tree type, int *nregs)
4591 {
4592 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4593 return aarch64_vfp_is_call_or_return_candidate (mode,
4594 type,
4595 &pcum->aapcs_vfp_rmode,
4596 nregs,
4597 NULL);
4598 }
4599
4600 /* Given MODE and TYPE of a function argument, return the alignment in
4601 bits. The idea is to suppress any stronger alignment requested by
4602 the user and opt for the natural alignment (specified in AAPCS64 \S
4603 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4604 calculated in versions of GCC prior to GCC-9. This is a helper
4605 function for local use only. */
4606
4607 static unsigned int
4608 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4609 bool *abi_break)
4610 {
4611 *abi_break = false;
4612 if (!type)
4613 return GET_MODE_ALIGNMENT (mode);
4614
4615 if (integer_zerop (TYPE_SIZE (type)))
4616 return 0;
4617
4618 gcc_assert (TYPE_MODE (type) == mode);
4619
4620 if (!AGGREGATE_TYPE_P (type))
4621 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4622
4623 if (TREE_CODE (type) == ARRAY_TYPE)
4624 return TYPE_ALIGN (TREE_TYPE (type));
4625
4626 unsigned int alignment = 0;
4627 unsigned int bitfield_alignment = 0;
4628 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4629 if (TREE_CODE (field) == FIELD_DECL)
4630 {
4631 alignment = std::max (alignment, DECL_ALIGN (field));
4632 if (DECL_BIT_FIELD_TYPE (field))
4633 bitfield_alignment
4634 = std::max (bitfield_alignment,
4635 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4636 }
4637
4638 if (bitfield_alignment > alignment)
4639 {
4640 *abi_break = true;
4641 return bitfield_alignment;
4642 }
4643
4644 return alignment;
4645 }
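/* Illustrative ABI_BREAK case: for a structure in which the highest
   alignment comes from a bit-field's declared type rather than from any
   field's own alignment, the loop above returns the bit-field type's
   alignment, whereas releases before GCC 9.1 used only the field
   alignments; ABI_BREAK flags exactly this situation so that callers
   can emit the -Wpsabi note seen elsewhere in this file.  */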
4646
4647 /* Layout a function argument according to the AAPCS64 rules. The rule
4648 numbers refer to the rule numbers in the AAPCS64. */
4649
4650 static void
4651 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4652 const_tree type,
4653 bool named ATTRIBUTE_UNUSED)
4654 {
4655 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4656 int ncrn, nvrn, nregs;
4657 bool allocate_ncrn, allocate_nvrn;
4658 HOST_WIDE_INT size;
4659 bool abi_break;
4660
4661 /* We need to do this once per argument. */
4662 if (pcum->aapcs_arg_processed)
4663 return;
4664
4665 pcum->aapcs_arg_processed = true;
4666
4667 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
4668 if (type)
4669 size = int_size_in_bytes (type);
4670 else
4671 /* No frontends can create types with variable-sized modes, so we
4672 shouldn't be asked to pass or return them. */
4673 size = GET_MODE_SIZE (mode).to_constant ();
4674 size = ROUND_UP (size, UNITS_PER_WORD);
4675
4676 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4677 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4678 mode,
4679 type,
4680 &nregs);
4681
4682 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4683 The following code thus handles passing by SIMD/FP registers first. */
4684
4685 nvrn = pcum->aapcs_nvrn;
4686
4687 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4688 and homogeneous short-vector aggregates (HVA). */
4689 if (allocate_nvrn)
4690 {
4691 if (!TARGET_FLOAT)
4692 aarch64_err_no_fpadvsimd (mode);
4693
4694 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4695 {
4696 pcum->aapcs_nextnvrn = nvrn + nregs;
4697 if (!aarch64_composite_type_p (type, mode))
4698 {
4699 gcc_assert (nregs == 1);
4700 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4701 }
4702 else
4703 {
4704 rtx par;
4705 int i;
4706 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4707 for (i = 0; i < nregs; i++)
4708 {
4709 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4710 V0_REGNUM + nvrn + i);
4711 rtx offset = gen_int_mode
4712 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4713 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4714 XVECEXP (par, 0, i) = tmp;
4715 }
4716 pcum->aapcs_reg = par;
4717 }
4718 return;
4719 }
4720 else
4721 {
4722 /* C.3 NSRN is set to 8. */
4723 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4724 goto on_stack;
4725 }
4726 }
4727
4728 ncrn = pcum->aapcs_ncrn;
4729 nregs = size / UNITS_PER_WORD;
4730
4731 /* C6 - C9, though the sign and zero extension semantics are
4732 handled elsewhere. This is the case where the argument fits
4733 entirely in general registers. */
4734 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4735 {
4736 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4737
4738 /* C.8 if the argument has an alignment of 16 then the NGRN is
4739 rounded up to the next even number. */
4740 if (nregs == 2
4741 && ncrn % 2
4742 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4743 comparison is there because for > 16 * BITS_PER_UNIT
4744 alignment nregs should be > 2 and therefore the argument should be
4745 passed by reference rather than by value. */
4746 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4747 == 16 * BITS_PER_UNIT))
4748 {
4749 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4750 inform (input_location, "parameter passing for argument of type "
4751 "%qT changed in GCC 9.1", type);
4752 ++ncrn;
4753 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4754 }
4755
4756 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4757 A reg is still generated for it, but the caller should be smart
4758 enough not to use it. */
4759 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4760 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4761 else
4762 {
4763 rtx par;
4764 int i;
4765
4766 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4767 for (i = 0; i < nregs; i++)
4768 {
4769 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4770 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4771 GEN_INT (i * UNITS_PER_WORD));
4772 XVECEXP (par, 0, i) = tmp;
4773 }
4774 pcum->aapcs_reg = par;
4775 }
4776
4777 pcum->aapcs_nextncrn = ncrn + nregs;
4778 return;
4779 }
4780
4781 /* C.11 */
4782 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4783
4784 /* The argument is passed on stack; record the needed number of words for
4785 this argument and align the total size if necessary. */
4786 on_stack:
4787 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4788
4789 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4790 == 16 * BITS_PER_UNIT)
4791 {
4792 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4793 if (pcum->aapcs_stack_size != new_size)
4794 {
4795 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4796 inform (input_location, "parameter passing for argument of type "
4797 "%qT changed in GCC 9.1", type);
4798 pcum->aapcs_stack_size = new_size;
4799 }
4800 }
4801 return;
4802 }
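/* Worked example of rule C.8 above: an argument with 16-byte alignment
   that needs two registers (for instance a __int128) arriving when
   NCRN == 1 skips x1, is passed in x2 and x3, and leaves the next NCRN
   at 4; without the rounding it would straddle an odd/even register
   pair, which the AAPCS64 does not allow for such arguments.  */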
4803
4804 /* Implement TARGET_FUNCTION_ARG. */
4805
4806 static rtx
4807 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4808 {
4809 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4810 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4811
4812 if (arg.end_marker_p ())
4813 return NULL_RTX;
4814
4815 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4816 return pcum->aapcs_reg;
4817 }
4818
4819 void
4820 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4821 const_tree fntype ATTRIBUTE_UNUSED,
4822 rtx libname ATTRIBUTE_UNUSED,
4823 const_tree fndecl ATTRIBUTE_UNUSED,
4824 unsigned n_named ATTRIBUTE_UNUSED)
4825 {
4826 pcum->aapcs_ncrn = 0;
4827 pcum->aapcs_nvrn = 0;
4828 pcum->aapcs_nextncrn = 0;
4829 pcum->aapcs_nextnvrn = 0;
4830 pcum->pcs_variant = ARM_PCS_AAPCS64;
4831 pcum->aapcs_reg = NULL_RTX;
4832 pcum->aapcs_arg_processed = false;
4833 pcum->aapcs_stack_words = 0;
4834 pcum->aapcs_stack_size = 0;
4835
4836 if (!TARGET_FLOAT
4837 && fndecl && TREE_PUBLIC (fndecl)
4838 && fntype && fntype != error_mark_node)
4839 {
4840 const_tree type = TREE_TYPE (fntype);
4841 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4842 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4843 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4844 &mode, &nregs, NULL))
4845 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4846 }
4847 return;
4848 }
4849
4850 static void
4851 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4852 const function_arg_info &arg)
4853 {
4854 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4855 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4856 {
4857 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4858 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4859 != (pcum->aapcs_stack_words != 0));
4860 pcum->aapcs_arg_processed = false;
4861 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4862 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4863 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4864 pcum->aapcs_stack_words = 0;
4865 pcum->aapcs_reg = NULL_RTX;
4866 }
4867 }
4868
4869 bool
4870 aarch64_function_arg_regno_p (unsigned regno)
4871 {
4872 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4873 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4874 }
4875
4876 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4877 PARM_BOUNDARY bits of alignment, but will be given anything up
4878 to STACK_BOUNDARY bits if the type requires it. This makes sure
4879 that both before and after the layout of each argument, the Next
4880 Stacked Argument Address (NSAA) will have a minimum alignment of
4881 8 bytes. */
4882
4883 static unsigned int
4884 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4885 {
4886 bool abi_break;
4887 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4888 &abi_break);
4889 if (abi_break && warn_psabi)
4890 inform (input_location, "parameter passing for argument of type "
4891 "%qT changed in GCC 9.1", type);
4892
4893 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4894 }
4895
4896 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4897
4898 static fixed_size_mode
4899 aarch64_get_reg_raw_mode (int regno)
4900 {
4901 if (TARGET_SVE && FP_REGNUM_P (regno))
4902 /* Don't use the SVE part of the register for __builtin_apply and
4903 __builtin_return. The SVE registers aren't used by the normal PCS,
4904 so using them there would be a waste of time. The PCS extensions
4905 for SVE types are fundamentally incompatible with the
4906 __builtin_return/__builtin_apply interface. */
4907 return as_a <fixed_size_mode> (V16QImode);
4908 return default_get_reg_raw_mode (regno);
4909 }
4910
4911 /* Implement TARGET_FUNCTION_ARG_PADDING.
4912
4913 Small aggregate types are placed in the lowest memory address.
4914
4915 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4916
4917 static pad_direction
4918 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4919 {
4920 /* On little-endian targets, the least significant byte of every stack
4921 argument is passed at the lowest byte address of the stack slot. */
4922 if (!BYTES_BIG_ENDIAN)
4923 return PAD_UPWARD;
4924
4925 /* Otherwise, integral, floating-point and pointer types are padded downward:
4926 the least significant byte of a stack argument is passed at the highest
4927 byte address of the stack slot. */
4928 if (type
4929 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4930 || POINTER_TYPE_P (type))
4931 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4932 return PAD_DOWNWARD;
4933
4934 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4935 return PAD_UPWARD;
4936 }
4937
4938 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4939
4940 It specifies padding for the last (may also be the only)
4941 element of a block move between registers and memory. Assuming
4942 the block is in memory, padding upward means that the last
4943 element is padded after its most significant byte, while with
4944 downward padding the last element is padded at its least
4945 significant byte side.
4946
4947 Small aggregates and small complex types are always padded
4948 upwards.
4949
4950 We don't need to worry about homogeneous floating-point or
4951 short-vector aggregates; their move is not affected by the
4952 padding direction determined here. Regardless of endianness,
4953 each element of such an aggregate is put in the least
4954 significant bits of a fp/simd register.
4955
4956 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4957 register has useful data, and return the opposite if the most
4958 significant byte does. */
4959
4960 bool
4961 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4962 bool first ATTRIBUTE_UNUSED)
4963 {
4964
4965 /* Small composite types are always padded upward. */
4966 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4967 {
4968 HOST_WIDE_INT size;
4969 if (type)
4970 size = int_size_in_bytes (type);
4971 else
4972 /* No frontends can create types with variable-sized modes, so we
4973 shouldn't be asked to pass or return them. */
4974 size = GET_MODE_SIZE (mode).to_constant ();
4975 if (size < 2 * UNITS_PER_WORD)
4976 return true;
4977 }
4978
4979 /* Otherwise, use the default padding. */
4980 return !BYTES_BIG_ENDIAN;
4981 }
4982
4983 static scalar_int_mode
4984 aarch64_libgcc_cmp_return_mode (void)
4985 {
4986 return SImode;
4987 }
4988
4989 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4990
4991 /* We use the 12-bit shifted immediate arithmetic instructions so values
4992 must be a multiple of (1 << 12), i.e. 4096. */
4993 #define ARITH_FACTOR 4096
4994
4995 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4996 #error Cannot use simple address calculation for stack probing
4997 #endif
4998
4999 /* The pair of scratch registers used for stack probing. */
5000 #define PROBE_STACK_FIRST_REG R9_REGNUM
5001 #define PROBE_STACK_SECOND_REG R10_REGNUM
5002
5003 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5004 inclusive. These are offsets from the current stack pointer. */
5005
5006 static void
5007 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5008 {
5009 HOST_WIDE_INT size;
5010 if (!poly_size.is_constant (&size))
5011 {
5012 sorry ("stack probes for SVE frames");
5013 return;
5014 }
5015
5016 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5017
5018 /* See the same assertion on PROBE_INTERVAL above. */
5019 gcc_assert ((first % ARITH_FACTOR) == 0);
5020
5021 /* See if we have a constant small number of probes to generate. If so,
5022 that's the easy case. */
5023 if (size <= PROBE_INTERVAL)
5024 {
5025 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5026
5027 emit_set_insn (reg1,
5028 plus_constant (Pmode,
5029 stack_pointer_rtx, -(first + base)));
5030 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5031 }
5032
5033 /* The run-time loop is made up of 8 insns in the generic case while the
5034 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5035 else if (size <= 4 * PROBE_INTERVAL)
5036 {
5037 HOST_WIDE_INT i, rem;
5038
5039 emit_set_insn (reg1,
5040 plus_constant (Pmode,
5041 stack_pointer_rtx,
5042 -(first + PROBE_INTERVAL)));
5043 emit_stack_probe (reg1);
5044
5045 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5046 it exceeds SIZE. If only two probes are needed, this will not
5047 generate any code. Then probe at FIRST + SIZE. */
5048 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5049 {
5050 emit_set_insn (reg1,
5051 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5052 emit_stack_probe (reg1);
5053 }
5054
5055 rem = size - (i - PROBE_INTERVAL);
5056 if (rem > 256)
5057 {
5058 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5059
5060 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5061 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5062 }
5063 else
5064 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5065 }
5066
5067 /* Otherwise, do the same as above, but in a loop. Note that we must be
5068 extra careful with variables wrapping around because we might be at
5069 the very top (or the very bottom) of the address space and we have
5070 to be able to handle this case properly; in particular, we use an
5071 equality test for the loop condition. */
5072 else
5073 {
5074 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5075
5076 /* Step 1: round SIZE to the previous multiple of the interval. */
5077
5078 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5079
5080
5081 /* Step 2: compute initial and final value of the loop counter. */
5082
5083 /* TEST_ADDR = SP + FIRST. */
5084 emit_set_insn (reg1,
5085 plus_constant (Pmode, stack_pointer_rtx, -first));
5086
5087 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5088 HOST_WIDE_INT adjustment = - (first + rounded_size);
5089 if (! aarch64_uimm12_shift (adjustment))
5090 {
5091 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5092 true, Pmode);
5093 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5094 }
5095 else
5096 emit_set_insn (reg2,
5097 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5098
5099 /* Step 3: the loop
5100
5101 do
5102 {
5103 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5104 probe at TEST_ADDR
5105 }
5106 while (TEST_ADDR != LAST_ADDR)
5107
5108 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5109 until it is equal to ROUNDED_SIZE. */
5110
5111 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5112
5113
5114 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5115 that SIZE is equal to ROUNDED_SIZE. */
5116
5117 if (size != rounded_size)
5118 {
5119 HOST_WIDE_INT rem = size - rounded_size;
5120
5121 if (rem > 256)
5122 {
5123 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5124
5125 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5126 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5127 }
5128 else
5129 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5130 }
5131 }
5132
5133 /* Make sure nothing is scheduled before we are done. */
5134 emit_insn (gen_blockage ());
5135 }
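/* Worked example, taking PROBE_INTERVAL as 4096 for illustration: for
   FIRST == 0 and SIZE == 3 * 4096 the middle case above probes at
   offsets 4096 and 8192 below the incoming stack pointer and then, via
   the remainder handling, at 12288 (== SIZE), so no 4096-byte page in
   the range is skipped.  */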
5136
5137 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5138 absolute addresses. */
5139
5140 const char *
5141 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5142 {
5143 static int labelno = 0;
5144 char loop_lab[32];
5145 rtx xops[2];
5146
5147 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5148
5149 /* Loop. */
5150 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5151
5152 HOST_WIDE_INT stack_clash_probe_interval
5153 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5154
5155 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5156 xops[0] = reg1;
5157 HOST_WIDE_INT interval;
5158 if (flag_stack_clash_protection)
5159 interval = stack_clash_probe_interval;
5160 else
5161 interval = PROBE_INTERVAL;
5162
5163 gcc_assert (aarch64_uimm12_shift (interval));
5164 xops[1] = GEN_INT (interval);
5165
5166 output_asm_insn ("sub\t%0, %0, %1", xops);
5167
5168 /* If doing stack clash protection then we probe up by the ABI specified
5169 amount. We do this because we're dropping full pages at a time in the
5170 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5171 if (flag_stack_clash_protection)
5172 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5173 else
5174 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5175
5176 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5177 by this amount for each iteration. */
5178 output_asm_insn ("str\txzr, [%0, %1]", xops);
5179
5180 /* Test if TEST_ADDR == LAST_ADDR. */
5181 xops[1] = reg2;
5182 output_asm_insn ("cmp\t%0, %1", xops);
5183
5184 /* Branch. */
5185 fputs ("\tb.ne\t", asm_out_file);
5186 assemble_name_raw (asm_out_file, loop_lab);
5187 fputc ('\n', asm_out_file);
5188
5189 return "";
5190 }
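/* The emitted loop therefore has the shape (illustrative register
   names and label):

	.LPSRL0: sub  x9, x9, #interval
		 str  xzr, [x9, #offset]
		 cmp  x9, x10
		 b.ne .LPSRL0

   where the store offset is 0, or STACK_CLASH_CALLER_GUARD when stack
   clash protection is enabled, as set up above.  */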
5191
5192 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5193 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5194 of GUARD_SIZE. When a probe is emitted it is done at most
5195 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5196 at most MIN_PROBE_THRESHOLD. By the end of this function
5197 BASE = BASE - ADJUSTMENT. */
5198
5199 const char *
5200 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5201 rtx min_probe_threshold, rtx guard_size)
5202 {
5203 /* This function is not allowed to use any instruction generation function
5204 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5205 so instead emit the code you want using output_asm_insn. */
5206 gcc_assert (flag_stack_clash_protection);
5207 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5208 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5209
5210 /* The minimum required allocation before the residual requires probing. */
5211 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5212
5213 /* Clamp the value down to the nearest value that can be used with a cmp. */
5214 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5215 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5216
5217 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5218 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5219
5220 static int labelno = 0;
5221 char loop_start_lab[32];
5222 char loop_end_lab[32];
5223 rtx xops[2];
5224
5225 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5226 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5227
5228 /* Emit loop start label. */
5229 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5230
5231 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5232 xops[0] = adjustment;
5233 xops[1] = probe_offset_value_rtx;
5234 output_asm_insn ("cmp\t%0, %1", xops);
5235
5236 /* Branch to end if not enough adjustment to probe. */
5237 fputs ("\tb.lt\t", asm_out_file);
5238 assemble_name_raw (asm_out_file, loop_end_lab);
5239 fputc ('\n', asm_out_file);
5240
5241 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5242 xops[0] = base;
5243 xops[1] = probe_offset_value_rtx;
5244 output_asm_insn ("sub\t%0, %0, %1", xops);
5245
5246 /* Probe at BASE. */
5247 xops[1] = const0_rtx;
5248 output_asm_insn ("str\txzr, [%0, %1]", xops);
5249
5250 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5251 xops[0] = adjustment;
5252 xops[1] = probe_offset_value_rtx;
5253 output_asm_insn ("sub\t%0, %0, %1", xops);
5254
5255 /* Branch to start if still more bytes to allocate. */
5256 fputs ("\tb\t", asm_out_file);
5257 assemble_name_raw (asm_out_file, loop_start_lab);
5258 fputc ('\n', asm_out_file);
5259
5260 /* Loop exit: no further probe is needed. */
5261 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5262
5263 /* BASE = BASE - ADJUSTMENT. */
5264 xops[0] = base;
5265 xops[1] = adjustment;
5266 output_asm_insn ("sub\t%0, %0, %1", xops);
5267 return "";
5268 }
5269
5270 /* Determine whether a frame chain needs to be generated. */
5271 static bool
5272 aarch64_needs_frame_chain (void)
5273 {
5274 /* Force a frame chain for EH returns so the return address is at FP+8. */
5275 if (frame_pointer_needed || crtl->calls_eh_return)
5276 return true;
5277
5278 /* A leaf function cannot have calls or write LR. */
5279 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5280
5281 /* Don't use a frame chain in leaf functions if leaf frame pointers
5282 are disabled. */
5283 if (flag_omit_leaf_frame_pointer && is_leaf)
5284 return false;
5285
5286 return aarch64_use_frame_pointer;
5287 }
5288
5289 /* Mark the registers that need to be saved by the callee and calculate
5290 the size of the callee-saved registers area and frame record (both FP
5291 and LR may be omitted). */
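/* In outline, the layout computed below places the frame record (FP at
   offset 0, LR at offset 8) first when a frame chain is emitted, followed by
   the remaining general-purpose callee-saves and then the floating-point/SIMD
   callee-saves, with each group rounded up to a 16-byte boundary. */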
5292 static void
5293 aarch64_layout_frame (void)
5294 {
5295 HOST_WIDE_INT offset = 0;
5296 int regno, last_fp_reg = INVALID_REGNUM;
5297 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5298
5299 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5300
5301 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5302 the mid-end is doing. */
5303 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5304
5305 #define SLOT_NOT_REQUIRED (-2)
5306 #define SLOT_REQUIRED (-1)
5307
5308 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5309 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5310
5311 /* If this is a non-leaf simd function with calls we assume that
5312 at least one of those calls is to a non-simd function and thus
5313 we must save V8 to V23 in the prologue. */
5314
5315 if (simd_function && !crtl->is_leaf)
5316 {
5317 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5318 if (FP_SIMD_SAVED_REGNUM_P (regno))
5319 df_set_regs_ever_live (regno, true);
5320 }
5321
5322 /* First mark all the registers that really need to be saved... */
5323 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5324 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5325
5326 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5327 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5328
5329 /* ... that includes the eh data registers (if needed)... */
5330 if (crtl->calls_eh_return)
5331 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5332 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5333 = SLOT_REQUIRED;
5334
5335 /* ... and any callee saved register that dataflow says is live. */
5336 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5337 if (df_regs_ever_live_p (regno)
5338 && (regno == R30_REGNUM
5339 || !call_used_regs[regno]))
5340 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5341
5342 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5343 if (df_regs_ever_live_p (regno)
5344 && (!call_used_regs[regno]
5345 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5346 {
5347 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5348 last_fp_reg = regno;
5349 }
5350
5351 if (cfun->machine->frame.emit_frame_chain)
5352 {
5353 /* FP and LR are placed in the linkage record. */
5354 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5355 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5356 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5357 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5358 offset = 2 * UNITS_PER_WORD;
5359 }
5360
5361 /* With stack-clash, LR must be saved in non-leaf functions. */
5362 gcc_assert (crtl->is_leaf
5363 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5364 != SLOT_NOT_REQUIRED));
5365
5366 /* Now assign stack slots for them. */
5367 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5368 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5369 {
5370 cfun->machine->frame.reg_offset[regno] = offset;
5371 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5372 cfun->machine->frame.wb_candidate1 = regno;
5373 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5374 cfun->machine->frame.wb_candidate2 = regno;
5375 offset += UNITS_PER_WORD;
5376 }
5377
5378 HOST_WIDE_INT max_int_offset = offset;
5379 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5380 bool has_align_gap = offset != max_int_offset;
5381
5382 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5383 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5384 {
5385 /* If there is an alignment gap between integer and fp callee-saves,
5386 allocate the last fp register to it if possible. */
5387 if (regno == last_fp_reg
5388 && has_align_gap
5389 && !simd_function
5390 && (offset & 8) == 0)
5391 {
5392 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5393 break;
5394 }
5395
5396 cfun->machine->frame.reg_offset[regno] = offset;
5397 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5398 cfun->machine->frame.wb_candidate1 = regno;
5399 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5400 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5401 cfun->machine->frame.wb_candidate2 = regno;
5402 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5403 }
5404
5405 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5406
5407 cfun->machine->frame.saved_regs_size = offset;
5408
5409 HOST_WIDE_INT varargs_and_saved_regs_size
5410 = offset + cfun->machine->frame.saved_varargs_size;
5411
5412 cfun->machine->frame.hard_fp_offset
5413 = aligned_upper_bound (varargs_and_saved_regs_size
5414 + get_frame_size (),
5415 STACK_BOUNDARY / BITS_PER_UNIT);
5416
5417 /* Both these values are already aligned. */
5418 gcc_assert (multiple_p (crtl->outgoing_args_size,
5419 STACK_BOUNDARY / BITS_PER_UNIT));
5420 cfun->machine->frame.frame_size
5421 = (cfun->machine->frame.hard_fp_offset
5422 + crtl->outgoing_args_size);
5423
5424 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5425
5426 cfun->machine->frame.initial_adjust = 0;
5427 cfun->machine->frame.final_adjust = 0;
5428 cfun->machine->frame.callee_adjust = 0;
5429 cfun->machine->frame.callee_offset = 0;
5430
5431 HOST_WIDE_INT max_push_offset = 0;
5432 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5433 max_push_offset = 512;
5434 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5435 max_push_offset = 256;
5436
5437 HOST_WIDE_INT const_size, const_fp_offset;
5438 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5439 && const_size < max_push_offset
5440 && known_eq (crtl->outgoing_args_size, 0))
5441 {
5442 /* Simple, small frame with no outgoing arguments:
5443 stp reg1, reg2, [sp, -frame_size]!
5444 stp reg3, reg4, [sp, 16] */
5445 cfun->machine->frame.callee_adjust = const_size;
5446 }
5447 else if (known_lt (crtl->outgoing_args_size
5448 + cfun->machine->frame.saved_regs_size, 512)
5449 && !(cfun->calls_alloca
5450 && known_lt (cfun->machine->frame.hard_fp_offset,
5451 max_push_offset)))
5452 {
5453 /* Frame with small outgoing arguments:
5454 sub sp, sp, frame_size
5455 stp reg1, reg2, [sp, outgoing_args_size]
5456 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5457 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5458 cfun->machine->frame.callee_offset
5459 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5460 }
5461 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5462 && const_fp_offset < max_push_offset)
5463 {
5464 /* Frame with large outgoing arguments but a small local area:
5465 stp reg1, reg2, [sp, -hard_fp_offset]!
5466 stp reg3, reg4, [sp, 16]
5467 sub sp, sp, outgoing_args_size */
5468 cfun->machine->frame.callee_adjust = const_fp_offset;
5469 cfun->machine->frame.final_adjust
5470 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5471 }
5472 else
5473 {
5474 /* Frame with large local area and outgoing arguments using frame pointer:
5475 sub sp, sp, hard_fp_offset
5476 stp x29, x30, [sp, 0]
5477 add x29, sp, 0
5478 stp reg3, reg4, [sp, 16]
5479 sub sp, sp, outgoing_args_size */
5480 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5481 cfun->machine->frame.final_adjust
5482 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5483 }
5484
5485 cfun->machine->frame.laid_out = true;
5486 }
5487
5488 /* Return true if the register REGNO is saved on entry to
5489 the current function. */
5490
5491 static bool
5492 aarch64_register_saved_on_entry (int regno)
5493 {
5494 return cfun->machine->frame.reg_offset[regno] >= 0;
5495 }
5496
5497 /* Return the next register, from REGNO up to LIMIT, that the callee needs
5498 to save. */
5499
5500 static unsigned
5501 aarch64_next_callee_save (unsigned regno, unsigned limit)
5502 {
5503 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5504 regno ++;
5505 return regno;
5506 }
5507
5508 /* Push the register number REGNO of mode MODE to the stack with write-back
5509 adjusting the stack by ADJUSTMENT. */
5510
5511 static void
5512 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5513 HOST_WIDE_INT adjustment)
5514 {
5515 rtx base_rtx = stack_pointer_rtx;
5516 rtx insn, reg, mem;
5517
5518 reg = gen_rtx_REG (mode, regno);
5519 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5520 plus_constant (Pmode, base_rtx, -adjustment));
5521 mem = gen_frame_mem (mode, mem);
5522
5523 insn = emit_move_insn (mem, reg);
5524 RTX_FRAME_RELATED_P (insn) = 1;
5525 }
5526
5527 /* Generate and return an instruction to store the pair of registers
5528 REG and REG2 of mode MODE to location BASE with write-back adjusting
5529 the stack location BASE by ADJUSTMENT. */
5530
5531 static rtx
5532 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5533 HOST_WIDE_INT adjustment)
5534 {
5535 switch (mode)
5536 {
5537 case E_DImode:
5538 return gen_storewb_pairdi_di (base, base, reg, reg2,
5539 GEN_INT (-adjustment),
5540 GEN_INT (UNITS_PER_WORD - adjustment));
5541 case E_DFmode:
5542 return gen_storewb_pairdf_di (base, base, reg, reg2,
5543 GEN_INT (-adjustment),
5544 GEN_INT (UNITS_PER_WORD - adjustment));
5545 case E_TFmode:
5546 return gen_storewb_pairtf_di (base, base, reg, reg2,
5547 GEN_INT (-adjustment),
5548 GEN_INT (UNITS_PER_VREG - adjustment));
5549 default:
5550 gcc_unreachable ();
5551 }
5552 }
5553
5554 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5555 stack pointer by ADJUSTMENT. */
5556
5557 static void
5558 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5559 {
5560 rtx_insn *insn;
5561 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5562
5563 if (regno2 == INVALID_REGNUM)
5564 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5565
5566 rtx reg1 = gen_rtx_REG (mode, regno1);
5567 rtx reg2 = gen_rtx_REG (mode, regno2);
5568
5569 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5570 reg2, adjustment));
5571 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5572 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5573 RTX_FRAME_RELATED_P (insn) = 1;
5574 }
5575
5576 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5577 adjusting it by ADJUSTMENT afterwards. */
5578
5579 static rtx
5580 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5581 HOST_WIDE_INT adjustment)
5582 {
5583 switch (mode)
5584 {
5585 case E_DImode:
5586 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5587 GEN_INT (UNITS_PER_WORD));
5588 case E_DFmode:
5589 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5590 GEN_INT (UNITS_PER_WORD));
5591 case E_TFmode:
5592 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5593 GEN_INT (UNITS_PER_VREG));
5594 default:
5595 gcc_unreachable ();
5596 }
5597 }
5598
5599 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5600 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5601 into CFI_OPS. */
5602
5603 static void
5604 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5605 rtx *cfi_ops)
5606 {
5607 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5608 rtx reg1 = gen_rtx_REG (mode, regno1);
5609
5610 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5611
5612 if (regno2 == INVALID_REGNUM)
5613 {
5614 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5615 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5616 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5617 }
5618 else
5619 {
5620 rtx reg2 = gen_rtx_REG (mode, regno2);
5621 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5622 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5623 reg2, adjustment));
5624 }
5625 }
5626
5627 /* Generate and return a store pair instruction of mode MODE to store
5628 register REG1 to MEM1 and register REG2 to MEM2. */
5629
5630 static rtx
5631 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5632 rtx reg2)
5633 {
5634 switch (mode)
5635 {
5636 case E_DImode:
5637 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5638
5639 case E_DFmode:
5640 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5641
5642 case E_TFmode:
5643 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5644
5645 default:
5646 gcc_unreachable ();
5647 }
5648 }
5649
5650 /* Generate and return a load pair instruction of mode MODE to load register
5651 REG1 from MEM1 and register REG2 from MEM2. */
5652
5653 static rtx
5654 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5655 rtx mem2)
5656 {
5657 switch (mode)
5658 {
5659 case E_DImode:
5660 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5661
5662 case E_DFmode:
5663 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5664
5665 case E_TFmode:
5666 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5667
5668 default:
5669 gcc_unreachable ();
5670 }
5671 }
5672
5673 /* Return TRUE if return address signing should be enabled for the current
5674 function, otherwise return FALSE. */
5675
5676 bool
5677 aarch64_return_address_signing_enabled (void)
5678 {
5679 /* This function should only be called after the frame has been laid out. */
5680 gcc_assert (cfun->machine->frame.laid_out);
5681
5682 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5683 if its LR is pushed onto stack. */
5684 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5685 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5686 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5687 }
5688
5689 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5690 bool
5691 aarch64_bti_enabled (void)
5692 {
5693 return (aarch64_enable_bti == 1);
5694 }
5695
5696 /* Emit code to save the callee-saved registers from register number START
5697 to LIMIT to the stack at the location starting at offset START_OFFSET,
5698 skipping any write-back candidates if SKIP_WB is true. */
5699
5700 static void
5701 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5702 unsigned start, unsigned limit, bool skip_wb)
5703 {
5704 rtx_insn *insn;
5705 unsigned regno;
5706 unsigned regno2;
5707
5708 for (regno = aarch64_next_callee_save (start, limit);
5709 regno <= limit;
5710 regno = aarch64_next_callee_save (regno + 1, limit))
5711 {
5712 rtx reg, mem;
5713 poly_int64 offset;
5714 int offset_diff;
5715
5716 if (skip_wb
5717 && (regno == cfun->machine->frame.wb_candidate1
5718 || regno == cfun->machine->frame.wb_candidate2))
5719 continue;
5720
5721 if (cfun->machine->reg_is_wrapped_separately[regno])
5722 continue;
5723
5724 reg = gen_rtx_REG (mode, regno);
5725 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5726 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5727 offset));
5728
5729 regno2 = aarch64_next_callee_save (regno + 1, limit);
5730 offset_diff = cfun->machine->frame.reg_offset[regno2]
5731 - cfun->machine->frame.reg_offset[regno];
5732
5733 if (regno2 <= limit
5734 && !cfun->machine->reg_is_wrapped_separately[regno2]
5735 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5736 {
5737 rtx reg2 = gen_rtx_REG (mode, regno2);
5738 rtx mem2;
5739
5740 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5741 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5742 offset));
5743 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5744 reg2));
5745
5746 /* The first part of a frame-related parallel insn is
5747 always assumed to be relevant to the frame
5748 calculations; subsequent parts are only
5749 frame-related if explicitly marked. */
5750 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5751 regno = regno2;
5752 }
5753 else
5754 insn = emit_move_insn (mem, reg);
5755
5756 RTX_FRAME_RELATED_P (insn) = 1;
5757 }
5758 }
5759
5760 /* Emit code to restore the callee registers of mode MODE from register
5761 number START up to and including LIMIT. Restore from the stack offset
5762 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5763 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5764
5765 static void
5766 aarch64_restore_callee_saves (machine_mode mode,
5767 poly_int64 start_offset, unsigned start,
5768 unsigned limit, bool skip_wb, rtx *cfi_ops)
5769 {
5770 rtx base_rtx = stack_pointer_rtx;
5771 unsigned regno;
5772 unsigned regno2;
5773 poly_int64 offset;
5774
5775 for (regno = aarch64_next_callee_save (start, limit);
5776 regno <= limit;
5777 regno = aarch64_next_callee_save (regno + 1, limit))
5778 {
5779 if (cfun->machine->reg_is_wrapped_separately[regno])
5780 continue;
5781
5782 rtx reg, mem;
5783 int offset_diff;
5784
5785 if (skip_wb
5786 && (regno == cfun->machine->frame.wb_candidate1
5787 || regno == cfun->machine->frame.wb_candidate2))
5788 continue;
5789
5790 reg = gen_rtx_REG (mode, regno);
5791 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5792 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5793
5794 regno2 = aarch64_next_callee_save (regno + 1, limit);
5795 offset_diff = cfun->machine->frame.reg_offset[regno2]
5796 - cfun->machine->frame.reg_offset[regno];
5797
5798 if (regno2 <= limit
5799 && !cfun->machine->reg_is_wrapped_separately[regno2]
5800 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5801 {
5802 rtx reg2 = gen_rtx_REG (mode, regno2);
5803 rtx mem2;
5804
5805 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5806 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5807 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5808
5809 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5810 regno = regno2;
5811 }
5812 else
5813 emit_move_insn (reg, mem);
5814 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5815 }
5816 }
5817
5818 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5819 of MODE. */
5820
5821 static inline bool
5822 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5823 {
5824 HOST_WIDE_INT multiple;
5825 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5826 && IN_RANGE (multiple, -8, 7));
5827 }
5828
5829 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5830 of MODE. */
5831
5832 static inline bool
5833 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5834 {
5835 HOST_WIDE_INT multiple;
5836 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5837 && IN_RANGE (multiple, 0, 63));
5838 }
5839
5840 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5841 of MODE. */
5842
5843 bool
5844 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5845 {
5846 HOST_WIDE_INT multiple;
5847 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5848 && IN_RANGE (multiple, -64, 63));
5849 }
5850
5851 /* Return true if OFFSET is a signed 9-bit value. */
5852
5853 bool
5854 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5855 poly_int64 offset)
5856 {
5857 HOST_WIDE_INT const_offset;
5858 return (offset.is_constant (&const_offset)
5859 && IN_RANGE (const_offset, -256, 255));
5860 }
5861
5862 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5863 of MODE. */
5864
5865 static inline bool
5866 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5867 {
5868 HOST_WIDE_INT multiple;
5869 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5870 && IN_RANGE (multiple, -256, 255));
5871 }
5872
5873 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5874 of MODE. */
5875
5876 static inline bool
5877 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5878 {
5879 HOST_WIDE_INT multiple;
5880 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5881 && IN_RANGE (multiple, 0, 4095));
5882 }
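/* For example, with 8-byte DImode accesses, offset_12bit_unsigned_scaled_p
   accepts byte offsets 0, 8, ..., 32760, matching the unsigned scaled 12-bit
   immediate form of LDR/STR. */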
5883
5884 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5885
5886 static sbitmap
5887 aarch64_get_separate_components (void)
5888 {
5889 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5890 bitmap_clear (components);
5891
5892 /* The registers we need saved to the frame. */
5893 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5894 if (aarch64_register_saved_on_entry (regno))
5895 {
5896 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5897 if (!frame_pointer_needed)
5898 offset += cfun->machine->frame.frame_size
5899 - cfun->machine->frame.hard_fp_offset;
5900 /* Check that we can access the stack slot of the register with one
5901 direct load with no adjustments needed. */
5902 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5903 bitmap_set_bit (components, regno);
5904 }
5905
5906 /* Don't mess with the hard frame pointer. */
5907 if (frame_pointer_needed)
5908 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5909
5910 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5911 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5912 /* If registers have been chosen to be stored/restored with
5913 writeback, don't interfere with them to avoid having to output explicit
5914 stack adjustment instructions. */
5915 if (reg2 != INVALID_REGNUM)
5916 bitmap_clear_bit (components, reg2);
5917 if (reg1 != INVALID_REGNUM)
5918 bitmap_clear_bit (components, reg1);
5919
5920 bitmap_clear_bit (components, LR_REGNUM);
5921 bitmap_clear_bit (components, SP_REGNUM);
5922
5923 return components;
5924 }
5925
5926 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5927
5928 static sbitmap
5929 aarch64_components_for_bb (basic_block bb)
5930 {
5931 bitmap in = DF_LIVE_IN (bb);
5932 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5933 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5934 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5935
5936 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5937 bitmap_clear (components);
5938
5939 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5940 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5941 if ((!call_used_regs[regno]
5942 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5943 && (bitmap_bit_p (in, regno)
5944 || bitmap_bit_p (gen, regno)
5945 || bitmap_bit_p (kill, regno)))
5946 {
5947 unsigned regno2, offset, offset2;
5948 bitmap_set_bit (components, regno);
5949
5950 /* If there is a callee-save at an adjacent offset, add it too
5951 to increase the use of LDP/STP. */
5952 offset = cfun->machine->frame.reg_offset[regno];
5953 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
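/* I.e. the register whose 8-byte save slot is expected to share the same
   16-byte chunk: a slot at a 16-byte-aligned offset pairs with the next
   register, otherwise with the previous one. */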
5954
5955 if (regno2 <= LAST_SAVED_REGNUM)
5956 {
5957 offset2 = cfun->machine->frame.reg_offset[regno2];
5958 if ((offset & ~8) == (offset2 & ~8))
5959 bitmap_set_bit (components, regno2);
5960 }
5961 }
5962
5963 return components;
5964 }
5965
5966 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5967 Nothing to do for aarch64. */
5968
5969 static void
5970 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5971 {
5972 }
5973
5974 /* Return the next set bit in BMP from START onwards. Return the total number
5975 of bits in BMP if no set bit is found at or after START. */
5976
5977 static unsigned int
5978 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5979 {
5980 unsigned int nbits = SBITMAP_SIZE (bmp);
5981 if (start == nbits)
5982 return start;
5983
5984 gcc_assert (start < nbits);
5985 for (unsigned int i = start; i < nbits; i++)
5986 if (bitmap_bit_p (bmp, i))
5987 return i;
5988
5989 return nbits;
5990 }
5991
5992 /* Do the work for aarch64_emit_prologue_components and
5993 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5994 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5995 for these components or the epilogue sequence. That is, it determines
5996 whether we should emit stores or loads and what kind of CFA notes to attach
5997 to the insns. Otherwise the logic for the two sequences is very
5998 similar. */
5999
6000 static void
6001 aarch64_process_components (sbitmap components, bool prologue_p)
6002 {
6003 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6004 ? HARD_FRAME_POINTER_REGNUM
6005 : STACK_POINTER_REGNUM);
6006
6007 unsigned last_regno = SBITMAP_SIZE (components);
6008 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6009 rtx_insn *insn = NULL;
6010
6011 while (regno != last_regno)
6012 {
6013 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6014 so DFmode for the vector registers is enough. For simd functions
6015 we want to save the low 128 bits. */
6016 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6017
6018 rtx reg = gen_rtx_REG (mode, regno);
6019 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6020 if (!frame_pointer_needed)
6021 offset += cfun->machine->frame.frame_size
6022 - cfun->machine->frame.hard_fp_offset;
6023 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6024 rtx mem = gen_frame_mem (mode, addr);
6025
6026 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6027 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6028 /* No more registers to handle after REGNO.
6029 Emit a single save/restore and exit. */
6030 if (regno2 == last_regno)
6031 {
6032 insn = emit_insn (set);
6033 RTX_FRAME_RELATED_P (insn) = 1;
6034 if (prologue_p)
6035 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6036 else
6037 add_reg_note (insn, REG_CFA_RESTORE, reg);
6038 break;
6039 }
6040
6041 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6042 /* The next register is not of the same class or its offset is not
6043 mergeable with the current one into a pair. */
6044 if (!satisfies_constraint_Ump (mem)
6045 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6046 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6047 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6048 GET_MODE_SIZE (mode)))
6049 {
6050 insn = emit_insn (set);
6051 RTX_FRAME_RELATED_P (insn) = 1;
6052 if (prologue_p)
6053 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6054 else
6055 add_reg_note (insn, REG_CFA_RESTORE, reg);
6056
6057 regno = regno2;
6058 continue;
6059 }
6060
6061 /* REGNO2 can be saved/restored in a pair with REGNO. */
6062 rtx reg2 = gen_rtx_REG (mode, regno2);
6063 if (!frame_pointer_needed)
6064 offset2 += cfun->machine->frame.frame_size
6065 - cfun->machine->frame.hard_fp_offset;
6066 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6067 rtx mem2 = gen_frame_mem (mode, addr2);
6068 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6069 : gen_rtx_SET (reg2, mem2);
6070
6071 if (prologue_p)
6072 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6073 else
6074 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6075
6076 RTX_FRAME_RELATED_P (insn) = 1;
6077 if (prologue_p)
6078 {
6079 add_reg_note (insn, REG_CFA_OFFSET, set);
6080 add_reg_note (insn, REG_CFA_OFFSET, set2);
6081 }
6082 else
6083 {
6084 add_reg_note (insn, REG_CFA_RESTORE, reg);
6085 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6086 }
6087
6088 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6089 }
6090 }
6091
6092 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6093
6094 static void
6095 aarch64_emit_prologue_components (sbitmap components)
6096 {
6097 aarch64_process_components (components, true);
6098 }
6099
6100 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6101
6102 static void
6103 aarch64_emit_epilogue_components (sbitmap components)
6104 {
6105 aarch64_process_components (components, false);
6106 }
6107
6108 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6109
6110 static void
6111 aarch64_set_handled_components (sbitmap components)
6112 {
6113 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6114 if (bitmap_bit_p (components, regno))
6115 cfun->machine->reg_is_wrapped_separately[regno] = true;
6116 }
6117
6118 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6119 determine the probe offset for alloca. */
6120
6121 static HOST_WIDE_INT
6122 aarch64_stack_clash_protection_alloca_probe_range (void)
6123 {
6124 return STACK_CLASH_CALLER_GUARD;
6125 }
6126
6127
6128 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6129 registers. If POLY_SIZE is not large enough to require a probe this function
6130 will only adjust the stack. When allocating the stack space
6131 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6132 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6133 arguments. If we are, then we ensure that any allocation larger than the ABI
6134 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6135 maintained.
6136
6137 We emit barriers after each stack adjustment to prevent optimizations from
6138 breaking the invariant that we never drop the stack more than a page. This
6139 invariant is needed to make it easier to correctly handle asynchronous
6140 events: e.g. if we were to drop the stack by more than a page and only
6141 emit the probes afterwards, a signal taken somewhere in between would
6142 leave the signal handler unable to know the state of the stack or make
6143 any assumptions about which pages have been probed. */
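/* As an illustration of the thresholds computed below (assuming the default
   64KB guard size and the 1KB STACK_CLASH_CALLER_GUARD buffer): the initial
   adjustment only requires probing once it reaches 64KB - 1KB = 63KB, while
   the final adjustment for the outgoing arguments requires probing once it
   reaches 1KB minus the offset at which LR was saved. */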
6144
6145 static void
6146 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6147 poly_int64 poly_size,
6148 bool frame_related_p,
6149 bool final_adjustment_p)
6150 {
6151 HOST_WIDE_INT guard_size
6152 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6153 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6154 /* When doing the final adjustment for the outgoing argument size we can't
6155 assume that LR was saved at position 0, so subtract its offset from the
6156 ABI safe buffer so that we don't accidentally allow an adjustment that
6157 would result in an allocation larger than the ABI buffer without
6158 probing. */
6159 HOST_WIDE_INT min_probe_threshold
6160 = final_adjustment_p
6161 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6162 : guard_size - guard_used_by_caller;
6163
6164 poly_int64 frame_size = cfun->machine->frame.frame_size;
6165
6166 /* We should always have a positive probe threshold. */
6167 gcc_assert (min_probe_threshold > 0);
6168
6169 if (flag_stack_clash_protection && !final_adjustment_p)
6170 {
6171 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6172 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6173
6174 if (known_eq (frame_size, 0))
6175 {
6176 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6177 }
6178 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6179 && known_lt (final_adjust, guard_used_by_caller))
6180 {
6181 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6182 }
6183 }
6184
6185 /* If SIZE is not large enough to require probing, just adjust the stack and
6186 exit. */
6187 if (known_lt (poly_size, min_probe_threshold)
6188 || !flag_stack_clash_protection)
6189 {
6190 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6191 return;
6192 }
6193
6194 HOST_WIDE_INT size;
6195 /* Handle the SVE non-constant case first. */
6196 if (!poly_size.is_constant (&size))
6197 {
6198 if (dump_file)
6199 {
6200 fprintf (dump_file, "Stack clash SVE prologue: ");
6201 print_dec (poly_size, dump_file);
6202 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6203 }
6204
6205 /* First calculate the amount of bytes we're actually spilling. */
6206 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6207 poly_size, temp1, temp2, false, true);
6208
6209 rtx_insn *insn = get_last_insn ();
6210
6211 if (frame_related_p)
6212 {
6213 /* This is done to provide unwinding information for the stack
6214 adjustments we're about to do; however, to prevent the optimizers
6215 from removing the R11 move and leaving the CFA note (which would be
6216 very wrong) we tie the old and new stack pointer together.
6217 The tie will expand to nothing but the optimizers will not touch
6218 the instruction. */
6219 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6220 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6221 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6222
6223 /* We want the CFA independent of the stack pointer for the
6224 duration of the loop. */
6225 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6226 RTX_FRAME_RELATED_P (insn) = 1;
6227 }
6228
6229 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6230 rtx guard_const = gen_int_mode (guard_size, Pmode);
6231
6232 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6233 stack_pointer_rtx, temp1,
6234 probe_const, guard_const));
6235
6236 /* Now reset the CFA register if needed. */
6237 if (frame_related_p)
6238 {
6239 add_reg_note (insn, REG_CFA_DEF_CFA,
6240 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6241 gen_int_mode (poly_size, Pmode)));
6242 RTX_FRAME_RELATED_P (insn) = 1;
6243 }
6244
6245 return;
6246 }
6247
6248 if (dump_file)
6249 fprintf (dump_file,
6250 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6251 " bytes, probing will be required.\n", size);
6252
6253 /* Round size to the nearest multiple of guard_size, and calculate the
6254 residual as the difference between the original size and the rounded
6255 size. */
6256 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6257 HOST_WIDE_INT residual = size - rounded_size;
6258
6259 /* We can handle a small number of allocations/probes inline. Otherwise
6260 punt to a loop. */
6261 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6262 {
6263 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6264 {
6265 aarch64_sub_sp (NULL, temp2, guard_size, true);
6266 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6267 guard_used_by_caller));
6268 emit_insn (gen_blockage ());
6269 }
6270 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6271 }
6272 else
6273 {
6274 /* Compute the ending address. */
6275 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6276 temp1, NULL, false, true);
6277 rtx_insn *insn = get_last_insn ();
6278
6279 /* For the initial allocation, we don't have a frame pointer
6280 set up, so we always need CFI notes. If we're doing the
6281 final allocation, then we may have a frame pointer, in which
6282 case it is the CFA, otherwise we need CFI notes.
6283
6284 We can determine which allocation we are doing by looking at
6285 the value of FRAME_RELATED_P since the final allocations are not
6286 frame related. */
6287 if (frame_related_p)
6288 {
6289 /* We want the CFA independent of the stack pointer for the
6290 duration of the loop. */
6291 add_reg_note (insn, REG_CFA_DEF_CFA,
6292 plus_constant (Pmode, temp1, rounded_size));
6293 RTX_FRAME_RELATED_P (insn) = 1;
6294 }
6295
6296 /* This allocates and probes the stack. Note that this re-uses some of
6297 the existing Ada stack protection code. However we are guaranteed not
6298 to enter the non-loop or residual branches of that code.
6299
6300 The non-loop part won't be entered because if our allocation amount
6301 doesn't require a loop, the case above would handle it.
6302
6303 The residual amount won't be entered because TEMP1 is a multiple of
6304 the allocation size. The residual will always be 0. As such, the only
6305 part we are actually using from that code is the loop setup. The
6306 actual probing is done in aarch64_output_probe_stack_range. */
6307 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6308 stack_pointer_rtx, temp1));
6309
6310 /* Now reset the CFA register if needed. */
6311 if (frame_related_p)
6312 {
6313 add_reg_note (insn, REG_CFA_DEF_CFA,
6314 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6315 RTX_FRAME_RELATED_P (insn) = 1;
6316 }
6317
6318 emit_insn (gen_blockage ());
6319 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6320 }
6321
6322 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6323 be probed. This maintains the requirement that each page is probed at
6324 least once. For initial probing we probe only if the allocation is
6325 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6326 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6327 GUARD_SIZE. This means that for any allocation large enough to
6328 trigger a probe here, we'll emit at least one, and for allocations too
6329 small for this code to emit anything, the page will already have been
6330 probed by the saving of FP/LR, either by this function or any callees. If
6331 we don't have any callees then we won't have more stack adjustments and so
6332 are still safe. */
6333 if (residual)
6334 {
6335 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6336 /* If we're doing final adjustments, and we've done any full page
6337 allocations then any residual needs to be probed. */
6338 if (final_adjustment_p && rounded_size != 0)
6339 min_probe_threshold = 0;
6340 /* If doing a small final adjustment, we always probe at offset 0.
6341 This is done to avoid issues when LR is not at position 0 or when
6342 the final adjustment is smaller than the probing offset. */
6343 else if (final_adjustment_p && rounded_size == 0)
6344 residual_probe_offset = 0;
6345
6346 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6347 if (residual >= min_probe_threshold)
6348 {
6349 if (dump_file)
6350 fprintf (dump_file,
6351 "Stack clash AArch64 prologue residuals: "
6352 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6353 "\n", residual);
6354
6355 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6356 residual_probe_offset));
6357 emit_insn (gen_blockage ());
6358 }
6359 }
6360 }
6361
6362 /* Return 1 if the register is used by the epilogue. We need to say the
6363 return register is used, but only after epilogue generation is complete.
6364 Note that in the case of sibcalls, the values "used by the epilogue" are
6365 considered live at the start of the called function.
6366
6367 For SIMD functions we need to return 1 for FP registers that are saved and
6368 restored by a function but are nonzero in call_used_regs. If we do not do
6369 this, optimizations may remove the restore of the register. */
6370
6371 int
6372 aarch64_epilogue_uses (int regno)
6373 {
6374 if (epilogue_completed)
6375 {
6376 if (regno == LR_REGNUM)
6377 return 1;
6378 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6379 return 1;
6380 }
6381 return 0;
6382 }
6383
6384 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6385 is saved at BASE + OFFSET. */
6386
6387 static void
6388 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6389 rtx base, poly_int64 offset)
6390 {
6391 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6392 add_reg_note (insn, REG_CFA_EXPRESSION,
6393 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6394 }
6395
6396 /* AArch64 stack frames generated by this compiler look like:
6397
6398 +-------------------------------+
6399 | |
6400 | incoming stack arguments |
6401 | |
6402 +-------------------------------+
6403 | | <-- incoming stack pointer (aligned)
6404 | callee-allocated save area |
6405 | for register varargs |
6406 | |
6407 +-------------------------------+
6408 | local variables | <-- frame_pointer_rtx
6409 | |
6410 +-------------------------------+
6411 | padding | \
6412 +-------------------------------+ |
6413 | callee-saved registers | | frame.saved_regs_size
6414 +-------------------------------+ |
6415 | LR' | |
6416 +-------------------------------+ |
6417 | FP' | / <- hard_frame_pointer_rtx (aligned)
6418 +-------------------------------+
6419 | dynamic allocation |
6420 +-------------------------------+
6421 | padding |
6422 +-------------------------------+
6423 | outgoing stack arguments | <-- arg_pointer
6424 | |
6425 +-------------------------------+
6426 | | <-- stack_pointer_rtx (aligned)
6427
6428 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6429 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6430 unchanged.
6431
6432 By default for stack-clash we assume the guard is at least 64KB, but this
6433 value is configurable to either 4KB or 64KB. We also force the guard size to
6434 be the same as the probing interval and both values are kept in sync.
6435
6436 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6437 on the guard size) of stack space without probing.
6438
6439 When probing is needed, we emit a probe at the start of the prologue
6440 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6441
6442 We have to track how much space has been allocated and the only stores
6443 to the stack we track as implicit probes are the FP/LR stores.
6444
6445 For outgoing arguments we probe if the size is larger than 1KB, such that
6446 the ABI specified buffer is maintained for the next callee.
6447
6448 The following registers are reserved during frame layout and should not be
6449 used for any other purpose:
6450
6451 - r11: Used by stack clash protection when SVE is enabled.
6452 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6453 - r14 and r15: Used for speculation tracking.
6454 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6455 - r30(LR), r29(FP): Used by standard frame layout.
6456
6457 These registers must be avoided in frame layout related code unless the
6458 explicit intention is to interact with one of the features listed above. */
6459
6460 /* Generate the prologue instructions for entry into a function.
6461 Establish the stack frame by decreasing the stack pointer with a
6462 properly calculated size and, if necessary, create a frame record
6463 filled with the values of LR and previous frame pointer. The
6464 current FP is also set up if it is in use. */
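/* In outline, the sequence emitted below is:

   1) sign the return address, if enabled;
   2) emit any -fstack-check probes;
   3) allocate (and possibly probe) the initial adjustment;
   4) push the write-back candidate registers, if any;
   5) set up the frame chain (FP/LR), when one is needed;
   6) save the remaining callee-saved registers;
   7) allocate (and possibly probe) the final adjustment for the outgoing
      arguments. */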
6465
6466 void
6467 aarch64_expand_prologue (void)
6468 {
6469 poly_int64 frame_size = cfun->machine->frame.frame_size;
6470 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6471 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6472 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6473 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6474 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6475 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6476 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6477 rtx_insn *insn;
6478
6479 /* Sign return address for functions. */
6480 if (aarch64_return_address_signing_enabled ())
6481 {
6482 switch (aarch64_ra_sign_key)
6483 {
6484 case AARCH64_KEY_A:
6485 insn = emit_insn (gen_paciasp ());
6486 break;
6487 case AARCH64_KEY_B:
6488 insn = emit_insn (gen_pacibsp ());
6489 break;
6490 default:
6491 gcc_unreachable ();
6492 }
6493 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6494 RTX_FRAME_RELATED_P (insn) = 1;
6495 }
6496
6497 if (flag_stack_usage_info)
6498 current_function_static_stack_size = constant_lower_bound (frame_size);
6499
6500 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6501 {
6502 if (crtl->is_leaf && !cfun->calls_alloca)
6503 {
6504 if (maybe_gt (frame_size, PROBE_INTERVAL)
6505 && maybe_gt (frame_size, get_stack_check_protect ()))
6506 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6507 (frame_size
6508 - get_stack_check_protect ()));
6509 }
6510 else if (maybe_gt (frame_size, 0))
6511 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6512 }
6513
6514 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6515 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6516
6517 /* In theory we should never have both an initial adjustment
6518 and a callee save adjustment. Verify that is the case since the
6519 code below does not handle it for -fstack-clash-protection. */
6520 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6521
6522 /* Will only probe if the initial adjustment is larger than the guard
6523 less the amount of the guard reserved for use by the caller's
6524 outgoing args. */
6525 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6526 true, false);
6527
6528 if (callee_adjust != 0)
6529 aarch64_push_regs (reg1, reg2, callee_adjust);
6530
6531 if (emit_frame_chain)
6532 {
6533 poly_int64 reg_offset = callee_adjust;
6534 if (callee_adjust == 0)
6535 {
6536 reg1 = R29_REGNUM;
6537 reg2 = R30_REGNUM;
6538 reg_offset = callee_offset;
6539 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6540 }
6541 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6542 stack_pointer_rtx, callee_offset,
6543 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6544 if (frame_pointer_needed && !frame_size.is_constant ())
6545 {
6546 /* Variable-sized frames need to describe the save slot
6547 address using DW_CFA_expression rather than DW_CFA_offset.
6548 This means that, without taking further action, the
6549 locations of the registers that we've already saved would
6550 remain based on the stack pointer even after we redefine
6551 the CFA based on the frame pointer. We therefore need new
6552 DW_CFA_expressions to re-express the save slots with addresses
6553 based on the frame pointer. */
6554 rtx_insn *insn = get_last_insn ();
6555 gcc_assert (RTX_FRAME_RELATED_P (insn));
6556
6557 /* Add an explicit CFA definition if this was previously
6558 implicit. */
6559 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6560 {
6561 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6562 callee_offset);
6563 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6564 gen_rtx_SET (hard_frame_pointer_rtx, src));
6565 }
6566
6567 /* Change the save slot expressions for the registers that
6568 we've already saved. */
6569 reg_offset -= callee_offset;
6570 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6571 reg_offset + UNITS_PER_WORD);
6572 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6573 reg_offset);
6574 }
6575 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6576 }
6577
6578 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6579 callee_adjust != 0 || emit_frame_chain);
6580 if (aarch64_simd_decl_p (cfun->decl))
6581 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6582 callee_adjust != 0 || emit_frame_chain);
6583 else
6584 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6585 callee_adjust != 0 || emit_frame_chain);
6586
6587 /* We may need to probe the final adjustment if it is larger than the guard
6588 that is assumed by the callee. */
6589 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6590 !frame_pointer_needed, true);
6591 }
6592
6593 /* Return TRUE if we can use a simple_return insn.
6594
6595 This function checks whether the callee-saved stack is empty, which
6596 means no restore actions are needed. The pro_and_epilogue pass will use
6597 this to check whether the shrink-wrapping optimization is feasible. */
6598
6599 bool
6600 aarch64_use_return_insn_p (void)
6601 {
6602 if (!reload_completed)
6603 return false;
6604
6605 if (crtl->profile)
6606 return false;
6607
6608 return known_eq (cfun->machine->frame.frame_size, 0);
6609 }
6610
6611 /* Return false for non-leaf SIMD functions in order to avoid
6612 shrink-wrapping them, which would lose the necessary
6613 save/restore of FP registers. */
6614
6615 bool
6616 aarch64_use_simple_return_insn_p (void)
6617 {
6618 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6619 return false;
6620
6621 return true;
6622 }
6623
6624 /* Generate the epilogue instructions for returning from a function.
6625 This is almost exactly the reverse of the prolog sequence, except
6626 that we need to insert barriers to avoid scheduling loads that read
6627 from a deallocated stack, and we optimize the unwind records by
6628 emitting them all together if possible. */
6629 void
6630 aarch64_expand_epilogue (bool for_sibcall)
6631 {
6632 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6633 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6634 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6635 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6636 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6637 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6638 rtx cfi_ops = NULL;
6639 rtx_insn *insn;
6640 /* A stack clash protection prologue may not have left EP0_REGNUM or
6641 EP1_REGNUM in a usable state. The same is true for allocations
6642 with an SVE component, since we then need both temporary registers
6643 for each allocation. For stack clash we are in a usable state if
6644 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6645 HOST_WIDE_INT guard_size
6646 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6647 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6648
6649 /* We can re-use the registers when the allocation amount is smaller than
6650 guard_size - guard_used_by_caller because we won't be doing any probes
6651 then. In such situations the register should remain live with the correct
6652 value. */
6653 bool can_inherit_p = (initial_adjust.is_constant ()
6654 && final_adjust.is_constant ())
6655 && (!flag_stack_clash_protection
6656 || known_lt (initial_adjust,
6657 guard_size - guard_used_by_caller));
6658
6659 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6660 bool need_barrier_p
6661 = maybe_ne (get_frame_size ()
6662 + cfun->machine->frame.saved_varargs_size, 0);
6663
6664 /* Emit a barrier to prevent loads from a deallocated stack. */
6665 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6666 || cfun->calls_alloca
6667 || crtl->calls_eh_return)
6668 {
6669 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6670 need_barrier_p = false;
6671 }
6672
6673 /* Restore the stack pointer from the frame pointer if it may not
6674 be the same as the stack pointer. */
6675 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6676 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6677 if (frame_pointer_needed
6678 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6679 /* If writeback is used when restoring callee-saves, the CFA
6680 is restored on the instruction doing the writeback. */
6681 aarch64_add_offset (Pmode, stack_pointer_rtx,
6682 hard_frame_pointer_rtx, -callee_offset,
6683 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6684 else
6685 /* The case where we need to re-use the register here is very rare, so
6686 avoid the complicated condition and just always emit a move if the
6687 immediate doesn't fit. */
6688 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6689
6690 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6691 callee_adjust != 0, &cfi_ops);
6692 if (aarch64_simd_decl_p (cfun->decl))
6693 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6694 callee_adjust != 0, &cfi_ops);
6695 else
6696 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6697 callee_adjust != 0, &cfi_ops);
6698
6699 if (need_barrier_p)
6700 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6701
6702 if (callee_adjust != 0)
6703 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6704
6705 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6706 {
6707 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6708 insn = get_last_insn ();
6709 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6710 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6711 RTX_FRAME_RELATED_P (insn) = 1;
6712 cfi_ops = NULL;
6713 }
6714
6715 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6716 restrict the emit_move optimization to leaf functions. */
6717 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6718 (!can_inherit_p || !crtl->is_leaf
6719 || df_regs_ever_live_p (EP0_REGNUM)));
6720
6721 if (cfi_ops)
6722 {
6723 /* Emit delayed restores and reset the CFA to be SP. */
6724 insn = get_last_insn ();
6725 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6726 REG_NOTES (insn) = cfi_ops;
6727 RTX_FRAME_RELATED_P (insn) = 1;
6728 }
6729
6730 /* We prefer to emit the combined return/authenticate instruction RETAA;
6731 however, there are three cases in which we must instead emit an explicit
6732 authentication instruction.
6733
6734 1) Sibcalls don't return in a normal way, so if we're about to call one
6735 we must authenticate.
6736
6737 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6738 generating code for !TARGET_ARMV8_3 we can't use it and must
6739 explicitly authenticate.
6740
6741 3) On an eh_return path we make extra stack adjustments to update the
6742 canonical frame address to be the exception handler's CFA. We want
6743 to authenticate using the CFA of the function which calls eh_return.
6744 */
6745 if (aarch64_return_address_signing_enabled ()
6746 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6747 {
6748 switch (aarch64_ra_sign_key)
6749 {
6750 case AARCH64_KEY_A:
6751 insn = emit_insn (gen_autiasp ());
6752 break;
6753 case AARCH64_KEY_B:
6754 insn = emit_insn (gen_autibsp ());
6755 break;
6756 default:
6757 gcc_unreachable ();
6758 }
6759 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6760 RTX_FRAME_RELATED_P (insn) = 1;
6761 }
6762
6763 /* Stack adjustment for exception handler. */
6764 if (crtl->calls_eh_return && !for_sibcall)
6765 {
6766 /* We need to unwind the stack by the offset computed by
6767 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6768 to be SP; letting the CFA move during this adjustment
6769 is just as correct as retaining the CFA from the body
6770 of the function. Therefore, do nothing special. */
6771 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6772 }
6773
6774 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6775 if (!for_sibcall)
6776 emit_jump_insn (ret_rtx);
6777 }
6778
6779 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6780 normally or return to a previous frame after unwinding.
6781
6782 An EH return uses a single shared return sequence. The epilogue is
6783 exactly like a normal epilogue except that it has an extra input
6784 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6785 that must be applied after the frame has been destroyed. An extra label
6786 is inserted before the epilogue which initializes this register to zero,
6787 and this is the entry point for a normal return.
6788
6789 An actual EH return updates the return address, initializes the stack
6790 adjustment and jumps directly into the epilogue (bypassing the zeroing
6791 of the adjustment). Since the return address is typically saved on the
6792 stack when a function makes a call, the saved LR must be updated outside
6793 the epilogue.
6794
6795 This poses problems as the store is generated well before the epilogue,
6796 so the offset of LR is not known yet. Also, optimizations will remove the
6797 store as it appears dead, even after the epilogue is generated (as the
6798 base or offset for loading LR is different in many cases).
6799
6800 To avoid these problems this implementation forces the frame pointer
6801 in eh_return functions so that the location of LR is fixed and known early.
6802 It also marks the store volatile, so no optimization is permitted to
6803 remove the store. */
6804 rtx
6805 aarch64_eh_return_handler_rtx (void)
6806 {
6807 rtx tmp = gen_frame_mem (Pmode,
6808 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6809
6810 /* Mark the store volatile, so no optimization is permitted to remove it. */
6811 MEM_VOLATILE_P (tmp) = true;
6812 return tmp;
6813 }
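
/* Illustrative note: with the frame pointer forced, the saved LR sits one
   word above the saved FP, so the RTX built above is roughly
     (mem/v:DI (plus:DI (reg:DI x29) (const_int 8)))
   given that UNITS_PER_WORD is 8 on AArch64.  */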
6814
6815 /* Output code to add DELTA to the first argument, and then jump
6816 to FUNCTION. Used for C++ multiple inheritance. */
6817 static void
6818 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6819 HOST_WIDE_INT delta,
6820 HOST_WIDE_INT vcall_offset,
6821 tree function)
6822 {
6823 /* The this pointer is always in x0. Note that this differs from
6824 Arm where the this pointer may be bumped to r1 if r0 is required
6825 to return a pointer to an aggregate. On AArch64 a result value
6826 pointer will be in x8. */
6827 int this_regno = R0_REGNUM;
6828 rtx this_rtx, temp0, temp1, addr, funexp;
6829 rtx_insn *insn;
6830 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6831
6832 if (aarch64_bti_enabled ())
6833 emit_insn (gen_bti_c());
6834
6835 reload_completed = 1;
6836 emit_note (NOTE_INSN_PROLOGUE_END);
6837
6838 this_rtx = gen_rtx_REG (Pmode, this_regno);
6839 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6840 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6841
6842 if (vcall_offset == 0)
6843 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6844 else
6845 {
6846 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6847
6848 addr = this_rtx;
6849 if (delta != 0)
6850 {
6851 if (delta >= -256 && delta < 256)
6852 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6853 plus_constant (Pmode, this_rtx, delta));
6854 else
6855 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6856 temp1, temp0, false);
6857 }
6858
6859 if (Pmode == ptr_mode)
6860 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6861 else
6862 aarch64_emit_move (temp0,
6863 gen_rtx_ZERO_EXTEND (Pmode,
6864 gen_rtx_MEM (ptr_mode, addr)));
6865
6866 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6867 addr = plus_constant (Pmode, temp0, vcall_offset);
6868 else
6869 {
6870 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6871 Pmode);
6872 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6873 }
6874
6875 if (Pmode == ptr_mode)
6876 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6877 else
6878 aarch64_emit_move (temp1,
6879 gen_rtx_SIGN_EXTEND (Pmode,
6880 gen_rtx_MEM (ptr_mode, addr)));
6881
6882 emit_insn (gen_add2_insn (this_rtx, temp1));
6883 }
6884
6885 /* Generate a tail call to the target function. */
6886 if (!TREE_USED (function))
6887 {
6888 assemble_external (function);
6889 TREE_USED (function) = 1;
6890 }
6891 funexp = XEXP (DECL_RTL (function), 0);
6892 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6893 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6894 SIBLING_CALL_P (insn) = 1;
6895
6896 insn = get_insns ();
6897 shorten_branches (insn);
6898
6899 assemble_start_function (thunk, fnname);
6900 final_start_function (insn, file, 1);
6901 final (insn, file, 1);
6902 final_end_function ();
6903 assemble_end_function (thunk, fnname);
6904
6905 /* Stop pretending to be a post-reload pass. */
6906 reload_completed = 0;
6907 }
6908
6909 static bool
6910 aarch64_tls_referenced_p (rtx x)
6911 {
6912 if (!TARGET_HAVE_TLS)
6913 return false;
6914 subrtx_iterator::array_type array;
6915 FOR_EACH_SUBRTX (iter, array, x, ALL)
6916 {
6917 const_rtx x = *iter;
6918 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6919 return true;
6920 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6921 TLS offsets, not real symbol references. */
6922 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6923 iter.skip_subrtxes ();
6924 }
6925 return false;
6926 }
6927
6928
6929 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6930 a left shift of 0 or 12 bits. */
6931 bool
6932 aarch64_uimm12_shift (HOST_WIDE_INT val)
6933 {
6934 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6935 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6936 );
6937 }
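
/* For example (illustrative values):
     aarch64_uimm12_shift (0xabc)    -> true   (fits in the low 12 bits)
     aarch64_uimm12_shift (0xabc000) -> true   (a 12-bit value shifted left by 12)
     aarch64_uimm12_shift (0x1001)   -> false  (needs bits from both halves)  */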
6938
6939 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6940 that can be created with a left shift of 0 or 12. */
6941 static HOST_WIDE_INT
6942 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6943 {
6944 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6945 handle correctly. */
6946 gcc_assert ((val & 0xffffff) == val);
6947
6948 if (((val & 0xfff) << 0) == val)
6949 return val;
6950
6951 return val & (0xfff << 12);
6952 }
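
/* Illustrative example: 0x123456 does not fit in the low 12 bits, so the
   function above returns 0x123456 & (0xfff << 12) == 0x123000, which a single
   add/sub immediate can handle; the remaining 0x456 must be added separately.  */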
6953
6954 /* Return true if val is an immediate that can be loaded into a
6955 register by a MOVZ instruction. */
6956 static bool
6957 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6958 {
6959 if (GET_MODE_SIZE (mode) > 4)
6960 {
6961 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6962 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6963 return true;
6964 }
6965 else
6966 {
6967 /* Ignore sign extension. */
6968 val &= (HOST_WIDE_INT) 0xffffffff;
6969 }
6970 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6971 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6972 }
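
/* For example (illustrative DImode values):
     0x12340000         -> true   (MOVZ xN, #0x1234, lsl #16)
     0x0000123400000000 -> true   (MOVZ xN, #0x1234, lsl #32)
     0x12345678         -> false  (would also need a MOVK)
   Only the MOVZ forms are tested here; the MOVN case is handled by callers
   passing ~val, as in aarch64_move_imm below.  */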
6973
6974 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6975 64-bit (DImode) integer. */
6976
6977 static unsigned HOST_WIDE_INT
6978 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6979 {
6980 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6981 while (size < 64)
6982 {
6983 val &= (HOST_WIDE_INT_1U << size) - 1;
6984 val |= val << size;
6985 size *= 2;
6986 }
6987 return val;
6988 }
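
/* For example (illustrative): replicating the QImode value 0xa5 gives
   0xa5a5a5a5a5a5a5a5, and replicating the HImode value 0x1234 gives
   0x1234123412341234.  */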
6989
6990 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6991
6992 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6993 {
6994 0x0000000100000001ull,
6995 0x0001000100010001ull,
6996 0x0101010101010101ull,
6997 0x1111111111111111ull,
6998 0x5555555555555555ull,
6999 };
7000
7001
7002 /* Return true if val is a valid bitmask immediate. */
7003
7004 bool
7005 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7006 {
7007 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7008 int bits;
7009
7010 /* Check for a single sequence of one bits and return quickly if so.
7011 The special cases of all ones and all zeroes return false. */
7012 val = aarch64_replicate_bitmask_imm (val_in, mode);
7013 tmp = val + (val & -val);
7014
7015 if (tmp == (tmp & -tmp))
7016 return (val + 1) > 1;
7017
7018 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7019 if (mode == SImode)
7020 val = (val << 32) | (val & 0xffffffff);
7021
7022 /* Invert if the immediate doesn't start with a zero bit - this means we
7023 only need to search for sequences of one bits. */
7024 if (val & 1)
7025 val = ~val;
7026
7027 /* Find the first set bit and set tmp to val with the first sequence of one
7028 bits removed. Return success if there is a single sequence of ones. */
7029 first_one = val & -val;
7030 tmp = val & (val + first_one);
7031
7032 if (tmp == 0)
7033 return true;
7034
7035 /* Find the next set bit and compute the difference in bit position. */
7036 next_one = tmp & -tmp;
7037 bits = clz_hwi (first_one) - clz_hwi (next_one);
7038 mask = val ^ tmp;
7039
7040 /* Check the bit position difference is a power of 2, and that the first
7041 sequence of one bits fits within 'bits' bits. */
7042 if ((mask >> bits) != 0 || bits != (bits & -bits))
7043 return false;
7044
7045 /* Check the sequence of one bits is repeated 64/bits times. */
7046 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7047 }
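
/* Worked example (illustrative, DImode): 0x00ff00ff00ff00ff is not a single
   run of ones, and since bit 0 is set it is inverted to 0xff00ff00ff00ff00.
   The first run of ones is 8 bits wide and the next run starts 16 bits later,
   so bits == 16 and mask == 0xff00; the final check
   0xff00 * 0x0001000100010001 == 0xff00ff00ff00ff00 succeeds, so the value is
   accepted as a bitmask immediate.  */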
7048
7049 /* Create a mask of ones covering the range from the lowest to the highest
7050 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7051
7052 unsigned HOST_WIDE_INT
7053 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7054 {
7055 int lowest_bit_set = ctz_hwi (val_in);
7056 int highest_bit_set = floor_log2 (val_in);
7057 gcc_assert (val_in != 0);
7058
7059 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7060 (HOST_WIDE_INT_1U << lowest_bit_set));
7061 }
7062
7063 /* Create a constant in which all bits outside the range from the lowest set
7064 bit to the highest set bit of VAL_IN are set to 1. */
7065
7066 unsigned HOST_WIDE_INT
7067 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7068 {
7069 return val_in | ~aarch64_and_split_imm1 (val_in);
7070 }
7071
7072 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7073
7074 bool
7075 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7076 {
7077 scalar_int_mode int_mode;
7078 if (!is_a <scalar_int_mode> (mode, &int_mode))
7079 return false;
7080
7081 if (aarch64_bitmask_imm (val_in, int_mode))
7082 return false;
7083
7084 if (aarch64_move_imm (val_in, int_mode))
7085 return false;
7086
7087 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7088
7089 return aarch64_bitmask_imm (imm2, int_mode);
7090 }
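
/* Illustrative example: 0x000ff00000003000 is neither a bitmask nor a MOV
   immediate, but aarch64_and_split_imm1 gives 0x000ffffffffff000 (ones from
   bit 12 to bit 51) and aarch64_and_split_imm2 gives 0xfffff00000003fff (a
   rotated run of ones), both valid bitmask immediates whose intersection is
   the original value, so "x & 0x000ff00000003000" can be expanded as two
   AND-immediate instructions.  */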
7091
7092 /* Return true if val is an immediate that can be loaded into a
7093 register in a single instruction. */
7094 bool
7095 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7096 {
7097 scalar_int_mode int_mode;
7098 if (!is_a <scalar_int_mode> (mode, &int_mode))
7099 return false;
7100
7101 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7102 return true;
7103 return aarch64_bitmask_imm (val, int_mode);
7104 }
7105
7106 static bool
7107 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7108 {
7109 rtx base, offset;
7110
7111 if (GET_CODE (x) == HIGH)
7112 return true;
7113
7114 /* There's no way to calculate VL-based values using relocations. */
7115 subrtx_iterator::array_type array;
7116 FOR_EACH_SUBRTX (iter, array, x, ALL)
7117 if (GET_CODE (*iter) == CONST_POLY_INT)
7118 return true;
7119
7120 split_const (x, &base, &offset);
7121 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7122 {
7123 if (aarch64_classify_symbol (base, INTVAL (offset))
7124 != SYMBOL_FORCE_TO_MEM)
7125 return true;
7126 else
7127 /* Avoid generating a 64-bit relocation in ILP32; leave
7128 to aarch64_expand_mov_immediate to handle it properly. */
7129 return mode != ptr_mode;
7130 }
7131
7132 return aarch64_tls_referenced_p (x);
7133 }
7134
7135 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7136 The expansion for a table switch is quite expensive due to the number
7137 of instructions, the table lookup and the hard-to-predict indirect jump.
7138 When optimizing for speed with -O3 enabled, use the per-core tuning if
7139 set; otherwise use tables for more than 16 cases as a tradeoff between
7140 size and performance. When optimizing for size, use the default setting. */
7141
7142 static unsigned int
7143 aarch64_case_values_threshold (void)
7144 {
7145 /* Use the specified limit for the number of cases before using jump
7146 tables at higher optimization levels. */
7147 if (optimize > 2
7148 && selected_cpu->tune->max_case_values != 0)
7149 return selected_cpu->tune->max_case_values;
7150 else
7151 return optimize_size ? default_case_values_threshold () : 17;
7152 }
7153
7154 /* Return true if register REGNO is a valid index register.
7155 STRICT_P is true if REG_OK_STRICT is in effect. */
7156
7157 bool
7158 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7159 {
7160 if (!HARD_REGISTER_NUM_P (regno))
7161 {
7162 if (!strict_p)
7163 return true;
7164
7165 if (!reg_renumber)
7166 return false;
7167
7168 regno = reg_renumber[regno];
7169 }
7170 return GP_REGNUM_P (regno);
7171 }
7172
7173 /* Return true if register REGNO is a valid base register.
7174 STRICT_P is true if REG_OK_STRICT is in effect. */
7175
7176 bool
7177 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7178 {
7179 if (!HARD_REGISTER_NUM_P (regno))
7180 {
7181 if (!strict_p)
7182 return true;
7183
7184 if (!reg_renumber)
7185 return false;
7186
7187 regno = reg_renumber[regno];
7188 }
7189
7190 /* The fake registers will be eliminated to either the stack or
7191 hard frame pointer, both of which are usually valid base registers.
7192 Reload deals with the cases where the eliminated form isn't valid. */
7193 return (GP_REGNUM_P (regno)
7194 || regno == SP_REGNUM
7195 || regno == FRAME_POINTER_REGNUM
7196 || regno == ARG_POINTER_REGNUM);
7197 }
7198
7199 /* Return true if X is a valid base register.
7200 STRICT_P is true if REG_OK_STRICT is in effect. */
7201
7202 static bool
7203 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7204 {
7205 if (!strict_p
7206 && GET_CODE (x) == SUBREG
7207 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7208 x = SUBREG_REG (x);
7209
7210 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7211 }
7212
7213 /* Return true if address offset is a valid index. If it is, fill in INFO
7214 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7215
7216 static bool
7217 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7218 machine_mode mode, bool strict_p)
7219 {
7220 enum aarch64_address_type type;
7221 rtx index;
7222 int shift;
7223
7224 /* (reg:P) */
7225 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7226 && GET_MODE (x) == Pmode)
7227 {
7228 type = ADDRESS_REG_REG;
7229 index = x;
7230 shift = 0;
7231 }
7232 /* (sign_extend:DI (reg:SI)) */
7233 else if ((GET_CODE (x) == SIGN_EXTEND
7234 || GET_CODE (x) == ZERO_EXTEND)
7235 && GET_MODE (x) == DImode
7236 && GET_MODE (XEXP (x, 0)) == SImode)
7237 {
7238 type = (GET_CODE (x) == SIGN_EXTEND)
7239 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7240 index = XEXP (x, 0);
7241 shift = 0;
7242 }
7243 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7244 else if (GET_CODE (x) == MULT
7245 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7246 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7247 && GET_MODE (XEXP (x, 0)) == DImode
7248 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7249 && CONST_INT_P (XEXP (x, 1)))
7250 {
7251 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7252 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7253 index = XEXP (XEXP (x, 0), 0);
7254 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7255 }
7256 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7257 else if (GET_CODE (x) == ASHIFT
7258 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7259 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7260 && GET_MODE (XEXP (x, 0)) == DImode
7261 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7262 && CONST_INT_P (XEXP (x, 1)))
7263 {
7264 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7265 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7266 index = XEXP (XEXP (x, 0), 0);
7267 shift = INTVAL (XEXP (x, 1));
7268 }
7269 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7270 else if ((GET_CODE (x) == SIGN_EXTRACT
7271 || GET_CODE (x) == ZERO_EXTRACT)
7272 && GET_MODE (x) == DImode
7273 && GET_CODE (XEXP (x, 0)) == MULT
7274 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7275 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7276 {
7277 type = (GET_CODE (x) == SIGN_EXTRACT)
7278 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7279 index = XEXP (XEXP (x, 0), 0);
7280 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7281 if (INTVAL (XEXP (x, 1)) != 32 + shift
7282 || INTVAL (XEXP (x, 2)) != 0)
7283 shift = -1;
7284 }
7285 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7286 (const_int 0xffffffff<<shift)) */
7287 else if (GET_CODE (x) == AND
7288 && GET_MODE (x) == DImode
7289 && GET_CODE (XEXP (x, 0)) == MULT
7290 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7291 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7292 && CONST_INT_P (XEXP (x, 1)))
7293 {
7294 type = ADDRESS_REG_UXTW;
7295 index = XEXP (XEXP (x, 0), 0);
7296 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7297 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7298 shift = -1;
7299 }
7300 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7301 else if ((GET_CODE (x) == SIGN_EXTRACT
7302 || GET_CODE (x) == ZERO_EXTRACT)
7303 && GET_MODE (x) == DImode
7304 && GET_CODE (XEXP (x, 0)) == ASHIFT
7305 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7306 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7307 {
7308 type = (GET_CODE (x) == SIGN_EXTRACT)
7309 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7310 index = XEXP (XEXP (x, 0), 0);
7311 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7312 if (INTVAL (XEXP (x, 1)) != 32 + shift
7313 || INTVAL (XEXP (x, 2)) != 0)
7314 shift = -1;
7315 }
7316 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7317 (const_int 0xffffffff<<shift)) */
7318 else if (GET_CODE (x) == AND
7319 && GET_MODE (x) == DImode
7320 && GET_CODE (XEXP (x, 0)) == ASHIFT
7321 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7322 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7323 && CONST_INT_P (XEXP (x, 1)))
7324 {
7325 type = ADDRESS_REG_UXTW;
7326 index = XEXP (XEXP (x, 0), 0);
7327 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7328 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7329 shift = -1;
7330 }
7331 /* (mult:P (reg:P) (const_int scale)) */
7332 else if (GET_CODE (x) == MULT
7333 && GET_MODE (x) == Pmode
7334 && GET_MODE (XEXP (x, 0)) == Pmode
7335 && CONST_INT_P (XEXP (x, 1)))
7336 {
7337 type = ADDRESS_REG_REG;
7338 index = XEXP (x, 0);
7339 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7340 }
7341 /* (ashift:P (reg:P) (const_int shift)) */
7342 else if (GET_CODE (x) == ASHIFT
7343 && GET_MODE (x) == Pmode
7344 && GET_MODE (XEXP (x, 0)) == Pmode
7345 && CONST_INT_P (XEXP (x, 1)))
7346 {
7347 type = ADDRESS_REG_REG;
7348 index = XEXP (x, 0);
7349 shift = INTVAL (XEXP (x, 1));
7350 }
7351 else
7352 return false;
7353
7354 if (!strict_p
7355 && GET_CODE (index) == SUBREG
7356 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7357 index = SUBREG_REG (index);
7358
7359 if (aarch64_sve_data_mode_p (mode))
7360 {
7361 if (type != ADDRESS_REG_REG
7362 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7363 return false;
7364 }
7365 else
7366 {
7367 if (shift != 0
7368 && !(IN_RANGE (shift, 1, 3)
7369 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7370 return false;
7371 }
7372
7373 if (REG_P (index)
7374 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7375 {
7376 info->type = type;
7377 info->offset = index;
7378 info->shift = shift;
7379 return true;
7380 }
7381
7382 return false;
7383 }
7384
7385 /* Return true if MODE is one of the modes for which we
7386 support LDP/STP operations. */
7387
7388 static bool
7389 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7390 {
7391 return mode == SImode || mode == DImode
7392 || mode == SFmode || mode == DFmode
7393 || (aarch64_vector_mode_supported_p (mode)
7394 && (known_eq (GET_MODE_SIZE (mode), 8)
7395 || (known_eq (GET_MODE_SIZE (mode), 16)
7396 && (aarch64_tune_params.extra_tuning_flags
7397 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7398 }
7399
7400 /* Return true if REGNO is a virtual pointer register, or an eliminable
7401 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7402 include stack_pointer or hard_frame_pointer. */
7403 static bool
7404 virt_or_elim_regno_p (unsigned regno)
7405 {
7406 return ((regno >= FIRST_VIRTUAL_REGISTER
7407 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7408 || regno == FRAME_POINTER_REGNUM
7409 || regno == ARG_POINTER_REGNUM);
7410 }
7411
7412 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7413 If it is, fill in INFO appropriately. STRICT_P is true if
7414 REG_OK_STRICT is in effect. */
7415
7416 bool
7417 aarch64_classify_address (struct aarch64_address_info *info,
7418 rtx x, machine_mode mode, bool strict_p,
7419 aarch64_addr_query_type type)
7420 {
7421 enum rtx_code code = GET_CODE (x);
7422 rtx op0, op1;
7423 poly_int64 offset;
7424
7425 HOST_WIDE_INT const_size;
7426
7427 /* On BE, we use load/store pair for all large int mode load/stores.
7428 TI/TFmode may also use a load/store pair. */
7429 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7430 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7431 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7432 || type == ADDR_QUERY_LDP_STP_N
7433 || mode == TImode
7434 || mode == TFmode
7435 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7436
7437 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7438 to the actual size of the memory being loaded/stored and the mode used
7439 when classifying the address is half that size. */
7440 if (type == ADDR_QUERY_LDP_STP_N
7441 && known_eq (GET_MODE_SIZE (mode), 16))
7442 mode = DFmode;
7443
7444 bool allow_reg_index_p = (!load_store_pair_p
7445 && (known_lt (GET_MODE_SIZE (mode), 16)
7446 || vec_flags == VEC_ADVSIMD
7447 || vec_flags & VEC_SVE_DATA));
7448
7449 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7450 [Rn, #offset, MUL VL]. */
7451 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7452 && (code != REG && code != PLUS))
7453 return false;
7454
7455 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7456 REG addressing. */
7457 if (advsimd_struct_p
7458 && !BYTES_BIG_ENDIAN
7459 && (code != POST_INC && code != REG))
7460 return false;
7461
7462 gcc_checking_assert (GET_MODE (x) == VOIDmode
7463 || SCALAR_INT_MODE_P (GET_MODE (x)));
7464
7465 switch (code)
7466 {
7467 case REG:
7468 case SUBREG:
7469 info->type = ADDRESS_REG_IMM;
7470 info->base = x;
7471 info->offset = const0_rtx;
7472 info->const_offset = 0;
7473 return aarch64_base_register_rtx_p (x, strict_p);
7474
7475 case PLUS:
7476 op0 = XEXP (x, 0);
7477 op1 = XEXP (x, 1);
7478
7479 if (! strict_p
7480 && REG_P (op0)
7481 && virt_or_elim_regno_p (REGNO (op0))
7482 && poly_int_rtx_p (op1, &offset))
7483 {
7484 info->type = ADDRESS_REG_IMM;
7485 info->base = op0;
7486 info->offset = op1;
7487 info->const_offset = offset;
7488
7489 return true;
7490 }
7491
7492 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7493 && aarch64_base_register_rtx_p (op0, strict_p)
7494 && poly_int_rtx_p (op1, &offset))
7495 {
7496 info->type = ADDRESS_REG_IMM;
7497 info->base = op0;
7498 info->offset = op1;
7499 info->const_offset = offset;
7500
7501 /* TImode and TFmode values are allowed in both pairs of X
7502 registers and individual Q registers. The available
7503 address modes are:
7504 X,X: 7-bit signed scaled offset
7505 Q: 9-bit signed offset
7506 We conservatively require an offset representable in either mode.
7507 When performing the check for pairs of X registers i.e. LDP/STP
7508 pass down DImode since that is the natural size of the LDP/STP
7509 instruction memory accesses. */
7510 if (mode == TImode || mode == TFmode)
7511 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7512 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7513 || offset_12bit_unsigned_scaled_p (mode, offset)));
7514
7515 /* A 7-bit offset check because OImode will emit an ldp/stp
7516 instruction (only big endian will get here).
7517 For ldp/stp instructions, the offset is scaled for the size of a
7518 single element of the pair. */
7519 if (mode == OImode)
7520 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7521
7522 /* Three 9/12-bit offset checks because CImode will emit three
7523 ldr/str instructions (only big endian will get here). */
7524 if (mode == CImode)
7525 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7526 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7527 offset + 32)
7528 || offset_12bit_unsigned_scaled_p (V16QImode,
7529 offset + 32)));
7530
7531 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7532 instructions (only big endian will get here). */
7533 if (mode == XImode)
7534 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7535 && aarch64_offset_7bit_signed_scaled_p (TImode,
7536 offset + 32));
7537
7538 /* Make "m" use the LD1 offset range for SVE data modes, so
7539 that pre-RTL optimizers like ivopts will work to that
7540 instead of the wider LDR/STR range. */
7541 if (vec_flags == VEC_SVE_DATA)
7542 return (type == ADDR_QUERY_M
7543 ? offset_4bit_signed_scaled_p (mode, offset)
7544 : offset_9bit_signed_scaled_p (mode, offset));
7545
7546 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7547 {
7548 poly_int64 end_offset = (offset
7549 + GET_MODE_SIZE (mode)
7550 - BYTES_PER_SVE_VECTOR);
7551 return (type == ADDR_QUERY_M
7552 ? offset_4bit_signed_scaled_p (mode, offset)
7553 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7554 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7555 end_offset)));
7556 }
7557
7558 if (vec_flags == VEC_SVE_PRED)
7559 return offset_9bit_signed_scaled_p (mode, offset);
7560
7561 if (load_store_pair_p)
7562 return ((known_eq (GET_MODE_SIZE (mode), 4)
7563 || known_eq (GET_MODE_SIZE (mode), 8)
7564 || known_eq (GET_MODE_SIZE (mode), 16))
7565 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7566 else
7567 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7568 || offset_12bit_unsigned_scaled_p (mode, offset));
7569 }
7570
7571 if (allow_reg_index_p)
7572 {
7573 /* Look for base + (scaled/extended) index register. */
7574 if (aarch64_base_register_rtx_p (op0, strict_p)
7575 && aarch64_classify_index (info, op1, mode, strict_p))
7576 {
7577 info->base = op0;
7578 return true;
7579 }
7580 if (aarch64_base_register_rtx_p (op1, strict_p)
7581 && aarch64_classify_index (info, op0, mode, strict_p))
7582 {
7583 info->base = op1;
7584 return true;
7585 }
7586 }
7587
7588 return false;
7589
7590 case POST_INC:
7591 case POST_DEC:
7592 case PRE_INC:
7593 case PRE_DEC:
7594 info->type = ADDRESS_REG_WB;
7595 info->base = XEXP (x, 0);
7596 info->offset = NULL_RTX;
7597 return aarch64_base_register_rtx_p (info->base, strict_p);
7598
7599 case POST_MODIFY:
7600 case PRE_MODIFY:
7601 info->type = ADDRESS_REG_WB;
7602 info->base = XEXP (x, 0);
7603 if (GET_CODE (XEXP (x, 1)) == PLUS
7604 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7605 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7606 && aarch64_base_register_rtx_p (info->base, strict_p))
7607 {
7608 info->offset = XEXP (XEXP (x, 1), 1);
7609 info->const_offset = offset;
7610
7611 /* TImode and TFmode values are allowed in both pairs of X
7612 registers and individual Q registers. The available
7613 address modes are:
7614 X,X: 7-bit signed scaled offset
7615 Q: 9-bit signed offset
7616 We conservatively require an offset representable in either mode.
7617 */
7618 if (mode == TImode || mode == TFmode)
7619 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7620 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7621
7622 if (load_store_pair_p)
7623 return ((known_eq (GET_MODE_SIZE (mode), 4)
7624 || known_eq (GET_MODE_SIZE (mode), 8)
7625 || known_eq (GET_MODE_SIZE (mode), 16))
7626 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7627 else
7628 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7629 }
7630 return false;
7631
7632 case CONST:
7633 case SYMBOL_REF:
7634 case LABEL_REF:
7635 /* load literal: pc-relative constant pool entry. Only supported
7636 for SI mode or larger. */
7637 info->type = ADDRESS_SYMBOLIC;
7638
7639 if (!load_store_pair_p
7640 && GET_MODE_SIZE (mode).is_constant (&const_size)
7641 && const_size >= 4)
7642 {
7643 rtx sym, addend;
7644
7645 split_const (x, &sym, &addend);
7646 return ((GET_CODE (sym) == LABEL_REF
7647 || (GET_CODE (sym) == SYMBOL_REF
7648 && CONSTANT_POOL_ADDRESS_P (sym)
7649 && aarch64_pcrelative_literal_loads)));
7650 }
7651 return false;
7652
7653 case LO_SUM:
7654 info->type = ADDRESS_LO_SUM;
7655 info->base = XEXP (x, 0);
7656 info->offset = XEXP (x, 1);
7657 if (allow_reg_index_p
7658 && aarch64_base_register_rtx_p (info->base, strict_p))
7659 {
7660 rtx sym, offs;
7661 split_const (info->offset, &sym, &offs);
7662 if (GET_CODE (sym) == SYMBOL_REF
7663 && (aarch64_classify_symbol (sym, INTVAL (offs))
7664 == SYMBOL_SMALL_ABSOLUTE))
7665 {
7666 /* The symbol and offset must be aligned to the access size. */
7667 unsigned int align;
7668
7669 if (CONSTANT_POOL_ADDRESS_P (sym))
7670 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7671 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7672 {
7673 tree exp = SYMBOL_REF_DECL (sym);
7674 align = TYPE_ALIGN (TREE_TYPE (exp));
7675 align = aarch64_constant_alignment (exp, align);
7676 }
7677 else if (SYMBOL_REF_DECL (sym))
7678 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7679 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7680 && SYMBOL_REF_BLOCK (sym) != NULL)
7681 align = SYMBOL_REF_BLOCK (sym)->alignment;
7682 else
7683 align = BITS_PER_UNIT;
7684
7685 poly_int64 ref_size = GET_MODE_SIZE (mode);
7686 if (known_eq (ref_size, 0))
7687 ref_size = GET_MODE_SIZE (DImode);
7688
7689 return (multiple_p (INTVAL (offs), ref_size)
7690 && multiple_p (align / BITS_PER_UNIT, ref_size));
7691 }
7692 }
7693 return false;
7694
7695 default:
7696 return false;
7697 }
7698 }
7699
7700 /* Return true if the address X is valid for a PRFM instruction.
7701 STRICT_P is true if we should do strict checking with
7702 aarch64_classify_address. */
7703
7704 bool
7705 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7706 {
7707 struct aarch64_address_info addr;
7708
7709 /* PRFM accepts the same addresses as DImode... */
7710 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7711 if (!res)
7712 return false;
7713
7714 /* ... except writeback forms. */
7715 return addr.type != ADDRESS_REG_WB;
7716 }
7717
7718 bool
7719 aarch64_symbolic_address_p (rtx x)
7720 {
7721 rtx offset;
7722
7723 split_const (x, &x, &offset);
7724 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7725 }
7726
7727 /* Classify the base of symbolic expression X. */
7728
7729 enum aarch64_symbol_type
7730 aarch64_classify_symbolic_expression (rtx x)
7731 {
7732 rtx offset;
7733
7734 split_const (x, &x, &offset);
7735 return aarch64_classify_symbol (x, INTVAL (offset));
7736 }
7737
7738
7739 /* Return TRUE if X is a legitimate address for accessing memory in
7740 mode MODE. */
7741 static bool
7742 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7743 {
7744 struct aarch64_address_info addr;
7745
7746 return aarch64_classify_address (&addr, x, mode, strict_p);
7747 }
7748
7749 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7750 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7751 bool
7752 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7753 aarch64_addr_query_type type)
7754 {
7755 struct aarch64_address_info addr;
7756
7757 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7758 }
7759
7760 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7761
7762 static bool
7763 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7764 poly_int64 orig_offset,
7765 machine_mode mode)
7766 {
7767 HOST_WIDE_INT size;
7768 if (GET_MODE_SIZE (mode).is_constant (&size))
7769 {
7770 HOST_WIDE_INT const_offset, second_offset;
7771
7772 /* A general SVE offset is A * VQ + B. Remove the A component from
7773 coefficient 0 in order to get the constant B. */
7774 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7775
7776 /* Split an out-of-range address displacement into a base and
7777 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
7778 range otherwise, to increase opportunities for sharing the base
7779 address between accesses of different sizes. Unaligned accesses
7780 use the signed 9-bit range; TImode/TFmode use the intersection of
7781 the signed scaled 7-bit and signed 9-bit offset ranges. */
7782 if (mode == TImode || mode == TFmode)
7783 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7784 else if ((const_offset & (size - 1)) != 0)
7785 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7786 else
7787 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7788
7789 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7790 return false;
7791
7792 /* Split the offset into second_offset and the rest. */
7793 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7794 *offset2 = gen_int_mode (second_offset, Pmode);
7795 return true;
7796 }
7797 else
7798 {
7799 /* Get the mode we should use as the basis of the range. For structure
7800 modes this is the mode of one vector. */
7801 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7802 machine_mode step_mode
7803 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7804
7805 /* Get the "mul vl" multiplier we'd like to use. */
7806 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7807 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7808 if (vec_flags & VEC_SVE_DATA)
7809 /* LDR supports a 9-bit range, but the move patterns for
7810 structure modes require all vectors to be in range of the
7811 same base. The simplest way of accommodating that while still
7812 promoting reuse of anchor points between different modes is
7813 to use an 8-bit range unconditionally. */
7814 vnum = ((vnum + 128) & 255) - 128;
7815 else
7816 /* Predicates are only handled singly, so we might as well use
7817 the full range. */
7818 vnum = ((vnum + 256) & 511) - 256;
7819 if (vnum == 0)
7820 return false;
7821
7822 /* Convert the "mul vl" multiplier into a byte offset. */
7823 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7824 if (known_eq (second_offset, orig_offset))
7825 return false;
7826
7827 /* Split the offset into second_offset and the rest. */
7828 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7829 *offset2 = gen_int_mode (second_offset, Pmode);
7830 return true;
7831 }
7832 }
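
/* Illustrative example for the constant-size path above: for an aligned
   SImode access at offset 0x10010, second_offset becomes
   0x10010 & 0x3ffc == 0x10, so the displacement is split into an anchor of
   0x10000 (*OFFSET1) plus an in-range offset of 0x10 (*OFFSET2), letting the
   anchor be shared with neighbouring accesses.  */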
7833
7834 /* Return the binary representation of floating point constant VALUE in INTVAL.
7835 If the value cannot be converted, return false without setting INTVAL.
7836 The conversion is done in the mode of VALUE. */
7837 bool
7838 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7839 {
7840
7841 /* We make a general exception for 0. */
7842 if (aarch64_float_const_zero_rtx_p (value))
7843 {
7844 *intval = 0;
7845 return true;
7846 }
7847
7848 scalar_float_mode mode;
7849 if (GET_CODE (value) != CONST_DOUBLE
7850 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7851 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7852 /* Only support up to DF mode. */
7853 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7854 return false;
7855
7856 unsigned HOST_WIDE_INT ival = 0;
7857
7858 long res[2];
7859 real_to_target (res,
7860 CONST_DOUBLE_REAL_VALUE (value),
7861 REAL_MODE_FORMAT (mode));
7862
7863 if (mode == DFmode)
7864 {
7865 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7866 ival = zext_hwi (res[order], 32);
7867 ival |= (zext_hwi (res[1 - order], 32) << 32);
7868 }
7869 else
7870 ival = zext_hwi (res[0], 32);
7871
7872 *intval = ival;
7873 return true;
7874 }
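
/* For example (illustrative): the DFmode constant 1.0 is returned as
   0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000.  */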
7875
7876 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7877 single MOV(+MOVK) followed by an FMOV. */
7878 bool
7879 aarch64_float_const_rtx_p (rtx x)
7880 {
7881 machine_mode mode = GET_MODE (x);
7882 if (mode == VOIDmode)
7883 return false;
7884
7885 /* Determine whether it's cheaper to write float constants as
7886 mov/movk pairs rather than ldr/adrp pairs. */
7887 unsigned HOST_WIDE_INT ival;
7888
7889 if (GET_CODE (x) == CONST_DOUBLE
7890 && SCALAR_FLOAT_MODE_P (mode)
7891 && aarch64_reinterpret_float_as_int (x, &ival))
7892 {
7893 scalar_int_mode imode = (mode == HFmode
7894 ? SImode
7895 : int_mode_for_mode (mode).require ());
7896 int num_instr = aarch64_internal_mov_immediate
7897 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7898 return num_instr < 3;
7899 }
7900
7901 return false;
7902 }
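
/* Illustrative example: DFmode 1.0 has the bit pattern 0x3ff0000000000000,
   which a single MOVZ can build, so the function above returns true and the
   constant can be materialized as (register choice arbitrary)
     movz x0, #0x3ff0, lsl #48
     fmov d0, x0
   rather than an adrp/ldr pair loading it from the constant pool.  */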
7903
7904 /* Return TRUE if rtx X is the immediate constant 0.0. */
7905 bool
7906 aarch64_float_const_zero_rtx_p (rtx x)
7907 {
7908 if (GET_MODE (x) == VOIDmode)
7909 return false;
7910
7911 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7912 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7913 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7914 }
7915
7916 /* Return TRUE if rtx X is an immediate constant that fits in a single
7917 MOVI immediate operation. */
7918 bool
7919 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7920 {
7921 if (!TARGET_SIMD)
7922 return false;
7923
7924 machine_mode vmode;
7925 scalar_int_mode imode;
7926 unsigned HOST_WIDE_INT ival;
7927
7928 if (GET_CODE (x) == CONST_DOUBLE
7929 && SCALAR_FLOAT_MODE_P (mode))
7930 {
7931 if (!aarch64_reinterpret_float_as_int (x, &ival))
7932 return false;
7933
7934 /* We make a general exception for 0. */
7935 if (aarch64_float_const_zero_rtx_p (x))
7936 return true;
7937
7938 imode = int_mode_for_mode (mode).require ();
7939 }
7940 else if (GET_CODE (x) == CONST_INT
7941 && is_a <scalar_int_mode> (mode, &imode))
7942 ival = INTVAL (x);
7943 else
7944 return false;
7945
7946 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
7947 a 128-bit vector mode. */
7948 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7949
7950 vmode = aarch64_simd_container_mode (imode, width);
7951 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7952
7953 return aarch64_simd_valid_immediate (v_op, NULL);
7954 }
7955
7956
7957 /* Return the fixed registers used for condition codes. */
7958
7959 static bool
7960 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7961 {
7962 *p1 = CC_REGNUM;
7963 *p2 = INVALID_REGNUM;
7964 return true;
7965 }
7966
7967 /* This function is used by the call expanders of the machine description.
7968 RESULT is the register in which the result is returned. It's NULL for
7969 "call" and "sibcall".
7970 MEM is the location of the function call.
7971 SIBCALL indicates whether this is a normal call or a sibling call;
7972 a different pattern is generated accordingly. */
7973
7974 void
7975 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7976 {
7977 rtx call, callee, tmp;
7978 rtvec vec;
7979 machine_mode mode;
7980
7981 gcc_assert (MEM_P (mem));
7982 callee = XEXP (mem, 0);
7983 mode = GET_MODE (callee);
7984 gcc_assert (mode == Pmode);
7985
7986 /* Decide if we should generate indirect calls by loading the
7987 address of the callee into a register before performing
7988 the branch-and-link. */
7989 if (SYMBOL_REF_P (callee)
7990 ? (aarch64_is_long_call_p (callee)
7991 || aarch64_is_noplt_call_p (callee))
7992 : !REG_P (callee))
7993 XEXP (mem, 0) = force_reg (mode, callee);
7994
7995 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7996
7997 if (result != NULL_RTX)
7998 call = gen_rtx_SET (result, call);
7999
8000 if (sibcall)
8001 tmp = ret_rtx;
8002 else
8003 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8004
8005 vec = gen_rtvec (2, call, tmp);
8006 call = gen_rtx_PARALLEL (VOIDmode, vec);
8007
8008 aarch64_emit_call_insn (call);
8009 }
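
/* Illustrative RTL (operands elided): a plain call produces
     (parallel [(call (mem:DI <callee>) (const_int 0))
                (clobber (reg:DI LR_REGNUM))])
   a sibling call replaces the clobber of LR with (return), and a
   value-returning call wraps the call in (set (reg) ...).  */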
8010
8011 /* Emit call insn with PAT and do aarch64-specific handling. */
8012
8013 void
8014 aarch64_emit_call_insn (rtx pat)
8015 {
8016 rtx insn = emit_call_insn (pat);
8017
8018 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8019 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8020 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8021 }
8022
8023 machine_mode
8024 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8025 {
8026 machine_mode mode_x = GET_MODE (x);
8027 rtx_code code_x = GET_CODE (x);
8028
8029 /* All floating point compares return CCFP if it is an equality
8030 comparison, and CCFPE otherwise. */
8031 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8032 {
8033 switch (code)
8034 {
8035 case EQ:
8036 case NE:
8037 case UNORDERED:
8038 case ORDERED:
8039 case UNLT:
8040 case UNLE:
8041 case UNGT:
8042 case UNGE:
8043 case UNEQ:
8044 return CCFPmode;
8045
8046 case LT:
8047 case LE:
8048 case GT:
8049 case GE:
8050 case LTGT:
8051 return CCFPEmode;
8052
8053 default:
8054 gcc_unreachable ();
8055 }
8056 }
8057
8058 /* Equality comparisons of short modes against zero can be performed
8059 using the TST instruction with the appropriate bitmask. */
8060 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8061 && (code == EQ || code == NE)
8062 && (mode_x == HImode || mode_x == QImode))
8063 return CC_NZmode;
8064
8065 /* Similarly, comparisons of zero_extends from shorter modes can
8066 be performed using an ANDS with an immediate mask. */
8067 if (y == const0_rtx && code_x == ZERO_EXTEND
8068 && (mode_x == SImode || mode_x == DImode)
8069 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8070 && (code == EQ || code == NE))
8071 return CC_NZmode;
8072
8073 if ((mode_x == SImode || mode_x == DImode)
8074 && y == const0_rtx
8075 && (code == EQ || code == NE || code == LT || code == GE)
8076 && (code_x == PLUS || code_x == MINUS || code_x == AND
8077 || code_x == NEG
8078 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8079 && CONST_INT_P (XEXP (x, 2)))))
8080 return CC_NZmode;
8081
8082 /* A compare with a shifted operand. Because of canonicalization,
8083 the comparison will have to be swapped when we emit the assembly
8084 code. */
8085 if ((mode_x == SImode || mode_x == DImode)
8086 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8087 && (code_x == ASHIFT || code_x == ASHIFTRT
8088 || code_x == LSHIFTRT
8089 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8090 return CC_SWPmode;
8091
8092 /* Similarly for a negated operand, but we can only do this for
8093 equalities. */
8094 if ((mode_x == SImode || mode_x == DImode)
8095 && (REG_P (y) || GET_CODE (y) == SUBREG)
8096 && (code == EQ || code == NE)
8097 && code_x == NEG)
8098 return CC_Zmode;
8099
8100 /* A test for unsigned overflow from an addition. */
8101 if ((mode_x == DImode || mode_x == TImode)
8102 && (code == LTU || code == GEU)
8103 && code_x == PLUS
8104 && rtx_equal_p (XEXP (x, 0), y))
8105 return CC_Cmode;
8106
8107 /* A test for unsigned overflow from an add with carry. */
8108 if ((mode_x == DImode || mode_x == TImode)
8109 && (code == LTU || code == GEU)
8110 && code_x == PLUS
8111 && CONST_SCALAR_INT_P (y)
8112 && (rtx_mode_t (y, mode_x)
8113 == (wi::shwi (1, mode_x)
8114 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8115 return CC_ADCmode;
8116
8117 /* A test for signed overflow. */
8118 if ((mode_x == DImode || mode_x == TImode)
8119 && code == NE
8120 && code_x == PLUS
8121 && GET_CODE (y) == SIGN_EXTEND)
8122 return CC_Vmode;
8123
8124 /* For everything else, return CCmode. */
8125 return CCmode;
8126 }
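
/* Illustrative example: for (compare (ashift x 3) y) the function above
   returns CC_SWPmode, since the shifted operand must be the second source of
   the eventual "cmp y, x, lsl 3" and the condition therefore has to be
   swapped when the assembly is emitted (see the CC_SWPmode mappings in
   aarch64_get_condition_code_1 below).  */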
8127
8128 static int
8129 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8130
8131 int
8132 aarch64_get_condition_code (rtx x)
8133 {
8134 machine_mode mode = GET_MODE (XEXP (x, 0));
8135 enum rtx_code comp_code = GET_CODE (x);
8136
8137 if (GET_MODE_CLASS (mode) != MODE_CC)
8138 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8139 return aarch64_get_condition_code_1 (mode, comp_code);
8140 }
8141
8142 static int
8143 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8144 {
8145 switch (mode)
8146 {
8147 case E_CCFPmode:
8148 case E_CCFPEmode:
8149 switch (comp_code)
8150 {
8151 case GE: return AARCH64_GE;
8152 case GT: return AARCH64_GT;
8153 case LE: return AARCH64_LS;
8154 case LT: return AARCH64_MI;
8155 case NE: return AARCH64_NE;
8156 case EQ: return AARCH64_EQ;
8157 case ORDERED: return AARCH64_VC;
8158 case UNORDERED: return AARCH64_VS;
8159 case UNLT: return AARCH64_LT;
8160 case UNLE: return AARCH64_LE;
8161 case UNGT: return AARCH64_HI;
8162 case UNGE: return AARCH64_PL;
8163 default: return -1;
8164 }
8165 break;
8166
8167 case E_CCmode:
8168 switch (comp_code)
8169 {
8170 case NE: return AARCH64_NE;
8171 case EQ: return AARCH64_EQ;
8172 case GE: return AARCH64_GE;
8173 case GT: return AARCH64_GT;
8174 case LE: return AARCH64_LE;
8175 case LT: return AARCH64_LT;
8176 case GEU: return AARCH64_CS;
8177 case GTU: return AARCH64_HI;
8178 case LEU: return AARCH64_LS;
8179 case LTU: return AARCH64_CC;
8180 default: return -1;
8181 }
8182 break;
8183
8184 case E_CC_SWPmode:
8185 switch (comp_code)
8186 {
8187 case NE: return AARCH64_NE;
8188 case EQ: return AARCH64_EQ;
8189 case GE: return AARCH64_LE;
8190 case GT: return AARCH64_LT;
8191 case LE: return AARCH64_GE;
8192 case LT: return AARCH64_GT;
8193 case GEU: return AARCH64_LS;
8194 case GTU: return AARCH64_CC;
8195 case LEU: return AARCH64_CS;
8196 case LTU: return AARCH64_HI;
8197 default: return -1;
8198 }
8199 break;
8200
8201 case E_CC_NZCmode:
8202 switch (comp_code)
8203 {
8204 case NE: return AARCH64_NE; /* = any */
8205 case EQ: return AARCH64_EQ; /* = none */
8206 case GE: return AARCH64_PL; /* = nfrst */
8207 case LT: return AARCH64_MI; /* = first */
8208 case GEU: return AARCH64_CS; /* = nlast */
8209 case GTU: return AARCH64_HI; /* = pmore */
8210 case LEU: return AARCH64_LS; /* = plast */
8211 case LTU: return AARCH64_CC; /* = last */
8212 default: return -1;
8213 }
8214 break;
8215
8216 case E_CC_NZmode:
8217 switch (comp_code)
8218 {
8219 case NE: return AARCH64_NE;
8220 case EQ: return AARCH64_EQ;
8221 case GE: return AARCH64_PL;
8222 case LT: return AARCH64_MI;
8223 default: return -1;
8224 }
8225 break;
8226
8227 case E_CC_Zmode:
8228 switch (comp_code)
8229 {
8230 case NE: return AARCH64_NE;
8231 case EQ: return AARCH64_EQ;
8232 default: return -1;
8233 }
8234 break;
8235
8236 case E_CC_Cmode:
8237 switch (comp_code)
8238 {
8239 case LTU: return AARCH64_CS;
8240 case GEU: return AARCH64_CC;
8241 default: return -1;
8242 }
8243 break;
8244
8245 case E_CC_ADCmode:
8246 switch (comp_code)
8247 {
8248 case GEU: return AARCH64_CS;
8249 case LTU: return AARCH64_CC;
8250 default: return -1;
8251 }
8252 break;
8253
8254 case E_CC_Vmode:
8255 switch (comp_code)
8256 {
8257 case NE: return AARCH64_VS;
8258 case EQ: return AARCH64_VC;
8259 default: return -1;
8260 }
8261 break;
8262
8263 default:
8264 return -1;
8265 }
8266
8267 return -1;
8268 }
8269
8270 bool
8271 aarch64_const_vec_all_same_in_range_p (rtx x,
8272 HOST_WIDE_INT minval,
8273 HOST_WIDE_INT maxval)
8274 {
8275 rtx elt;
8276 return (const_vec_duplicate_p (x, &elt)
8277 && CONST_INT_P (elt)
8278 && IN_RANGE (INTVAL (elt), minval, maxval));
8279 }
8280
8281 bool
8282 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8283 {
8284 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8285 }
8286
8287 /* Return true if VEC is a constant in which every element is in the range
8288 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8289
8290 static bool
8291 aarch64_const_vec_all_in_range_p (rtx vec,
8292 HOST_WIDE_INT minval,
8293 HOST_WIDE_INT maxval)
8294 {
8295 if (GET_CODE (vec) != CONST_VECTOR
8296 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8297 return false;
8298
8299 int nunits;
8300 if (!CONST_VECTOR_STEPPED_P (vec))
8301 nunits = const_vector_encoded_nelts (vec);
8302 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8303 return false;
8304
8305 for (int i = 0; i < nunits; i++)
8306 {
8307 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8308 if (!CONST_INT_P (vec_elem)
8309 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8310 return false;
8311 }
8312 return true;
8313 }
8314
8315 /* N Z C V. */
8316 #define AARCH64_CC_V 1
8317 #define AARCH64_CC_C (1 << 1)
8318 #define AARCH64_CC_Z (1 << 2)
8319 #define AARCH64_CC_N (1 << 3)
8320
8321 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8322 static const int aarch64_nzcv_codes[] =
8323 {
8324 0, /* EQ, Z == 1. */
8325 AARCH64_CC_Z, /* NE, Z == 0. */
8326 0, /* CS, C == 1. */
8327 AARCH64_CC_C, /* CC, C == 0. */
8328 0, /* MI, N == 1. */
8329 AARCH64_CC_N, /* PL, N == 0. */
8330 0, /* VS, V == 1. */
8331 AARCH64_CC_V, /* VC, V == 0. */
8332 0, /* HI, C == 1 && Z == 0. */
8333 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8334 AARCH64_CC_V, /* GE, N == V. */
8335 0, /* LT, N != V. */
8336 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8337 0, /* LE, !(Z == 0 && N == V). */
8338 0, /* AL, Any. */
8339 0 /* NV, Any. */
8340 };
8341
8342 /* Print floating-point vector immediate operand X to F, negating it
8343 first if NEGATE is true. Return true on success, false if it isn't
8344 a constant we can handle. */
8345
8346 static bool
8347 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8348 {
8349 rtx elt;
8350
8351 if (!const_vec_duplicate_p (x, &elt))
8352 return false;
8353
8354 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8355 if (negate)
8356 r = real_value_negate (&r);
8357
8358 /* Handle the SVE single-bit immediates specially, since they have a
8359 fixed form in the assembly syntax. */
8360 if (real_equal (&r, &dconst0))
8361 asm_fprintf (f, "0.0");
8362 else if (real_equal (&r, &dconst2))
8363 asm_fprintf (f, "2.0");
8364 else if (real_equal (&r, &dconst1))
8365 asm_fprintf (f, "1.0");
8366 else if (real_equal (&r, &dconsthalf))
8367 asm_fprintf (f, "0.5");
8368 else
8369 {
8370 const int buf_size = 20;
8371 char float_buf[buf_size] = {'\0'};
8372 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8373 1, GET_MODE (elt));
8374 asm_fprintf (f, "%s", float_buf);
8375 }
8376
8377 return true;
8378 }
8379
8380 /* Return the equivalent letter for size. */
8381 static char
8382 sizetochar (int size)
8383 {
8384 switch (size)
8385 {
8386 case 64: return 'd';
8387 case 32: return 's';
8388 case 16: return 'h';
8389 case 8 : return 'b';
8390 default: gcc_unreachable ();
8391 }
8392 }
8393
8394 /* Print operand X to file F in a target specific manner according to CODE.
8395 The acceptable formatting commands given by CODE are:
8396 'c': An integer or symbol address without a preceding #
8397 sign.
8398 'C': Take the duplicated element in a vector constant
8399 and print it in hex.
8400 'D': Take the duplicated element in a vector constant
8401 and print it as an unsigned integer, in decimal.
8402 'e': Print the sign/zero-extend size as a character 8->b,
8403 16->h, 32->w. Can also be used for masks:
8404 0xff->b, 0xffff->h, 0xffffffff->w.
8405 'I': If the operand is a duplicated vector constant,
8406 replace it with the duplicated scalar. If the
8407 operand is then a floating-point constant, replace
8408 it with the integer bit representation. Print the
8409 transformed constant as a signed decimal number.
8410 'p': Prints N such that 2^N == X (X must be a power of 2 and
8411 a const_int).
8412 'P': Print the number of non-zero bits in X (a const_int).
8413 'H': Print the higher numbered register of a pair (TImode)
8414 of regs.
8415 'm': Print a condition (eq, ne, etc).
8416 'M': Same as 'm', but invert condition.
8417 'N': Take the duplicated element in a vector constant
8418 and print the negative of it in decimal.
8419 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8420 'S/T/U/V': Print a FP/SIMD register name for a register list.
8421 The register printed is the FP/SIMD register name
8422 of X + 0/1/2/3 for S/T/U/V.
8423 'R': Print a scalar FP/SIMD register name + 1.
8424 'X': Print bottom 16 bits of integer constant in hex.
8425 'w/x': Print a general register name or the zero register
8426 (32-bit or 64-bit).
8427 '0': Print a normal operand; if it's a general register,
8428 then we assume DImode.
8429 'k': Print NZCV for conditional compare instructions.
8430 'A': Output address constant representing the first
8431 argument of X, specifying a relocation offset
8432 if appropriate.
8433 'L': Output constant address specified by X
8434 with a relocation offset if appropriate.
8435 'G': Prints address of X, specifying a PC relative
8436 relocation mode if appropriate.
8437 'y': Output address of LDP or STP - this is used for
8438 some LDP/STPs which don't use a PARALLEL in their
8439 pattern (so the mode needs to be adjusted).
8440 'z': Output address of a typical LDP or STP. */
8441
8442 static void
8443 aarch64_print_operand (FILE *f, rtx x, int code)
8444 {
8445 rtx elt;
8446 switch (code)
8447 {
8448 case 'c':
8449 switch (GET_CODE (x))
8450 {
8451 case CONST_INT:
8452 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8453 break;
8454
8455 case SYMBOL_REF:
8456 output_addr_const (f, x);
8457 break;
8458
8459 case CONST:
8460 if (GET_CODE (XEXP (x, 0)) == PLUS
8461 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8462 {
8463 output_addr_const (f, x);
8464 break;
8465 }
8466 /* Fall through. */
8467
8468 default:
8469 output_operand_lossage ("unsupported operand for code '%c'", code);
8470 }
8471 break;
8472
8473 case 'e':
8474 {
8475 x = unwrap_const_vec_duplicate (x);
8476 if (!CONST_INT_P (x))
8477 {
8478 output_operand_lossage ("invalid operand for '%%%c'", code);
8479 return;
8480 }
8481
8482 HOST_WIDE_INT val = INTVAL (x);
8483 if ((val & ~7) == 8 || val == 0xff)
8484 fputc ('b', f);
8485 else if ((val & ~7) == 16 || val == 0xffff)
8486 fputc ('h', f);
8487 else if ((val & ~7) == 32 || val == 0xffffffff)
8488 fputc ('w', f);
8489 else
8490 {
8491 output_operand_lossage ("invalid operand for '%%%c'", code);
8492 return;
8493 }
8494 }
8495 break;
8496
8497 case 'p':
8498 {
8499 int n;
8500
8501 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8502 {
8503 output_operand_lossage ("invalid operand for '%%%c'", code);
8504 return;
8505 }
8506
8507 asm_fprintf (f, "%d", n);
8508 }
8509 break;
8510
8511 case 'P':
8512 if (!CONST_INT_P (x))
8513 {
8514 output_operand_lossage ("invalid operand for '%%%c'", code);
8515 return;
8516 }
8517
8518 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8519 break;
8520
8521 case 'H':
8522 if (x == const0_rtx)
8523 {
8524 asm_fprintf (f, "xzr");
8525 break;
8526 }
8527
8528 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8529 {
8530 output_operand_lossage ("invalid operand for '%%%c'", code);
8531 return;
8532 }
8533
8534 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8535 break;
8536
8537 case 'I':
8538 {
8539 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8540 if (CONST_INT_P (x))
8541 asm_fprintf (f, "%wd", INTVAL (x));
8542 else
8543 {
8544 output_operand_lossage ("invalid operand for '%%%c'", code);
8545 return;
8546 }
8547 break;
8548 }
8549
8550 case 'M':
8551 case 'm':
8552 {
8553 int cond_code;
8554 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8555 if (x == const_true_rtx)
8556 {
8557 if (code == 'M')
8558 fputs ("nv", f);
8559 return;
8560 }
8561
8562 if (!COMPARISON_P (x))
8563 {
8564 output_operand_lossage ("invalid operand for '%%%c'", code);
8565 return;
8566 }
8567
8568 cond_code = aarch64_get_condition_code (x);
8569 gcc_assert (cond_code >= 0);
8570 if (code == 'M')
8571 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8572 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8573 fputs (aarch64_sve_condition_codes[cond_code], f);
8574 else
8575 fputs (aarch64_condition_codes[cond_code], f);
8576 }
8577 break;
8578
8579 case 'N':
8580 if (!const_vec_duplicate_p (x, &elt))
8581 {
8582 output_operand_lossage ("invalid vector constant");
8583 return;
8584 }
8585
8586 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8587 asm_fprintf (f, "%wd", -INTVAL (elt));
8588 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8589 && aarch64_print_vector_float_operand (f, x, true))
8590 ;
8591 else
8592 {
8593 output_operand_lossage ("invalid vector constant");
8594 return;
8595 }
8596 break;
8597
8598 case 'b':
8599 case 'h':
8600 case 's':
8601 case 'd':
8602 case 'q':
8603 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8604 {
8605 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8606 return;
8607 }
8608 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8609 break;
8610
8611 case 'S':
8612 case 'T':
8613 case 'U':
8614 case 'V':
8615 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8616 {
8617 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8618 return;
8619 }
8620 asm_fprintf (f, "%c%d",
8621 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8622 REGNO (x) - V0_REGNUM + (code - 'S'));
8623 break;
8624
8625 case 'R':
8626 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8627 {
8628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8629 return;
8630 }
8631 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8632 break;
8633
8634 case 'X':
8635 if (!CONST_INT_P (x))
8636 {
8637 output_operand_lossage ("invalid operand for '%%%c'", code);
8638 return;
8639 }
8640 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8641 break;
8642
8643 case 'C':
8644 {
8645 /* Print a replicated constant in hex. */
8646 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8647 {
8648 output_operand_lossage ("invalid operand for '%%%c'", code);
8649 return;
8650 }
8651 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8652 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8653 }
8654 break;
8655
8656 case 'D':
8657 {
8658 /* Print a replicated constant in decimal, treating it as
8659 unsigned. */
8660 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8661 {
8662 output_operand_lossage ("invalid operand for '%%%c'", code);
8663 return;
8664 }
8665 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8666 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8667 }
8668 break;
8669
8670 case 'w':
8671 case 'x':
8672 if (x == const0_rtx
8673 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8674 {
8675 asm_fprintf (f, "%czr", code);
8676 break;
8677 }
8678
8679 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8680 {
8681 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8682 break;
8683 }
8684
8685 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8686 {
8687 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8688 break;
8689 }
8690
8691 /* Fall through */
8692
8693 case 0:
8694 if (x == NULL)
8695 {
8696 output_operand_lossage ("missing operand");
8697 return;
8698 }
8699
8700 switch (GET_CODE (x))
8701 {
8702 case REG:
8703 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8704 {
8705 if (REG_NREGS (x) == 1)
8706 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8707 else
8708 {
8709 char suffix
8710 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8711 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8712 REGNO (x) - V0_REGNUM, suffix,
8713 END_REGNO (x) - V0_REGNUM - 1, suffix);
8714 }
8715 }
8716 else
8717 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8718 break;
8719
8720 case MEM:
8721 output_address (GET_MODE (x), XEXP (x, 0));
8722 break;
8723
8724 case LABEL_REF:
8725 case SYMBOL_REF:
8726 output_addr_const (asm_out_file, x);
8727 break;
8728
8729 case CONST_INT:
8730 asm_fprintf (f, "%wd", INTVAL (x));
8731 break;
8732
8733 case CONST:
8734 if (!VECTOR_MODE_P (GET_MODE (x)))
8735 {
8736 output_addr_const (asm_out_file, x);
8737 break;
8738 }
8739 /* fall through */
8740
8741 case CONST_VECTOR:
8742 if (!const_vec_duplicate_p (x, &elt))
8743 {
8744 output_operand_lossage ("invalid vector constant");
8745 return;
8746 }
8747
8748 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8749 asm_fprintf (f, "%wd", INTVAL (elt));
8750 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8751 && aarch64_print_vector_float_operand (f, x, false))
8752 ;
8753 else
8754 {
8755 output_operand_lossage ("invalid vector constant");
8756 return;
8757 }
8758 break;
8759
8760 case CONST_DOUBLE:
8761 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8762 be getting CONST_DOUBLEs holding integers. */
8763 gcc_assert (GET_MODE (x) != VOIDmode);
8764 if (aarch64_float_const_zero_rtx_p (x))
8765 {
8766 fputc ('0', f);
8767 break;
8768 }
8769 else if (aarch64_float_const_representable_p (x))
8770 {
8771 #define buf_size 20
8772 char float_buf[buf_size] = {'\0'};
8773 real_to_decimal_for_mode (float_buf,
8774 CONST_DOUBLE_REAL_VALUE (x),
8775 buf_size, buf_size,
8776 1, GET_MODE (x));
8777 asm_fprintf (asm_out_file, "%s", float_buf);
8778 break;
8779 #undef buf_size
8780 }
8781 output_operand_lossage ("invalid constant");
8782 return;
8783 default:
8784 output_operand_lossage ("invalid operand");
8785 return;
8786 }
8787 break;
8788
8789 case 'A':
8790 if (GET_CODE (x) == HIGH)
8791 x = XEXP (x, 0);
8792
8793 switch (aarch64_classify_symbolic_expression (x))
8794 {
8795 case SYMBOL_SMALL_GOT_4G:
8796 asm_fprintf (asm_out_file, ":got:");
8797 break;
8798
8799 case SYMBOL_SMALL_TLSGD:
8800 asm_fprintf (asm_out_file, ":tlsgd:");
8801 break;
8802
8803 case SYMBOL_SMALL_TLSDESC:
8804 asm_fprintf (asm_out_file, ":tlsdesc:");
8805 break;
8806
8807 case SYMBOL_SMALL_TLSIE:
8808 asm_fprintf (asm_out_file, ":gottprel:");
8809 break;
8810
8811 case SYMBOL_TLSLE24:
8812 asm_fprintf (asm_out_file, ":tprel:");
8813 break;
8814
8815 case SYMBOL_TINY_GOT:
8816 gcc_unreachable ();
8817 break;
8818
8819 default:
8820 break;
8821 }
8822 output_addr_const (asm_out_file, x);
8823 break;
8824
8825 case 'L':
8826 switch (aarch64_classify_symbolic_expression (x))
8827 {
8828 case SYMBOL_SMALL_GOT_4G:
8829 asm_fprintf (asm_out_file, ":lo12:");
8830 break;
8831
8832 case SYMBOL_SMALL_TLSGD:
8833 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8834 break;
8835
8836 case SYMBOL_SMALL_TLSDESC:
8837 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8838 break;
8839
8840 case SYMBOL_SMALL_TLSIE:
8841 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8842 break;
8843
8844 case SYMBOL_TLSLE12:
8845 asm_fprintf (asm_out_file, ":tprel_lo12:");
8846 break;
8847
8848 case SYMBOL_TLSLE24:
8849 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8850 break;
8851
8852 case SYMBOL_TINY_GOT:
8853 asm_fprintf (asm_out_file, ":got:");
8854 break;
8855
8856 case SYMBOL_TINY_TLSIE:
8857 asm_fprintf (asm_out_file, ":gottprel:");
8858 break;
8859
8860 default:
8861 break;
8862 }
8863 output_addr_const (asm_out_file, x);
8864 break;
8865
8866 case 'G':
8867 switch (aarch64_classify_symbolic_expression (x))
8868 {
8869 case SYMBOL_TLSLE24:
8870 asm_fprintf (asm_out_file, ":tprel_hi12:");
8871 break;
8872 default:
8873 break;
8874 }
8875 output_addr_const (asm_out_file, x);
8876 break;
8877
8878 case 'k':
8879 {
8880 HOST_WIDE_INT cond_code;
8881
8882 if (!CONST_INT_P (x))
8883 {
8884 output_operand_lossage ("invalid operand for '%%%c'", code);
8885 return;
8886 }
8887
8888 cond_code = INTVAL (x);
8889 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8890 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8891 }
8892 break;
8893
8894 case 'y':
8895 case 'z':
8896 {
8897 machine_mode mode = GET_MODE (x);
8898
8899 if (GET_CODE (x) != MEM
8900 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8901 {
8902 output_operand_lossage ("invalid operand for '%%%c'", code);
8903 return;
8904 }
8905
8906 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8907 code == 'y'
8908 ? ADDR_QUERY_LDP_STP_N
8909 : ADDR_QUERY_LDP_STP))
8910 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8911 }
8912 break;
8913
8914 default:
8915 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8916 return;
8917 }
8918 }
8919
8920 /* Print address 'x' of a memory access with mode 'mode'.
8921 TYPE is the aarch64_addr_query_type context required by
8922 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand. */
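/* For illustration only, the recognised address forms below print as, e.g.:
     [x0]                  register
     [x0, 16]              register + immediate
     [x0, #2, mul vl]      register + SVE vector/predicate multiple
     [x0, x1, lsl 3]       register + scaled register
     [x0, w1, sxtw 2]      register + sign/zero-extended register
     [x0, 16]! / [x0], 16  pre/post-indexed writeback
     [x0, #:lo12:sym]      LO_SUM low-part relocation
   (the register numbers here are only examples).  */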
8923 static bool
8924 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8925 aarch64_addr_query_type type)
8926 {
8927 struct aarch64_address_info addr;
8928 unsigned int size;
8929
8930 /* Check all addresses are Pmode - including ILP32. */
8931 if (GET_MODE (x) != Pmode
8932 && (!CONST_INT_P (x)
8933 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8934 {
8935 output_operand_lossage ("invalid address mode");
8936 return false;
8937 }
8938
8939 if (aarch64_classify_address (&addr, x, mode, true, type))
8940 switch (addr.type)
8941 {
8942 case ADDRESS_REG_IMM:
8943 if (known_eq (addr.const_offset, 0))
8944 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8945 else if (aarch64_sve_data_mode_p (mode))
8946 {
8947 HOST_WIDE_INT vnum
8948 = exact_div (addr.const_offset,
8949 BYTES_PER_SVE_VECTOR).to_constant ();
8950 asm_fprintf (f, "[%s, #%wd, mul vl]",
8951 reg_names[REGNO (addr.base)], vnum);
8952 }
8953 else if (aarch64_sve_pred_mode_p (mode))
8954 {
8955 HOST_WIDE_INT vnum
8956 = exact_div (addr.const_offset,
8957 BYTES_PER_SVE_PRED).to_constant ();
8958 asm_fprintf (f, "[%s, #%wd, mul vl]",
8959 reg_names[REGNO (addr.base)], vnum);
8960 }
8961 else
8962 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8963 INTVAL (addr.offset));
8964 return true;
8965
8966 case ADDRESS_REG_REG:
8967 if (addr.shift == 0)
8968 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8969 reg_names [REGNO (addr.offset)]);
8970 else
8971 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8972 reg_names [REGNO (addr.offset)], addr.shift);
8973 return true;
8974
8975 case ADDRESS_REG_UXTW:
8976 if (addr.shift == 0)
8977 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8978 REGNO (addr.offset) - R0_REGNUM);
8979 else
8980 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8981 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8982 return true;
8983
8984 case ADDRESS_REG_SXTW:
8985 if (addr.shift == 0)
8986 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8987 REGNO (addr.offset) - R0_REGNUM);
8988 else
8989 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8990 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8991 return true;
8992
8993 case ADDRESS_REG_WB:
8994 /* Writeback is only supported for fixed-width modes. */
8995 size = GET_MODE_SIZE (mode).to_constant ();
8996 switch (GET_CODE (x))
8997 {
8998 case PRE_INC:
8999 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9000 return true;
9001 case POST_INC:
9002 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9003 return true;
9004 case PRE_DEC:
9005 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9006 return true;
9007 case POST_DEC:
9008 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9009 return true;
9010 case PRE_MODIFY:
9011 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9012 INTVAL (addr.offset));
9013 return true;
9014 case POST_MODIFY:
9015 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9016 INTVAL (addr.offset));
9017 return true;
9018 default:
9019 break;
9020 }
9021 break;
9022
9023 case ADDRESS_LO_SUM:
9024 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9025 output_addr_const (f, addr.offset);
9026 asm_fprintf (f, "]");
9027 return true;
9028
9029 case ADDRESS_SYMBOLIC:
9030 output_addr_const (f, x);
9031 return true;
9032 }
9033
9034 return false;
9035 }
9036
9037 /* Print address 'x' of a memory access with mode 'mode'. */
9038 static void
9039 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9040 {
9041 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9042 output_addr_const (f, x);
9043 }
9044
9045 bool
9046 aarch64_label_mentioned_p (rtx x)
9047 {
9048 const char *fmt;
9049 int i;
9050
9051 if (GET_CODE (x) == LABEL_REF)
9052 return true;
9053
9054 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9055 referencing instruction, but they are constant offsets, not
9056 symbols. */
9057 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9058 return false;
9059
9060 fmt = GET_RTX_FORMAT (GET_CODE (x));
9061 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9062 {
9063 if (fmt[i] == 'E')
9064 {
9065 int j;
9066
9067 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9068 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9069 return 1;
9070 }
9071 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9072 return 1;
9073 }
9074
9075 return 0;
9076 }
9077
9078 /* Implement REGNO_REG_CLASS. */
9079
9080 enum reg_class
9081 aarch64_regno_regclass (unsigned regno)
9082 {
9083 if (GP_REGNUM_P (regno))
9084 return GENERAL_REGS;
9085
9086 if (regno == SP_REGNUM)
9087 return STACK_REG;
9088
9089 if (regno == FRAME_POINTER_REGNUM
9090 || regno == ARG_POINTER_REGNUM)
9091 return POINTER_REGS;
9092
9093 if (FP_REGNUM_P (regno))
9094 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9095 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9096
9097 if (PR_REGNUM_P (regno))
9098 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9099
9100 return NO_REGS;
9101 }
9102
9103 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9104 If OFFSET is out of range, return an offset of an anchor point
9105 that is in range. Return 0 otherwise. */
9106
9107 static HOST_WIDE_INT
9108 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9109 machine_mode mode)
9110 {
9111 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9112 if (size > 16)
9113 return (offset + 0x400) & ~0x7f0;
9114
9115 /* For offsets that aren't a multiple of the access size, the limit is
9116 -256...255. */
9117 if (offset & (size - 1))
9118 {
9119 /* BLKmode typically uses LDP of X-registers. */
9120 if (mode == BLKmode)
9121 return (offset + 512) & ~0x3ff;
9122 return (offset + 0x100) & ~0x1ff;
9123 }
9124
9125 /* Small negative offsets are supported. */
9126 if (IN_RANGE (offset, -256, 0))
9127 return 0;
9128
9129 if (mode == TImode || mode == TFmode)
9130 return (offset + 0x100) & ~0x1ff;
9131
9132 /* Use a 12-bit unsigned offset, scaled by the access size. */
9133 return offset & (~0xfff * size);
9134 }
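/* Some illustrative values for the function above: a 4-byte (SImode)
   access at offset 0x12340 anchors at 0x12340 & ~0x3fff == 0x10000,
   leaving an in-range offset of 0x2340; a misaligned 4-byte access at
   offset 0x301 anchors at (0x301 + 0x100) & ~0x1ff == 0x400, leaving
   -0xff, which fits in the -256..255 range.  */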
9135
9136 static rtx
9137 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9138 {
9139 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9140 where mask is selected by alignment and size of the offset.
9141 We try to pick as large a range for the offset as possible to
9142 maximize the chance of a CSE. However, for aligned addresses
9143 we limit the range to 4k so that structures with different sized
9144 elements are likely to use the same base. We need to be careful
9145 not to split a CONST for some forms of address expression, otherwise
9146 it will generate sub-optimal code. */
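/* For example (illustrative only): for a 4-byte access to X + 0x12340
   we emit Y = X + 0x10000 and address the memory as Y + 0x2340, so
   nearby accesses can CSE the shared base Y.  */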
9147
9148 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9149 {
9150 rtx base = XEXP (x, 0);
9151 rtx offset_rtx = XEXP (x, 1);
9152 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9153
9154 if (GET_CODE (base) == PLUS)
9155 {
9156 rtx op0 = XEXP (base, 0);
9157 rtx op1 = XEXP (base, 1);
9158
9159 /* Force any scaling into a temp for CSE. */
9160 op0 = force_reg (Pmode, op0);
9161 op1 = force_reg (Pmode, op1);
9162
9163 /* Let the pointer register be in op0. */
9164 if (REG_POINTER (op1))
9165 std::swap (op0, op1);
9166
9167 /* If the pointer is virtual or frame related, then we know that
9168 virtual register instantiation or register elimination is going
9169 to apply a second constant. We want the two constants folded
9170 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9171 if (virt_or_elim_regno_p (REGNO (op0)))
9172 {
9173 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9174 NULL_RTX, true, OPTAB_DIRECT);
9175 return gen_rtx_PLUS (Pmode, base, op1);
9176 }
9177
9178 /* Otherwise, in order to encourage CSE (and thence loop strength
9179 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9180 base = expand_binop (Pmode, add_optab, op0, op1,
9181 NULL_RTX, true, OPTAB_DIRECT);
9182 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9183 }
9184
9185 HOST_WIDE_INT size;
9186 if (GET_MODE_SIZE (mode).is_constant (&size))
9187 {
9188 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9189 mode);
9190 if (base_offset != 0)
9191 {
9192 base = plus_constant (Pmode, base, base_offset);
9193 base = force_operand (base, NULL_RTX);
9194 return plus_constant (Pmode, base, offset - base_offset);
9195 }
9196 }
9197 }
9198
9199 return x;
9200 }
9201
9202 static reg_class_t
9203 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9204 reg_class_t rclass,
9205 machine_mode mode,
9206 secondary_reload_info *sri)
9207 {
9208 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9209 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9210 comment at the head of aarch64-sve.md for more details about the
9211 big-endian handling. */
9212 if (BYTES_BIG_ENDIAN
9213 && reg_class_subset_p (rclass, FP_REGS)
9214 && !((REG_P (x) && HARD_REGISTER_P (x))
9215 || aarch64_simd_valid_immediate (x, NULL))
9216 && aarch64_sve_data_mode_p (mode))
9217 {
9218 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9219 return NO_REGS;
9220 }
9221
9222 /* If we have to disable direct literal pool loads and stores because the
9223 function is too big, then we need a scratch register. */
9224 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9225 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9226 || targetm.vector_mode_supported_p (GET_MODE (x)))
9227 && !aarch64_pcrelative_literal_loads)
9228 {
9229 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9230 return NO_REGS;
9231 }
9232
9233 /* Without the TARGET_SIMD instructions we cannot move a Q register
9234 to a Q register directly. We need a scratch. */
9235 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9236 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9237 && reg_class_subset_p (rclass, FP_REGS))
9238 {
9239 sri->icode = code_for_aarch64_reload_mov (mode);
9240 return NO_REGS;
9241 }
9242
9243 /* A TFmode or TImode memory access should be handled via an FP_REG
9244 because AArch64 has richer addressing modes for LDR/STR instructions
9245 than LDP/STP instructions. */
9246 if (TARGET_FLOAT && rclass == GENERAL_REGS
9247 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9248 return FP_REGS;
9249
9250 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9251 return GENERAL_REGS;
9252
9253 return NO_REGS;
9254 }
9255
9256 static bool
9257 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9258 {
9259 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9260
9261 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9262 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9263 if (frame_pointer_needed)
9264 return to == HARD_FRAME_POINTER_REGNUM;
9265 return true;
9266 }
9267
9268 poly_int64
9269 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9270 {
9271 if (to == HARD_FRAME_POINTER_REGNUM)
9272 {
9273 if (from == ARG_POINTER_REGNUM)
9274 return cfun->machine->frame.hard_fp_offset;
9275
9276 if (from == FRAME_POINTER_REGNUM)
9277 return cfun->machine->frame.hard_fp_offset
9278 - cfun->machine->frame.locals_offset;
9279 }
9280
9281 if (to == STACK_POINTER_REGNUM)
9282 {
9283 if (from == FRAME_POINTER_REGNUM)
9284 return cfun->machine->frame.frame_size
9285 - cfun->machine->frame.locals_offset;
9286 }
9287
9288 return cfun->machine->frame.frame_size;
9289 }
9290
9291 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9292 previous frame. */
9293
9294 rtx
9295 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9296 {
9297 if (count != 0)
9298 return const0_rtx;
9299 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9300 }
9301
9302
9303 static void
9304 aarch64_asm_trampoline_template (FILE *f)
9305 {
9306 int offset1 = 16;
9307 int offset2 = 20;
9308
9309 if (aarch64_bti_enabled ())
9310 {
9311 asm_fprintf (f, "\thint\t34 // bti c\n");
9312 offset1 -= 4;
9313 offset2 -= 4;
9314 }
9315
9316 if (TARGET_ILP32)
9317 {
9318 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9319 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9320 offset1);
9321 }
9322 else
9323 {
9324 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9325 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9326 offset2);
9327 }
9328 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9329
9330 /* The trampoline needs an extra padding instruction. If BTI is
9331 enabled, the padding instruction is replaced by the BTI instruction
9332 at the beginning. */
9333 if (!aarch64_bti_enabled ())
9334 assemble_aligned_integer (4, const0_rtx);
9335
9336 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9337 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9338 }
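/* As a rough sketch (LP64, no BTI), the template emitted above is:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding
	.xword	0		// overwritten with the function address
	.xword	0		// overwritten with the static chain

   where x17 is IP1_REGNUM and x18 is STATIC_CHAIN_REGNUM; the trailing
   words are filled in per instance by aarch64_trampoline_init below.  */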
9339
9340 static void
9341 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9342 {
9343 rtx fnaddr, mem, a_tramp;
9344 const int tramp_code_sz = 16;
9345
9346 /* We don't need to copy the trailing D-words; we fill those in below. */
9347 emit_block_move (m_tramp, assemble_trampoline_template (),
9348 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9349 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9350 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9351 if (GET_MODE (fnaddr) != ptr_mode)
9352 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9353 emit_move_insn (mem, fnaddr);
9354
9355 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9356 emit_move_insn (mem, chain_value);
9357
9358 /* XXX We should really define a "clear_cache" pattern and use
9359 gen_clear_cache(). */
9360 a_tramp = XEXP (m_tramp, 0);
9361 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9362 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9363 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9364 ptr_mode);
9365 }
9366
9367 static unsigned char
9368 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9369 {
9370 /* ??? Logically we should only need to provide a value when
9371 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9372 can hold MODE, but at the moment we need to handle all modes.
9373 Just ignore any runtime parts for registers that can't store them. */
9374 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9375 unsigned int nregs;
9376 switch (regclass)
9377 {
9378 case TAILCALL_ADDR_REGS:
9379 case POINTER_REGS:
9380 case GENERAL_REGS:
9381 case ALL_REGS:
9382 case POINTER_AND_FP_REGS:
9383 case FP_REGS:
9384 case FP_LO_REGS:
9385 case FP_LO8_REGS:
9386 if (aarch64_sve_data_mode_p (mode)
9387 && constant_multiple_p (GET_MODE_SIZE (mode),
9388 BYTES_PER_SVE_VECTOR, &nregs))
9389 return nregs;
9390 return (aarch64_vector_data_mode_p (mode)
9391 ? CEIL (lowest_size, UNITS_PER_VREG)
9392 : CEIL (lowest_size, UNITS_PER_WORD));
9393 case STACK_REG:
9394 case PR_REGS:
9395 case PR_LO_REGS:
9396 case PR_HI_REGS:
9397 return 1;
9398
9399 case NO_REGS:
9400 return 0;
9401
9402 default:
9403 break;
9404 }
9405 gcc_unreachable ();
9406 }
9407
9408 static reg_class_t
9409 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9410 {
9411 if (regclass == POINTER_REGS)
9412 return GENERAL_REGS;
9413
9414 if (regclass == STACK_REG)
9415 {
9416 if (REG_P(x)
9417 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9418 return regclass;
9419
9420 return NO_REGS;
9421 }
9422
9423 /* Register elimination can result in a request for
9424 SP+constant->FP_REGS. We cannot support such operations, which
9425 use SP as the source and an FP_REG as the destination, so reject
9426 them right away. */
9427 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9428 {
9429 rtx lhs = XEXP (x, 0);
9430
9431 /* Look through a possible SUBREG introduced by ILP32. */
9432 if (GET_CODE (lhs) == SUBREG)
9433 lhs = SUBREG_REG (lhs);
9434
9435 gcc_assert (REG_P (lhs));
9436 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9437 POINTER_REGS));
9438 return NO_REGS;
9439 }
9440
9441 return regclass;
9442 }
9443
9444 void
9445 aarch64_asm_output_labelref (FILE* f, const char *name)
9446 {
9447 asm_fprintf (f, "%U%s", name);
9448 }
9449
9450 static void
9451 aarch64_elf_asm_constructor (rtx symbol, int priority)
9452 {
9453 if (priority == DEFAULT_INIT_PRIORITY)
9454 default_ctor_section_asm_out_constructor (symbol, priority);
9455 else
9456 {
9457 section *s;
9458 /* While priority is known to be in range [0, 65535], so 18 bytes
9459 would be enough, the compiler might not know that. To avoid
9460 -Wformat-truncation false positive, use a larger size. */
9461 char buf[23];
9462 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9463 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9464 switch_to_section (s);
9465 assemble_align (POINTER_SIZE);
9466 assemble_aligned_integer (POINTER_BYTES, symbol);
9467 }
9468 }
9469
9470 static void
9471 aarch64_elf_asm_destructor (rtx symbol, int priority)
9472 {
9473 if (priority == DEFAULT_INIT_PRIORITY)
9474 default_dtor_section_asm_out_destructor (symbol, priority);
9475 else
9476 {
9477 section *s;
9478 /* While priority is known to be in range [0, 65535], so 18 bytes
9479 would be enough, the compiler might not know that. To avoid
9480 -Wformat-truncation false positive, use a larger size. */
9481 char buf[23];
9482 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9483 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9484 switch_to_section (s);
9485 assemble_align (POINTER_SIZE);
9486 assemble_aligned_integer (POINTER_BYTES, symbol);
9487 }
9488 }
9489
9490 const char*
9491 aarch64_output_casesi (rtx *operands)
9492 {
9493 char buf[100];
9494 char label[100];
9495 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9496 int index;
9497 static const char *const patterns[4][2] =
9498 {
9499 {
9500 "ldrb\t%w3, [%0,%w1,uxtw]",
9501 "add\t%3, %4, %w3, sxtb #2"
9502 },
9503 {
9504 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9505 "add\t%3, %4, %w3, sxth #2"
9506 },
9507 {
9508 "ldr\t%w3, [%0,%w1,uxtw #2]",
9509 "add\t%3, %4, %w3, sxtw #2"
9510 },
9511 /* We assume that DImode is only generated when not optimizing and
9512 that we don't really need 64-bit address offsets. That would
9513 imply an object file with 8GB of code in a single function! */
9514 {
9515 "ldr\t%w3, [%0,%w1,uxtw #2]",
9516 "add\t%3, %4, %w3, sxtw #2"
9517 }
9518 };
9519
9520 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9521
9522 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9523 index = exact_log2 (GET_MODE_SIZE (mode));
9524
9525 gcc_assert (index >= 0 && index <= 3);
9526
9527 /* Need to implement table size reduction, by changing the code below. */
9528 output_asm_insn (patterns[index][0], operands);
9529 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9530 snprintf (buf, sizeof (buf),
9531 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9532 output_asm_insn (buf, operands);
9533 output_asm_insn (patterns[index][1], operands);
9534 output_asm_insn ("br\t%3", operands);
9535 assemble_label (asm_out_file, label);
9536 return "";
9537 }
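/* For illustration, with a HImode dispatch table the code above emits a
   sequence of the shape:

	ldrh	w3, [x0, w1, uxtw #1]	// load the table entry
	adr	x4, .Lrtx<N>		// table base (label emitted below)
	add	x3, x4, w3, sxth #2	// scale the entry and add the base
	br	x3
   .Lrtx<N>:

   with the actual registers taken from OPERANDS.  */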
9538
9539
9540 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9541 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9542 operator. */
9543
9544 int
9545 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9546 {
9547 if (shift >= 0 && shift <= 3)
9548 {
9549 int size;
9550 for (size = 8; size <= 32; size *= 2)
9551 {
9552 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9553 if (mask == bits << shift)
9554 return size;
9555 }
9556 }
9557 return 0;
9558 }
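/* Illustrative values: aarch64_uxt_size (0, 0xff) == 8 (UXTB),
   aarch64_uxt_size (1, 0x1fe) == 8 (UXTB with LSL #1),
   aarch64_uxt_size (0, 0xffff) == 16 (UXTH), and a mask that is not a
   shifted 8/16/32-bit block, such as 0x7f, yields 0.  */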
9559
9560 /* Constant pools are per-function only when PC-relative
9561 literal loads are enabled or we are in the large memory
9562 model. */
9563
9564 static inline bool
9565 aarch64_can_use_per_function_literal_pools_p (void)
9566 {
9567 return (aarch64_pcrelative_literal_loads
9568 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9569 }
9570
9571 static bool
9572 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9573 {
9574 /* We can't use blocks for constants when we're using a per-function
9575 constant pool. */
9576 return !aarch64_can_use_per_function_literal_pools_p ();
9577 }
9578
9579 /* Select appropriate section for constants depending
9580 on where we place literal pools. */
9581
9582 static section *
9583 aarch64_select_rtx_section (machine_mode mode,
9584 rtx x,
9585 unsigned HOST_WIDE_INT align)
9586 {
9587 if (aarch64_can_use_per_function_literal_pools_p ())
9588 return function_section (current_function_decl);
9589
9590 return default_elf_select_rtx_section (mode, x, align);
9591 }
9592
9593 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9594 void
9595 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9596 HOST_WIDE_INT offset)
9597 {
9598 /* When using per-function literal pools, we must ensure that any code
9599 section is aligned to the minimal instruction length, lest we get
9600 errors from the assembler re "unaligned instructions". */
9601 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9602 ASM_OUTPUT_ALIGN (f, 2);
9603 }
9604
9605 /* Costs. */
9606
9607 /* Helper function for rtx cost calculation. Strip a shift expression
9608 from X. Returns the inner operand if successful, or the original
9609 expression on failure. */
9610 static rtx
9611 aarch64_strip_shift (rtx x)
9612 {
9613 rtx op = x;
9614
9615 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9616 we can convert both to ROR during final output. */
9617 if ((GET_CODE (op) == ASHIFT
9618 || GET_CODE (op) == ASHIFTRT
9619 || GET_CODE (op) == LSHIFTRT
9620 || GET_CODE (op) == ROTATERT
9621 || GET_CODE (op) == ROTATE)
9622 && CONST_INT_P (XEXP (op, 1)))
9623 return XEXP (op, 0);
9624
9625 if (GET_CODE (op) == MULT
9626 && CONST_INT_P (XEXP (op, 1))
9627 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9628 return XEXP (op, 0);
9629
9630 return x;
9631 }
9632
9633 /* Helper function for rtx cost calculation. Strip an extend
9634 expression from X. Returns the inner operand if successful, or the
9635 original expression on failure. We deal with a number of possible
9636 canonicalization variations here. If STRIP_SHIFT is true, then
9637 we can strip off a shift also. */
9638 static rtx
9639 aarch64_strip_extend (rtx x, bool strip_shift)
9640 {
9641 scalar_int_mode mode;
9642 rtx op = x;
9643
9644 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9645 return op;
9646
9647 /* Zero and sign extraction of a widened value. */
9648 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9649 && XEXP (op, 2) == const0_rtx
9650 && GET_CODE (XEXP (op, 0)) == MULT
9651 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9652 XEXP (op, 1)))
9653 return XEXP (XEXP (op, 0), 0);
9654
9655 /* It can also be represented (for zero-extend) as an AND with an
9656 immediate. */
9657 if (GET_CODE (op) == AND
9658 && GET_CODE (XEXP (op, 0)) == MULT
9659 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9660 && CONST_INT_P (XEXP (op, 1))
9661 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9662 INTVAL (XEXP (op, 1))) != 0)
9663 return XEXP (XEXP (op, 0), 0);
9664
9665 /* Now handle extended register, as this may also have an optional
9666 left shift by 1..4. */
9667 if (strip_shift
9668 && GET_CODE (op) == ASHIFT
9669 && CONST_INT_P (XEXP (op, 1))
9670 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9671 op = XEXP (op, 0);
9672
9673 if (GET_CODE (op) == ZERO_EXTEND
9674 || GET_CODE (op) == SIGN_EXTEND)
9675 op = XEXP (op, 0);
9676
9677 if (op != x)
9678 return op;
9679
9680 return x;
9681 }
9682
9683 /* Return true iff CODE is a shift supported in combination
9684 with arithmetic instructions. */
9685
9686 static bool
9687 aarch64_shift_p (enum rtx_code code)
9688 {
9689 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9690 }
9691
9692
9693 /* Return true iff X is a cheap shift without a sign extend. */
9694
9695 static bool
9696 aarch64_cheap_mult_shift_p (rtx x)
9697 {
9698 rtx op0, op1;
9699
9700 op0 = XEXP (x, 0);
9701 op1 = XEXP (x, 1);
9702
9703 if (!(aarch64_tune_params.extra_tuning_flags
9704 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9705 return false;
9706
9707 if (GET_CODE (op0) == SIGN_EXTEND)
9708 return false;
9709
9710 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9711 && UINTVAL (op1) <= 4)
9712 return true;
9713
9714 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9715 return false;
9716
9717 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9718
9719 if (l2 > 0 && l2 <= 4)
9720 return true;
9721
9722 return false;
9723 }
9724
9725 /* Helper function for rtx cost calculation. Calculate the cost of
9726 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9727 Return the calculated cost of the expression, recursing manually in to
9728 operands where needed. */
9729
9730 static int
9731 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9732 {
9733 rtx op0, op1;
9734 const struct cpu_cost_table *extra_cost
9735 = aarch64_tune_params.insn_extra_cost;
9736 int cost = 0;
9737 bool compound_p = (outer == PLUS || outer == MINUS);
9738 machine_mode mode = GET_MODE (x);
9739
9740 gcc_checking_assert (code == MULT);
9741
9742 op0 = XEXP (x, 0);
9743 op1 = XEXP (x, 1);
9744
9745 if (VECTOR_MODE_P (mode))
9746 mode = GET_MODE_INNER (mode);
9747
9748 /* Integer multiply/fma. */
9749 if (GET_MODE_CLASS (mode) == MODE_INT)
9750 {
9751 /* The multiply will be canonicalized as a shift, cost it as such. */
9752 if (aarch64_shift_p (GET_CODE (x))
9753 || (CONST_INT_P (op1)
9754 && exact_log2 (INTVAL (op1)) > 0))
9755 {
9756 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9757 || GET_CODE (op0) == SIGN_EXTEND;
9758 if (speed)
9759 {
9760 if (compound_p)
9761 {
9762 /* If the shift is considered cheap,
9763 then don't add any cost. */
9764 if (aarch64_cheap_mult_shift_p (x))
9765 ;
9766 else if (REG_P (op1))
9767 /* ARITH + shift-by-register. */
9768 cost += extra_cost->alu.arith_shift_reg;
9769 else if (is_extend)
9770 /* ARITH + extended register. We don't have a cost field
9771 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9772 cost += extra_cost->alu.extend_arith;
9773 else
9774 /* ARITH + shift-by-immediate. */
9775 cost += extra_cost->alu.arith_shift;
9776 }
9777 else
9778 /* LSL (immediate). */
9779 cost += extra_cost->alu.shift;
9780
9781 }
9782 /* Strip extends as we will have costed them in the case above. */
9783 if (is_extend)
9784 op0 = aarch64_strip_extend (op0, true);
9785
9786 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9787
9788 return cost;
9789 }
9790
9791 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9792 compound and let the below cases handle it. After all, MNEG is a
9793 special-case alias of MSUB. */
9794 if (GET_CODE (op0) == NEG)
9795 {
9796 op0 = XEXP (op0, 0);
9797 compound_p = true;
9798 }
9799
9800 /* Integer multiplies or FMAs have zero/sign extending variants. */
9801 if ((GET_CODE (op0) == ZERO_EXTEND
9802 && GET_CODE (op1) == ZERO_EXTEND)
9803 || (GET_CODE (op0) == SIGN_EXTEND
9804 && GET_CODE (op1) == SIGN_EXTEND))
9805 {
9806 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9807 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9808
9809 if (speed)
9810 {
9811 if (compound_p)
9812 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9813 cost += extra_cost->mult[0].extend_add;
9814 else
9815 /* MUL/SMULL/UMULL. */
9816 cost += extra_cost->mult[0].extend;
9817 }
9818
9819 return cost;
9820 }
9821
9822 /* This is either an integer multiply or a MADD. In both cases
9823 we want to recurse and cost the operands. */
9824 cost += rtx_cost (op0, mode, MULT, 0, speed);
9825 cost += rtx_cost (op1, mode, MULT, 1, speed);
9826
9827 if (speed)
9828 {
9829 if (compound_p)
9830 /* MADD/MSUB. */
9831 cost += extra_cost->mult[mode == DImode].add;
9832 else
9833 /* MUL. */
9834 cost += extra_cost->mult[mode == DImode].simple;
9835 }
9836
9837 return cost;
9838 }
9839 else
9840 {
9841 if (speed)
9842 {
9843 /* Floating-point FMA/FMUL can also support negations of the
9844 operands, unless the rounding mode is upward or downward in
9845 which case FNMUL is different from FMUL with operand negation. */
9846 bool neg0 = GET_CODE (op0) == NEG;
9847 bool neg1 = GET_CODE (op1) == NEG;
9848 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9849 {
9850 if (neg0)
9851 op0 = XEXP (op0, 0);
9852 if (neg1)
9853 op1 = XEXP (op1, 0);
9854 }
9855
9856 if (compound_p)
9857 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9858 cost += extra_cost->fp[mode == DFmode].fma;
9859 else
9860 /* FMUL/FNMUL. */
9861 cost += extra_cost->fp[mode == DFmode].mult;
9862 }
9863
9864 cost += rtx_cost (op0, mode, MULT, 0, speed);
9865 cost += rtx_cost (op1, mode, MULT, 1, speed);
9866 return cost;
9867 }
9868 }
9869
9870 static int
9871 aarch64_address_cost (rtx x,
9872 machine_mode mode,
9873 addr_space_t as ATTRIBUTE_UNUSED,
9874 bool speed)
9875 {
9876 enum rtx_code c = GET_CODE (x);
9877 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9878 struct aarch64_address_info info;
9879 int cost = 0;
9880 info.shift = 0;
9881
9882 if (!aarch64_classify_address (&info, x, mode, false))
9883 {
9884 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9885 {
9886 /* This is a CONST or SYMBOL ref which will be split
9887 in a different way depending on the code model in use.
9888 Cost it through the generic infrastructure. */
9889 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9890 /* Divide through by the cost of one instruction to
9891 bring it to the same units as the address costs. */
9892 cost_symbol_ref /= COSTS_N_INSNS (1);
9893 /* The cost is then the cost of preparing the address,
9894 followed by an immediate (possibly 0) offset. */
9895 return cost_symbol_ref + addr_cost->imm_offset;
9896 }
9897 else
9898 {
9899 /* This is most likely a jump table from a case
9900 statement. */
9901 return addr_cost->register_offset;
9902 }
9903 }
9904
9905 switch (info.type)
9906 {
9907 case ADDRESS_LO_SUM:
9908 case ADDRESS_SYMBOLIC:
9909 case ADDRESS_REG_IMM:
9910 cost += addr_cost->imm_offset;
9911 break;
9912
9913 case ADDRESS_REG_WB:
9914 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9915 cost += addr_cost->pre_modify;
9916 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9917 cost += addr_cost->post_modify;
9918 else
9919 gcc_unreachable ();
9920
9921 break;
9922
9923 case ADDRESS_REG_REG:
9924 cost += addr_cost->register_offset;
9925 break;
9926
9927 case ADDRESS_REG_SXTW:
9928 cost += addr_cost->register_sextend;
9929 break;
9930
9931 case ADDRESS_REG_UXTW:
9932 cost += addr_cost->register_zextend;
9933 break;
9934
9935 default:
9936 gcc_unreachable ();
9937 }
9938
9939
9940 if (info.shift > 0)
9941 {
9942 /* For the sake of calculating the cost of the shifted register
9943 component, we can treat same sized modes in the same way. */
9944 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9945 cost += addr_cost->addr_scale_costs.hi;
9946 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9947 cost += addr_cost->addr_scale_costs.si;
9948 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9949 cost += addr_cost->addr_scale_costs.di;
9950 else
9951 /* We can't tell, or this is a 128-bit vector. */
9952 cost += addr_cost->addr_scale_costs.ti;
9953 }
9954
9955 return cost;
9956 }
9957
9958 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9959 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9960 to be taken. */
9961
9962 int
9963 aarch64_branch_cost (bool speed_p, bool predictable_p)
9964 {
9965 /* When optimizing for speed, use the cost of unpredictable branches. */
9966 const struct cpu_branch_cost *branch_costs =
9967 aarch64_tune_params.branch_costs;
9968
9969 if (!speed_p || predictable_p)
9970 return branch_costs->predictable;
9971 else
9972 return branch_costs->unpredictable;
9973 }
9974
9975 /* Return true if the RTX X in mode MODE is a zero or sign extract
9976 usable in an ADD or SUB (extended register) instruction. */
9977 static bool
9978 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9979 {
9980 /* Catch add with a sign extract.
9981 This is add_<optab><mode>_multp2. */
9982 if (GET_CODE (x) == SIGN_EXTRACT
9983 || GET_CODE (x) == ZERO_EXTRACT)
9984 {
9985 rtx op0 = XEXP (x, 0);
9986 rtx op1 = XEXP (x, 1);
9987 rtx op2 = XEXP (x, 2);
9988
9989 if (GET_CODE (op0) == MULT
9990 && CONST_INT_P (op1)
9991 && op2 == const0_rtx
9992 && CONST_INT_P (XEXP (op0, 1))
9993 && aarch64_is_extend_from_extract (mode,
9994 XEXP (op0, 1),
9995 op1))
9996 {
9997 return true;
9998 }
9999 }
10000 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10001 No shift. */
10002 else if (GET_CODE (x) == SIGN_EXTEND
10003 || GET_CODE (x) == ZERO_EXTEND)
10004 return REG_P (XEXP (x, 0));
10005
10006 return false;
10007 }
10008
10009 static bool
10010 aarch64_frint_unspec_p (unsigned int u)
10011 {
10012 switch (u)
10013 {
10014 case UNSPEC_FRINTZ:
10015 case UNSPEC_FRINTP:
10016 case UNSPEC_FRINTM:
10017 case UNSPEC_FRINTA:
10018 case UNSPEC_FRINTN:
10019 case UNSPEC_FRINTX:
10020 case UNSPEC_FRINTI:
10021 return true;
10022
10023 default:
10024 return false;
10025 }
10026 }
10027
10028 /* Return true iff X is an rtx that will match an extr instruction
10029 i.e. as described in the *extr<mode>5_insn family of patterns.
10030 OP0 and OP1 will be set to the operands of the shifts involved
10031 on success and will be NULL_RTX otherwise. */
10032
10033 static bool
10034 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10035 {
10036 rtx op0, op1;
10037 scalar_int_mode mode;
10038 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10039 return false;
10040
10041 *res_op0 = NULL_RTX;
10042 *res_op1 = NULL_RTX;
10043
10044 if (GET_CODE (x) != IOR)
10045 return false;
10046
10047 op0 = XEXP (x, 0);
10048 op1 = XEXP (x, 1);
10049
10050 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10051 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10052 {
10053 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10054 if (GET_CODE (op1) == ASHIFT)
10055 std::swap (op0, op1);
10056
10057 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10058 return false;
10059
10060 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10061 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10062
10063 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10064 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10065 {
10066 *res_op0 = XEXP (op0, 0);
10067 *res_op1 = XEXP (op1, 0);
10068 return true;
10069 }
10070 }
10071
10072 return false;
10073 }
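/* As an illustrative example: in DImode,
   (ior (ashift x (const_int 16)) (lshiftrt y (const_int 48)))
   satisfies the check above (16 + 48 == 64) and would typically be
   emitted as "extr xd, x, y, #48".  */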
10074
10075 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10076 storing it in *COST. Result is true if the total cost of the operation
10077 has now been calculated. */
10078 static bool
10079 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10080 {
10081 rtx inner;
10082 rtx comparator;
10083 enum rtx_code cmpcode;
10084
10085 if (COMPARISON_P (op0))
10086 {
10087 inner = XEXP (op0, 0);
10088 comparator = XEXP (op0, 1);
10089 cmpcode = GET_CODE (op0);
10090 }
10091 else
10092 {
10093 inner = op0;
10094 comparator = const0_rtx;
10095 cmpcode = NE;
10096 }
10097
10098 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10099 {
10100 /* Conditional branch. */
10101 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10102 return true;
10103 else
10104 {
10105 if (cmpcode == NE || cmpcode == EQ)
10106 {
10107 if (comparator == const0_rtx)
10108 {
10109 /* TBZ/TBNZ/CBZ/CBNZ. */
10110 if (GET_CODE (inner) == ZERO_EXTRACT)
10111 /* TBZ/TBNZ. */
10112 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10113 ZERO_EXTRACT, 0, speed);
10114 else
10115 /* CBZ/CBNZ. */
10116 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10117
10118 return true;
10119 }
10120 }
10121 else if (cmpcode == LT || cmpcode == GE)
10122 {
10123 /* TBZ/TBNZ. */
10124 if (comparator == const0_rtx)
10125 return true;
10126 }
10127 }
10128 }
10129 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10130 {
10131 /* CCMP. */
10132 if (GET_CODE (op1) == COMPARE)
10133 {
10134 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10135 if (XEXP (op1, 1) == const0_rtx)
10136 *cost += 1;
10137 if (speed)
10138 {
10139 machine_mode mode = GET_MODE (XEXP (op1, 0));
10140 const struct cpu_cost_table *extra_cost
10141 = aarch64_tune_params.insn_extra_cost;
10142
10143 if (GET_MODE_CLASS (mode) == MODE_INT)
10144 *cost += extra_cost->alu.arith;
10145 else
10146 *cost += extra_cost->fp[mode == DFmode].compare;
10147 }
10148 return true;
10149 }
10150
10151 /* It's a conditional operation based on the status flags,
10152 so it must be some flavor of CSEL. */
10153
10154 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10155 if (GET_CODE (op1) == NEG
10156 || GET_CODE (op1) == NOT
10157 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10158 op1 = XEXP (op1, 0);
10159 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10160 {
10161 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10162 op1 = XEXP (op1, 0);
10163 op2 = XEXP (op2, 0);
10164 }
10165
10166 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10167 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10168 return true;
10169 }
10170
10171 /* We don't know what this is; cost all operands. */
10172 return false;
10173 }
10174
10175 /* Check whether X is a bitfield operation of the form shift + extend that
10176 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10177 operand to which the bitfield operation is applied. Otherwise return
10178 NULL_RTX. */
10179
10180 static rtx
10181 aarch64_extend_bitfield_pattern_p (rtx x)
10182 {
10183 rtx_code outer_code = GET_CODE (x);
10184 machine_mode outer_mode = GET_MODE (x);
10185
10186 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10187 && outer_mode != SImode && outer_mode != DImode)
10188 return NULL_RTX;
10189
10190 rtx inner = XEXP (x, 0);
10191 rtx_code inner_code = GET_CODE (inner);
10192 machine_mode inner_mode = GET_MODE (inner);
10193 rtx op = NULL_RTX;
10194
10195 switch (inner_code)
10196 {
10197 case ASHIFT:
10198 if (CONST_INT_P (XEXP (inner, 1))
10199 && (inner_mode == QImode || inner_mode == HImode))
10200 op = XEXP (inner, 0);
10201 break;
10202 case LSHIFTRT:
10203 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10204 && (inner_mode == QImode || inner_mode == HImode))
10205 op = XEXP (inner, 0);
10206 break;
10207 case ASHIFTRT:
10208 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10209 && (inner_mode == QImode || inner_mode == HImode))
10210 op = XEXP (inner, 0);
10211 break;
10212 default:
10213 break;
10214 }
10215
10216 return op;
10217 }
10218
10219 /* Return true if the mask and a shift amount from an RTX of the form
10220 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10221 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10222
10223 bool
10224 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10225 rtx shft_amnt)
10226 {
10227 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10228 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10229 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10230 && (INTVAL (mask)
10231 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10232 }
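/* An illustrative accepted case: in SImode, mask == 0xff00 and
   shft_amnt == 8 pass the checks above ((0xff00 >> 8) + 1 is a power
   of two and the low 8 mask bits are clear), matching a UBFIZ of an
   8-bit field placed at bit 8.  */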
10233
10234 /* Return true if the masks and a shift amount from an RTX of the form
10235 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10236 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10237
10238 bool
10239 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10240 unsigned HOST_WIDE_INT mask1,
10241 unsigned HOST_WIDE_INT shft_amnt,
10242 unsigned HOST_WIDE_INT mask2)
10243 {
10244 unsigned HOST_WIDE_INT t;
10245
10246 /* Verify that there is no overlap in what bits are set in the two masks. */
10247 if (mask1 != ~mask2)
10248 return false;
10249
10250 /* Verify that mask2 is not all zeros or ones. */
10251 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10252 return false;
10253
10254 /* The shift amount should always be less than the mode size. */
10255 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10256
10257 /* Verify that the mask being shifted is contiguous and would be in the
10258 least significant bits after shifting by shft_amnt. */
10259 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10260 return (t == (t & -t));
10261 }
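/* An illustrative accepted case: mask2 == 0xff00, shft_amnt == 8 and
   mask1 == ~(unsigned HOST_WIDE_INT) 0xff00.  Then
   t == 0xff00 + 0x100 == 0x10000, a power of two, so the combination
   corresponds to a BFI inserting an 8-bit field at bit 8.  */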
10262
10263 /* Calculate the cost of calculating X, storing it in *COST. Result
10264 is true if the total cost of the operation has now been calculated. */
10265 static bool
10266 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10267 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10268 {
10269 rtx op0, op1, op2;
10270 const struct cpu_cost_table *extra_cost
10271 = aarch64_tune_params.insn_extra_cost;
10272 int code = GET_CODE (x);
10273 scalar_int_mode int_mode;
10274
10275 /* By default, assume that everything has equivalent cost to the
10276 cheapest instruction. Any additional costs are applied as a delta
10277 above this default. */
10278 *cost = COSTS_N_INSNS (1);
10279
10280 switch (code)
10281 {
10282 case SET:
10283 /* The cost depends entirely on the operands to SET. */
10284 *cost = 0;
10285 op0 = SET_DEST (x);
10286 op1 = SET_SRC (x);
10287
10288 switch (GET_CODE (op0))
10289 {
10290 case MEM:
10291 if (speed)
10292 {
10293 rtx address = XEXP (op0, 0);
10294 if (VECTOR_MODE_P (mode))
10295 *cost += extra_cost->ldst.storev;
10296 else if (GET_MODE_CLASS (mode) == MODE_INT)
10297 *cost += extra_cost->ldst.store;
10298 else if (mode == SFmode)
10299 *cost += extra_cost->ldst.storef;
10300 else if (mode == DFmode)
10301 *cost += extra_cost->ldst.stored;
10302
10303 *cost +=
10304 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10305 0, speed));
10306 }
10307
10308 *cost += rtx_cost (op1, mode, SET, 1, speed);
10309 return true;
10310
10311 case SUBREG:
10312 if (! REG_P (SUBREG_REG (op0)))
10313 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10314
10315 /* Fall through. */
10316 case REG:
10317 /* The cost is one per vector-register copied. */
10318 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10319 {
10320 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10321 *cost = COSTS_N_INSNS (nregs);
10322 }
10323 /* const0_rtx is in general free, but we will use an
10324 instruction to set a register to 0. */
10325 else if (REG_P (op1) || op1 == const0_rtx)
10326 {
10327 /* The cost is 1 per register copied. */
10328 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10329 *cost = COSTS_N_INSNS (nregs);
10330 }
10331 else
10332 /* Cost is just the cost of the RHS of the set. */
10333 *cost += rtx_cost (op1, mode, SET, 1, speed);
10334 return true;
10335
10336 case ZERO_EXTRACT:
10337 case SIGN_EXTRACT:
10338 /* Bit-field insertion. Strip any redundant widening of
10339 the RHS to meet the width of the target. */
10340 if (GET_CODE (op1) == SUBREG)
10341 op1 = SUBREG_REG (op1);
10342 if ((GET_CODE (op1) == ZERO_EXTEND
10343 || GET_CODE (op1) == SIGN_EXTEND)
10344 && CONST_INT_P (XEXP (op0, 1))
10345 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10346 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10347 op1 = XEXP (op1, 0);
10348
10349 if (CONST_INT_P (op1))
10350 {
10351 /* MOV immediate is assumed to always be cheap. */
10352 *cost = COSTS_N_INSNS (1);
10353 }
10354 else
10355 {
10356 /* BFM. */
10357 if (speed)
10358 *cost += extra_cost->alu.bfi;
10359 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10360 }
10361
10362 return true;
10363
10364 default:
10365 /* We can't make sense of this, assume default cost. */
10366 *cost = COSTS_N_INSNS (1);
10367 return false;
10368 }
10369 return false;
10370
10371 case CONST_INT:
10372 /* If an instruction can incorporate a constant within the
10373 instruction, the instruction's expression avoids calling
10374 rtx_cost() on the constant. If rtx_cost() is called on a
10375 constant, then it is usually because the constant must be
10376 moved into a register by one or more instructions.
10377
10378 The exception is constant 0, which can be expressed
10379 as XZR/WZR and is therefore free. The exception to this is
10380 if we have (set (reg) (const0_rtx)) in which case we must cost
10381 the move. However, we can catch that when we cost the SET, so
10382 we don't need to consider that here. */
10383 if (x == const0_rtx)
10384 *cost = 0;
10385 else
10386 {
10387 /* To an approximation, building any other constant is
10388 proportionally expensive to the number of instructions
10389 required to build that constant. This is true whether we
10390 are compiling for SPEED or otherwise. */
10391 if (!is_a <scalar_int_mode> (mode, &int_mode))
10392 int_mode = word_mode;
10393 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10394 (NULL_RTX, x, false, int_mode));
10395 }
10396 return true;
10397
10398 case CONST_DOUBLE:
10399
10400 /* First determine number of instructions to do the move
10401 as an integer constant. */
10402 if (!aarch64_float_const_representable_p (x)
10403 && !aarch64_can_const_movi_rtx_p (x, mode)
10404 && aarch64_float_const_rtx_p (x))
10405 {
10406 unsigned HOST_WIDE_INT ival;
10407 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10408 gcc_assert (succeed);
10409
10410 scalar_int_mode imode = (mode == HFmode
10411 ? SImode
10412 : int_mode_for_mode (mode).require ());
10413 int ncost = aarch64_internal_mov_immediate
10414 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10415 *cost += COSTS_N_INSNS (ncost);
10416 return true;
10417 }
10418
10419 if (speed)
10420 {
10421 /* mov[df,sf]_aarch64. */
10422 if (aarch64_float_const_representable_p (x))
10423 /* FMOV (scalar immediate). */
10424 *cost += extra_cost->fp[mode == DFmode].fpconst;
10425 else if (!aarch64_float_const_zero_rtx_p (x))
10426 {
10427 /* This will be a load from memory. */
10428 if (mode == DFmode)
10429 *cost += extra_cost->ldst.loadd;
10430 else
10431 *cost += extra_cost->ldst.loadf;
10432 }
10433 else
10434 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10435 or MOV v0.s[0], wzr - neither of which is modeled by the
10436 cost tables. Just use the default cost. */
10437 {
10438 }
10439 }
10440
10441 return true;
10442
10443 case MEM:
10444 if (speed)
10445 {
10446 /* For loads we want the base cost of a load, plus an
10447 approximation for the additional cost of the addressing
10448 mode. */
10449 rtx address = XEXP (x, 0);
10450 if (VECTOR_MODE_P (mode))
10451 *cost += extra_cost->ldst.loadv;
10452 else if (GET_MODE_CLASS (mode) == MODE_INT)
10453 *cost += extra_cost->ldst.load;
10454 else if (mode == SFmode)
10455 *cost += extra_cost->ldst.loadf;
10456 else if (mode == DFmode)
10457 *cost += extra_cost->ldst.loadd;
10458
10459 *cost +=
10460 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10461 0, speed));
10462 }
10463
10464 return true;
10465
10466 case NEG:
10467 op0 = XEXP (x, 0);
10468
10469 if (VECTOR_MODE_P (mode))
10470 {
10471 if (speed)
10472 {
10473 /* FNEG. */
10474 *cost += extra_cost->vect.alu;
10475 }
10476 return false;
10477 }
10478
10479 if (GET_MODE_CLASS (mode) == MODE_INT)
10480 {
10481 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10482 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10483 {
10484 /* CSETM. */
10485 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10486 return true;
10487 }
10488
10489 /* Cost this as SUB wzr, X. */
10490 op0 = CONST0_RTX (mode);
10491 op1 = XEXP (x, 0);
10492 goto cost_minus;
10493 }
10494
10495 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10496 {
10497 /* Support (neg(fma...)) as a single instruction only if
10498 sign of zeros is unimportant. This matches the decision
10499 making in aarch64.md. */
10500 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10501 {
10502 /* FNMADD. */
10503 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10504 return true;
10505 }
10506 if (GET_CODE (op0) == MULT)
10507 {
10508 /* FNMUL. */
10509 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10510 return true;
10511 }
10512 if (speed)
10513 /* FNEG. */
10514 *cost += extra_cost->fp[mode == DFmode].neg;
10515 return false;
10516 }
10517
10518 return false;
10519
10520 case CLRSB:
10521 case CLZ:
10522 if (speed)
10523 {
10524 if (VECTOR_MODE_P (mode))
10525 *cost += extra_cost->vect.alu;
10526 else
10527 *cost += extra_cost->alu.clz;
10528 }
10529
10530 return false;
10531
10532 case COMPARE:
10533 op0 = XEXP (x, 0);
10534 op1 = XEXP (x, 1);
10535
10536 if (op1 == const0_rtx
10537 && GET_CODE (op0) == AND)
10538 {
10539 x = op0;
10540 mode = GET_MODE (op0);
10541 goto cost_logic;
10542 }
10543
10544 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10545 {
10546 /* TODO: A write to the CC flags possibly costs extra; this
10547 needs encoding in the cost tables. */
10548
10549 mode = GET_MODE (op0);
10550 /* ANDS. */
10551 if (GET_CODE (op0) == AND)
10552 {
10553 x = op0;
10554 goto cost_logic;
10555 }
10556
10557 if (GET_CODE (op0) == PLUS)
10558 {
10559 /* ADDS (and CMN alias). */
10560 x = op0;
10561 goto cost_plus;
10562 }
10563
10564 if (GET_CODE (op0) == MINUS)
10565 {
10566 /* SUBS. */
10567 x = op0;
10568 goto cost_minus;
10569 }
10570
10571 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10572 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10573 && CONST_INT_P (XEXP (op0, 2)))
10574 {
10575 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10576 Handle it here directly rather than going to cost_logic
10577 since we know the immediate generated for the TST is valid,
10578 so we can avoid creating an intermediate rtx for it only
10579 for costing purposes. */
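	    /* For example (register name purely illustrative), a COMPARE such as
		 (compare:CC_NZ (zero_extract:DI (reg:DI x0)
						 (const_int 8)
						 (const_int 0))
				(const_int 0))
	       is expected to be emitted as a single "tst x0, #0xff", hence
	       the one logical-instruction cost added below.  */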
10580 if (speed)
10581 *cost += extra_cost->alu.logical;
10582
10583 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10584 ZERO_EXTRACT, 0, speed);
10585 return true;
10586 }
10587
10588 if (GET_CODE (op1) == NEG)
10589 {
10590 /* CMN. */
10591 if (speed)
10592 *cost += extra_cost->alu.arith;
10593
10594 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10595 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10596 return true;
10597 }
10598
10599 /* CMP.
10600
10601 Compare can freely swap the order of operands, and
10602 canonicalization puts the more complex operation first.
10603 But the integer MINUS logic expects the shift/extend
10604 operation in op1. */
10605 if (! (REG_P (op0)
10606 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10607 {
10608 op0 = XEXP (x, 1);
10609 op1 = XEXP (x, 0);
10610 }
10611 goto cost_minus;
10612 }
10613
10614 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10615 {
10616 /* FCMP. */
10617 if (speed)
10618 *cost += extra_cost->fp[mode == DFmode].compare;
10619
10620 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10621 {
10622 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10623 /* FCMP supports constant 0.0 for no extra cost. */
10624 return true;
10625 }
10626 return false;
10627 }
10628
10629 if (VECTOR_MODE_P (mode))
10630 {
10631 /* Vector compare. */
10632 if (speed)
10633 *cost += extra_cost->vect.alu;
10634
10635 if (aarch64_float_const_zero_rtx_p (op1))
10636 {
10637 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10638 cost. */
10639 return true;
10640 }
10641 return false;
10642 }
10643 return false;
10644
10645 case MINUS:
10646 {
10647 op0 = XEXP (x, 0);
10648 op1 = XEXP (x, 1);
10649
10650 cost_minus:
10651 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10652
10653 /* Detect valid immediates. */
10654 if ((GET_MODE_CLASS (mode) == MODE_INT
10655 || (GET_MODE_CLASS (mode) == MODE_CC
10656 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10657 && CONST_INT_P (op1)
10658 && aarch64_uimm12_shift (INTVAL (op1)))
10659 {
10660 if (speed)
10661 /* SUB(S) (immediate). */
10662 *cost += extra_cost->alu.arith;
10663 return true;
10664 }
10665
10666 /* Look for SUB (extended register). */
10667 if (is_a <scalar_int_mode> (mode, &int_mode)
10668 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10669 {
10670 if (speed)
10671 *cost += extra_cost->alu.extend_arith;
10672
10673 op1 = aarch64_strip_extend (op1, true);
10674 *cost += rtx_cost (op1, VOIDmode,
10675 (enum rtx_code) GET_CODE (op1), 0, speed);
10676 return true;
10677 }
10678
10679 rtx new_op1 = aarch64_strip_extend (op1, false);
10680
10681 /* Cost this as an FMA-alike operation. */
10682 if ((GET_CODE (new_op1) == MULT
10683 || aarch64_shift_p (GET_CODE (new_op1)))
10684 && code != COMPARE)
10685 {
10686 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10687 (enum rtx_code) code,
10688 speed);
10689 return true;
10690 }
10691
10692 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10693
10694 if (speed)
10695 {
10696 if (VECTOR_MODE_P (mode))
10697 {
10698 /* Vector SUB. */
10699 *cost += extra_cost->vect.alu;
10700 }
10701 else if (GET_MODE_CLASS (mode) == MODE_INT)
10702 {
10703 /* SUB(S). */
10704 *cost += extra_cost->alu.arith;
10705 }
10706 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10707 {
10708 /* FSUB. */
10709 *cost += extra_cost->fp[mode == DFmode].addsub;
10710 }
10711 }
10712 return true;
10713 }
10714
10715 case PLUS:
10716 {
10717 rtx new_op0;
10718
10719 op0 = XEXP (x, 0);
10720 op1 = XEXP (x, 1);
10721
10722 cost_plus:
10723 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10724 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10725 {
10726 /* CSINC. */
10727 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10728 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10729 return true;
10730 }
10731
10732 if (GET_MODE_CLASS (mode) == MODE_INT
10733 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10734 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10735 {
10736 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10737
10738 if (speed)
10739 /* ADD (immediate). */
10740 *cost += extra_cost->alu.arith;
10741 return true;
10742 }
10743
10744 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10745
10746 /* Look for ADD (extended register). */
10747 if (is_a <scalar_int_mode> (mode, &int_mode)
10748 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10749 {
10750 if (speed)
10751 *cost += extra_cost->alu.extend_arith;
10752
10753 op0 = aarch64_strip_extend (op0, true);
10754 *cost += rtx_cost (op0, VOIDmode,
10755 (enum rtx_code) GET_CODE (op0), 0, speed);
10756 return true;
10757 }
10758
10759 /* Strip any extend, leave shifts behind as we will
10760 cost them through mult_cost. */
10761 new_op0 = aarch64_strip_extend (op0, false);
10762
10763 if (GET_CODE (new_op0) == MULT
10764 || aarch64_shift_p (GET_CODE (new_op0)))
10765 {
10766 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10767 speed);
10768 return true;
10769 }
10770
10771 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10772
10773 if (speed)
10774 {
10775 if (VECTOR_MODE_P (mode))
10776 {
10777 /* Vector ADD. */
10778 *cost += extra_cost->vect.alu;
10779 }
10780 else if (GET_MODE_CLASS (mode) == MODE_INT)
10781 {
10782 /* ADD. */
10783 *cost += extra_cost->alu.arith;
10784 }
10785 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10786 {
10787 /* FADD. */
10788 *cost += extra_cost->fp[mode == DFmode].addsub;
10789 }
10790 }
10791 return true;
10792 }
10793
10794 case BSWAP:
10795 *cost = COSTS_N_INSNS (1);
10796
10797 if (speed)
10798 {
10799 if (VECTOR_MODE_P (mode))
10800 *cost += extra_cost->vect.alu;
10801 else
10802 *cost += extra_cost->alu.rev;
10803 }
10804 return false;
10805
10806 case IOR:
10807 if (aarch_rev16_p (x))
10808 {
10809 *cost = COSTS_N_INSNS (1);
10810
10811 if (speed)
10812 {
10813 if (VECTOR_MODE_P (mode))
10814 *cost += extra_cost->vect.alu;
10815 else
10816 *cost += extra_cost->alu.rev;
10817 }
10818 return true;
10819 }
10820
10821 if (aarch64_extr_rtx_p (x, &op0, &op1))
10822 {
10823 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10824 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10825 if (speed)
10826 *cost += extra_cost->alu.shift;
10827
10828 return true;
10829 }
10830 /* Fall through. */
10831 case XOR:
10832 case AND:
10833 cost_logic:
10834 op0 = XEXP (x, 0);
10835 op1 = XEXP (x, 1);
10836
10837 if (VECTOR_MODE_P (mode))
10838 {
10839 if (speed)
10840 *cost += extra_cost->vect.alu;
10841 return true;
10842 }
10843
10844 if (code == AND
10845 && GET_CODE (op0) == MULT
10846 && CONST_INT_P (XEXP (op0, 1))
10847 && CONST_INT_P (op1)
10848 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10849 INTVAL (op1)) != 0)
10850 {
10851 /* This is a UBFM/SBFM. */
10852 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10853 if (speed)
10854 *cost += extra_cost->alu.bfx;
10855 return true;
10856 }
10857
10858 if (is_int_mode (mode, &int_mode))
10859 {
10860 if (CONST_INT_P (op1))
10861 {
10862 /* We have a mask + shift version of a UBFIZ
10863 i.e. the *andim_ashift<mode>_bfiz pattern. */
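	      /* For example (operands purely illustrative), the combination
		   (and:SI (ashift:SI (reg:SI w1) (const_int 4))
			   (const_int 0xff0))
		 would be expected to match that pattern and become a single
		 "ubfiz w0, w1, #4, #8".  */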
10864 if (GET_CODE (op0) == ASHIFT
10865 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10866 XEXP (op0, 1)))
10867 {
10868 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10869 (enum rtx_code) code, 0, speed);
10870 if (speed)
10871 *cost += extra_cost->alu.bfx;
10872
10873 return true;
10874 }
10875 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10876 {
10877 /* We possibly get the immediate for free; this is not
10878 modelled. */
10879 *cost += rtx_cost (op0, int_mode,
10880 (enum rtx_code) code, 0, speed);
10881 if (speed)
10882 *cost += extra_cost->alu.logical;
10883
10884 return true;
10885 }
10886 }
10887 else
10888 {
10889 rtx new_op0 = op0;
10890
10891 /* Handle ORN, EON, or BIC. */
10892 if (GET_CODE (op0) == NOT)
10893 op0 = XEXP (op0, 0);
10894
10895 new_op0 = aarch64_strip_shift (op0);
10896
10897 /* If we had a shift on op0 then this is a logical-shift-
10898 by-register/immediate operation. Otherwise, this is just
10899 a logical operation. */
10900 if (speed)
10901 {
10902 if (new_op0 != op0)
10903 {
10904 /* Shift by immediate. */
10905 if (CONST_INT_P (XEXP (op0, 1)))
10906 *cost += extra_cost->alu.log_shift;
10907 else
10908 *cost += extra_cost->alu.log_shift_reg;
10909 }
10910 else
10911 *cost += extra_cost->alu.logical;
10912 }
10913
10914 /* In both cases we want to cost both operands. */
10915 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10916 0, speed);
10917 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10918 1, speed);
10919
10920 return true;
10921 }
10922 }
10923 return false;
10924
10925 case NOT:
10926 x = XEXP (x, 0);
10927 op0 = aarch64_strip_shift (x);
10928
10929 if (VECTOR_MODE_P (mode))
10930 {
10931 /* Vector NOT. */
10932 *cost += extra_cost->vect.alu;
10933 return false;
10934 }
10935
10936 /* MVN-shifted-reg. */
10937 if (op0 != x)
10938 {
10939 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10940
10941 if (speed)
10942 *cost += extra_cost->alu.log_shift;
10943
10944 return true;
10945 }
10946 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10947 Handle the second form here taking care that 'a' in the above can
10948 be a shift. */
10949 else if (GET_CODE (op0) == XOR)
10950 {
10951 rtx newop0 = XEXP (op0, 0);
10952 rtx newop1 = XEXP (op0, 1);
10953 rtx op0_stripped = aarch64_strip_shift (newop0);
10954
10955 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10956 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10957
10958 if (speed)
10959 {
10960 if (op0_stripped != newop0)
10961 *cost += extra_cost->alu.log_shift;
10962 else
10963 *cost += extra_cost->alu.logical;
10964 }
10965
10966 return true;
10967 }
10968 /* MVN. */
10969 if (speed)
10970 *cost += extra_cost->alu.logical;
10971
10972 return false;
10973
10974 case ZERO_EXTEND:
10975
10976 op0 = XEXP (x, 0);
10977 /* If a value is written in SI mode, then zero extended to DI
10978 mode, the operation will in general be free as a write to
10979 a 'w' register implicitly zeroes the upper bits of an 'x'
10980 register. However, if this is
10981
10982 (set (reg) (zero_extend (reg)))
10983
10984 we must cost the explicit register move. */
10985 if (mode == DImode
10986 && GET_MODE (op0) == SImode
10987 && outer == SET)
10988 {
10989 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10990
10991 /* If OP_COST is non-zero, then the cost of the zero extend
10992 is effectively the cost of the inner operation. Otherwise
10993 we have a MOV instruction and we take the cost from the MOV
10994 itself. This is true independently of whether we are
10995 optimizing for space or time. */
10996 if (op_cost)
10997 *cost = op_cost;
10998
10999 return true;
11000 }
11001 else if (MEM_P (op0))
11002 {
11003 /* All loads can zero extend to any size for free. */
11004 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11005 return true;
11006 }
11007
11008 op0 = aarch64_extend_bitfield_pattern_p (x);
11009 if (op0)
11010 {
11011 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11012 if (speed)
11013 *cost += extra_cost->alu.bfx;
11014 return true;
11015 }
11016
11017 if (speed)
11018 {
11019 if (VECTOR_MODE_P (mode))
11020 {
11021 /* UMOV. */
11022 *cost += extra_cost->vect.alu;
11023 }
11024 else
11025 {
11026 /* We generate an AND instead of UXTB/UXTH. */
11027 *cost += extra_cost->alu.logical;
11028 }
11029 }
11030 return false;
11031
11032 case SIGN_EXTEND:
11033 if (MEM_P (XEXP (x, 0)))
11034 {
11035 /* LDRSH. */
11036 if (speed)
11037 {
11038 rtx address = XEXP (XEXP (x, 0), 0);
11039 *cost += extra_cost->ldst.load_sign_extend;
11040
11041 *cost +=
11042 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11043 0, speed));
11044 }
11045 return true;
11046 }
11047
11048 op0 = aarch64_extend_bitfield_pattern_p (x);
11049 if (op0)
11050 {
11051 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11052 if (speed)
11053 *cost += extra_cost->alu.bfx;
11054 return true;
11055 }
11056
11057 if (speed)
11058 {
11059 if (VECTOR_MODE_P (mode))
11060 *cost += extra_cost->vect.alu;
11061 else
11062 *cost += extra_cost->alu.extend;
11063 }
11064 return false;
11065
11066 case ASHIFT:
11067 op0 = XEXP (x, 0);
11068 op1 = XEXP (x, 1);
11069
11070 if (CONST_INT_P (op1))
11071 {
11072 if (speed)
11073 {
11074 if (VECTOR_MODE_P (mode))
11075 {
11076 /* Vector shift (immediate). */
11077 *cost += extra_cost->vect.alu;
11078 }
11079 else
11080 {
11081 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11082 aliases. */
11083 *cost += extra_cost->alu.shift;
11084 }
11085 }
11086
11087 /* We can incorporate zero/sign extend for free. */
11088 if (GET_CODE (op0) == ZERO_EXTEND
11089 || GET_CODE (op0) == SIGN_EXTEND)
11090 op0 = XEXP (op0, 0);
11091
11092 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11093 return true;
11094 }
11095 else
11096 {
11097 if (VECTOR_MODE_P (mode))
11098 {
11099 if (speed)
11100 /* Vector shift (register). */
11101 *cost += extra_cost->vect.alu;
11102 }
11103 else
11104 {
11105 if (speed)
11106 /* LSLV. */
11107 *cost += extra_cost->alu.shift_reg;
11108
11109 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11110 && CONST_INT_P (XEXP (op1, 1))
11111 && known_eq (INTVAL (XEXP (op1, 1)),
11112 GET_MODE_BITSIZE (mode) - 1))
11113 {
11114 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11115 /* We already demanded XEXP (op1, 0) to be REG_P, so
11116 don't recurse into it. */
11117 return true;
11118 }
11119 }
11120 return false; /* All arguments need to be in registers. */
11121 }
11122
11123 case ROTATE:
11124 case ROTATERT:
11125 case LSHIFTRT:
11126 case ASHIFTRT:
11127 op0 = XEXP (x, 0);
11128 op1 = XEXP (x, 1);
11129
11130 if (CONST_INT_P (op1))
11131 {
11132 /* ASR (immediate) and friends. */
11133 if (speed)
11134 {
11135 if (VECTOR_MODE_P (mode))
11136 *cost += extra_cost->vect.alu;
11137 else
11138 *cost += extra_cost->alu.shift;
11139 }
11140
11141 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11142 return true;
11143 }
11144 else
11145 {
11146 if (VECTOR_MODE_P (mode))
11147 {
11148 if (speed)
11149 /* Vector shift (register). */
11150 *cost += extra_cost->vect.alu;
11151 }
11152 else
11153 {
11154 if (speed)
11155 /* ASR (register) and friends. */
11156 *cost += extra_cost->alu.shift_reg;
11157
11158 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11159 && CONST_INT_P (XEXP (op1, 1))
11160 && known_eq (INTVAL (XEXP (op1, 1)),
11161 GET_MODE_BITSIZE (mode) - 1))
11162 {
11163 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11164 /* We already demanded XEXP (op1, 0) to be REG_P, so
11165 don't recurse into it. */
11166 return true;
11167 }
11168 }
11169 return false; /* All arguments need to be in registers. */
11170 }
11171
11172 case SYMBOL_REF:
11173
11174 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11175 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11176 {
11177 /* LDR. */
11178 if (speed)
11179 *cost += extra_cost->ldst.load;
11180 }
11181 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11182 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11183 {
11184 /* ADRP, followed by ADD. */
11185 *cost += COSTS_N_INSNS (1);
11186 if (speed)
11187 *cost += 2 * extra_cost->alu.arith;
11188 }
11189 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11190 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11191 {
11192 /* ADR. */
11193 if (speed)
11194 *cost += extra_cost->alu.arith;
11195 }
11196
11197 if (flag_pic)
11198 {
11199 /* One extra load instruction, after accessing the GOT. */
11200 *cost += COSTS_N_INSNS (1);
11201 if (speed)
11202 *cost += extra_cost->ldst.load;
11203 }
11204 return true;
11205
11206 case HIGH:
11207 case LO_SUM:
11208 /* ADRP/ADD (immediate). */
11209 if (speed)
11210 *cost += extra_cost->alu.arith;
11211 return true;
11212
11213 case ZERO_EXTRACT:
11214 case SIGN_EXTRACT:
11215 /* UBFX/SBFX. */
11216 if (speed)
11217 {
11218 if (VECTOR_MODE_P (mode))
11219 *cost += extra_cost->vect.alu;
11220 else
11221 *cost += extra_cost->alu.bfx;
11222 }
11223
11224 /* We can trust that the immediates used will be correct (there
11225 are no by-register forms), so we need only cost op0. */
11226 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11227 return true;
11228
11229 case MULT:
11230 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11231 /* aarch64_rtx_mult_cost always handles recursion to its
11232 operands. */
11233 return true;
11234
11235 case MOD:
11236 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11237 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
11238 that of an unconditional negate. This case should only ever be reached
11239 through the set_smod_pow2_cheap check in expmed.c. */
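	  /* A sketch of the expected expansion for SImode "x % 4", with
	     register and condition choices purely illustrative:
		negs	w1, w0
		and	w0, w0, 3
		and	w1, w1, 3
		csneg	w0, w0, w1, mi
	     i.e. two logical and two arithmetic instructions, which is what
	     is costed below.  */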
11240 if (CONST_INT_P (XEXP (x, 1))
11241 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11242 && (mode == SImode || mode == DImode))
11243 {
11244 /* We expand to 4 instructions. Reset the baseline. */
11245 *cost = COSTS_N_INSNS (4);
11246
11247 if (speed)
11248 *cost += 2 * extra_cost->alu.logical
11249 + 2 * extra_cost->alu.arith;
11250
11251 return true;
11252 }
11253
11254 /* Fall-through. */
11255 case UMOD:
11256 if (speed)
11257 {
11258 /* Slightly prefer UMOD over SMOD. */
11259 if (VECTOR_MODE_P (mode))
11260 *cost += extra_cost->vect.alu;
11261 else if (GET_MODE_CLASS (mode) == MODE_INT)
11262 *cost += (extra_cost->mult[mode == DImode].add
11263 + extra_cost->mult[mode == DImode].idiv
11264 + (code == MOD ? 1 : 0));
11265 }
11266 return false; /* All arguments need to be in registers. */
11267
11268 case DIV:
11269 case UDIV:
11270 case SQRT:
11271 if (speed)
11272 {
11273 if (VECTOR_MODE_P (mode))
11274 *cost += extra_cost->vect.alu;
11275 else if (GET_MODE_CLASS (mode) == MODE_INT)
11276 /* There is no integer SQRT, so only DIV and UDIV can get
11277 here. */
11278 *cost += (extra_cost->mult[mode == DImode].idiv
11279 /* Slightly prefer UDIV over SDIV. */
11280 + (code == DIV ? 1 : 0));
11281 else
11282 *cost += extra_cost->fp[mode == DFmode].div;
11283 }
11284 return false; /* All arguments need to be in registers. */
11285
11286 case IF_THEN_ELSE:
11287 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11288 XEXP (x, 2), cost, speed);
11289
11290 case EQ:
11291 case NE:
11292 case GT:
11293 case GTU:
11294 case LT:
11295 case LTU:
11296 case GE:
11297 case GEU:
11298 case LE:
11299 case LEU:
11300
11301 return false; /* All arguments must be in registers. */
11302
11303 case FMA:
11304 op0 = XEXP (x, 0);
11305 op1 = XEXP (x, 1);
11306 op2 = XEXP (x, 2);
11307
11308 if (speed)
11309 {
11310 if (VECTOR_MODE_P (mode))
11311 *cost += extra_cost->vect.alu;
11312 else
11313 *cost += extra_cost->fp[mode == DFmode].fma;
11314 }
11315
11316 /* FMSUB, FNMADD, and FNMSUB are free. */
11317 if (GET_CODE (op0) == NEG)
11318 op0 = XEXP (op0, 0);
11319
11320 if (GET_CODE (op2) == NEG)
11321 op2 = XEXP (op2, 0);
11322
11323 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11324 and the by-element operand as operand 0. */
11325 if (GET_CODE (op1) == NEG)
11326 op1 = XEXP (op1, 0);
11327
11328 /* Catch vector-by-element operations. The by-element operand can
11329 either be (vec_duplicate (vec_select (x))) or just
11330 (vec_select (x)), depending on whether we are multiplying by
11331 a vector or a scalar.
11332
11333 Canonicalization is not very good in these cases: FMA4 will put the
11334 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11335 if (GET_CODE (op0) == VEC_DUPLICATE)
11336 op0 = XEXP (op0, 0);
11337 else if (GET_CODE (op1) == VEC_DUPLICATE)
11338 op1 = XEXP (op1, 0);
11339
11340 if (GET_CODE (op0) == VEC_SELECT)
11341 op0 = XEXP (op0, 0);
11342 else if (GET_CODE (op1) == VEC_SELECT)
11343 op1 = XEXP (op1, 0);
11344
11345 /* If the remaining parameters are not registers,
11346 get the cost to put them into registers. */
11347 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11348 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11349 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11350 return true;
11351
11352 case FLOAT:
11353 case UNSIGNED_FLOAT:
11354 if (speed)
11355 *cost += extra_cost->fp[mode == DFmode].fromint;
11356 return false;
11357
11358 case FLOAT_EXTEND:
11359 if (speed)
11360 {
11361 if (VECTOR_MODE_P (mode))
11362 {
11363 /* Vector widening conversion. */
11364 *cost += extra_cost->vect.alu;
11365 }
11366 else
11367 *cost += extra_cost->fp[mode == DFmode].widen;
11368 }
11369 return false;
11370
11371 case FLOAT_TRUNCATE:
11372 if (speed)
11373 {
11374 if (VECTOR_MODE_P (mode))
11375 {
11376 /* Vector conversion. */
11377 *cost += extra_cost->vect.alu;
11378 }
11379 else
11380 *cost += extra_cost->fp[mode == DFmode].narrow;
11381 }
11382 return false;
11383
11384 case FIX:
11385 case UNSIGNED_FIX:
11386 x = XEXP (x, 0);
11387 /* Strip the rounding part. They will all be implemented
11388 by the fcvt* family of instructions anyway. */
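	  /* For example (operands purely illustrative),
	     (fix:SI (unspec:SF [(reg:SF s0)] UNSPEC_FRINTM)) is expected to
	     be implemented as a single "fcvtms w0, s0", so the rounding
	     UNSPEC itself adds no extra cost here.  */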
11389 if (GET_CODE (x) == UNSPEC)
11390 {
11391 unsigned int uns_code = XINT (x, 1);
11392
11393 if (uns_code == UNSPEC_FRINTA
11394 || uns_code == UNSPEC_FRINTM
11395 || uns_code == UNSPEC_FRINTN
11396 || uns_code == UNSPEC_FRINTP
11397 || uns_code == UNSPEC_FRINTZ)
11398 x = XVECEXP (x, 0, 0);
11399 }
11400
11401 if (speed)
11402 {
11403 if (VECTOR_MODE_P (mode))
11404 *cost += extra_cost->vect.alu;
11405 else
11406 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11407 }
11408
11409 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11410 fixed-point fcvt. */
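	  /* For example (operands purely illustrative),
	     (fix:SI (mult:SF (reg:SF s0) (const_double 16.0))) can become a
	     single fixed-point convert such as "fcvtzs w0, s0, #4", since the
	     multiply by 2^4 is folded into the conversion.  */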
11411 if (GET_CODE (x) == MULT
11412 && ((VECTOR_MODE_P (mode)
11413 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11414 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11415 {
11416 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11417 0, speed);
11418 return true;
11419 }
11420
11421 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11422 return true;
11423
11424 case ABS:
11425 if (VECTOR_MODE_P (mode))
11426 {
11427 /* ABS (vector). */
11428 if (speed)
11429 *cost += extra_cost->vect.alu;
11430 }
11431 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11432 {
11433 op0 = XEXP (x, 0);
11434
11435 /* FABD, which is analogous to FADD. */
11436 if (GET_CODE (op0) == MINUS)
11437 {
11438 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11439 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11440 if (speed)
11441 *cost += extra_cost->fp[mode == DFmode].addsub;
11442
11443 return true;
11444 }
11445 /* Simple FABS is analogous to FNEG. */
11446 if (speed)
11447 *cost += extra_cost->fp[mode == DFmode].neg;
11448 }
11449 else
11450 {
11451 /* Integer ABS will either be split into
11452 two arithmetic instructions, or will be an ABS
11453 (scalar), which we don't model. */
11454 *cost = COSTS_N_INSNS (2);
11455 if (speed)
11456 *cost += 2 * extra_cost->alu.arith;
11457 }
11458 return false;
11459
11460 case SMAX:
11461 case SMIN:
11462 if (speed)
11463 {
11464 if (VECTOR_MODE_P (mode))
11465 *cost += extra_cost->vect.alu;
11466 else
11467 {
11468 /* FMAXNM/FMINNM/FMAX/FMIN.
11469 TODO: This may not be accurate for all implementations, but
11470 we do not model this in the cost tables. */
11471 *cost += extra_cost->fp[mode == DFmode].addsub;
11472 }
11473 }
11474 return false;
11475
11476 case UNSPEC:
11477 /* The floating point round to integer frint* instructions. */
11478 if (aarch64_frint_unspec_p (XINT (x, 1)))
11479 {
11480 if (speed)
11481 *cost += extra_cost->fp[mode == DFmode].roundint;
11482
11483 return false;
11484 }
11485
11486 if (XINT (x, 1) == UNSPEC_RBIT)
11487 {
11488 if (speed)
11489 *cost += extra_cost->alu.rev;
11490
11491 return false;
11492 }
11493 break;
11494
11495 case TRUNCATE:
11496
11497 /* Decompose <su>muldi3_highpart. */
11498 if (/* (truncate:DI */
11499 mode == DImode
11500 /* (lshiftrt:TI */
11501 && GET_MODE (XEXP (x, 0)) == TImode
11502 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11503 /* (mult:TI */
11504 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11505 /* (ANY_EXTEND:TI (reg:DI))
11506 (ANY_EXTEND:TI (reg:DI))) */
11507 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11508 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11509 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11510 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11511 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11512 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11513 /* (const_int 64) */
11514 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11515 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11516 {
11517 /* UMULH/SMULH. */
11518 if (speed)
11519 *cost += extra_cost->mult[mode == DImode].extend;
11520 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11521 mode, MULT, 0, speed);
11522 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11523 mode, MULT, 1, speed);
11524 return true;
11525 }
11526
11527 /* Fall through. */
11528 default:
11529 break;
11530 }
11531
11532 if (dump_file
11533 && flag_aarch64_verbose_cost)
11534 fprintf (dump_file,
11535 "\nFailed to cost RTX. Assuming default cost.\n");
11536
11537 return true;
11538 }
11539
11540 /* Wrapper around aarch64_rtx_costs. Dumps the partial or total cost
11541 calculated for X. This cost is stored in *COST. Returns true
11542 if the total cost of X was calculated. */
11543 static bool
11544 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11545 int param, int *cost, bool speed)
11546 {
11547 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11548
11549 if (dump_file
11550 && flag_aarch64_verbose_cost)
11551 {
11552 print_rtl_single (dump_file, x);
11553 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11554 speed ? "Hot" : "Cold",
11555 *cost, result ? "final" : "partial");
11556 }
11557
11558 return result;
11559 }
11560
11561 static int
11562 aarch64_register_move_cost (machine_mode mode,
11563 reg_class_t from_i, reg_class_t to_i)
11564 {
11565 enum reg_class from = (enum reg_class) from_i;
11566 enum reg_class to = (enum reg_class) to_i;
11567 const struct cpu_regmove_cost *regmove_cost
11568 = aarch64_tune_params.regmove_cost;
11569
11570 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11571 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11572 to = GENERAL_REGS;
11573
11574 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11575 from = GENERAL_REGS;
11576
11577 /* Moving between a GPR and the stack costs the same as GP2GP. */
11578 if ((from == GENERAL_REGS && to == STACK_REG)
11579 || (to == GENERAL_REGS && from == STACK_REG))
11580 return regmove_cost->GP2GP;
11581
11582 /* To/From the stack register, we move via the gprs. */
11583 if (to == STACK_REG || from == STACK_REG)
11584 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11585 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11586
11587 if (known_eq (GET_MODE_SIZE (mode), 16))
11588 {
11589 /* 128-bit operations on general registers require 2 instructions. */
11590 if (from == GENERAL_REGS && to == GENERAL_REGS)
11591 return regmove_cost->GP2GP * 2;
11592 else if (from == GENERAL_REGS)
11593 return regmove_cost->GP2FP * 2;
11594 else if (to == GENERAL_REGS)
11595 return regmove_cost->FP2GP * 2;
11596
11597 /* When AdvSIMD instructions are disabled it is not possible to move
11598 a 128-bit value directly between Q registers. This is handled in
11599 secondary reload. A general register is used as a scratch to move
11600 the upper DI value and the lower DI value is moved directly,
11601 hence the cost is the sum of three moves. */
11602 if (! TARGET_SIMD)
11603 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11604
11605 return regmove_cost->FP2FP;
11606 }
11607
11608 if (from == GENERAL_REGS && to == GENERAL_REGS)
11609 return regmove_cost->GP2GP;
11610 else if (from == GENERAL_REGS)
11611 return regmove_cost->GP2FP;
11612 else if (to == GENERAL_REGS)
11613 return regmove_cost->FP2GP;
11614
11615 return regmove_cost->FP2FP;
11616 }
11617
11618 static int
11619 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11620 reg_class_t rclass ATTRIBUTE_UNUSED,
11621 bool in ATTRIBUTE_UNUSED)
11622 {
11623 return aarch64_tune_params.memmov_cost;
11624 }
11625
11626 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11627 to optimize 1.0/sqrt. */
11628
11629 static bool
11630 use_rsqrt_p (machine_mode mode)
11631 {
11632 return (!flag_trapping_math
11633 && flag_unsafe_math_optimizations
11634 && ((aarch64_tune_params.approx_modes->recip_sqrt
11635 & AARCH64_APPROX_MODE (mode))
11636 || flag_mrecip_low_precision_sqrt));
11637 }
11638
11639 /* Function to decide when to use the approximate reciprocal square root
11640 builtin. */
11641
11642 static tree
11643 aarch64_builtin_reciprocal (tree fndecl)
11644 {
11645 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11646
11647 if (!use_rsqrt_p (mode))
11648 return NULL_TREE;
11649 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11650 }
11651
11652 /* Emit instruction sequence to compute either the approximate square root
11653 or its approximate reciprocal, depending on the flag RECP, and return
11654 whether the sequence was emitted or not. */
11655
11656 bool
11657 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11658 {
11659 machine_mode mode = GET_MODE (dst);
11660
11661 if (GET_MODE_INNER (mode) == HFmode)
11662 {
11663 gcc_assert (!recp);
11664 return false;
11665 }
11666
11667 if (!recp)
11668 {
11669 if (!(flag_mlow_precision_sqrt
11670 || (aarch64_tune_params.approx_modes->sqrt
11671 & AARCH64_APPROX_MODE (mode))))
11672 return false;
11673
11674 if (flag_finite_math_only
11675 || flag_trapping_math
11676 || !flag_unsafe_math_optimizations
11677 || optimize_function_for_size_p (cfun))
11678 return false;
11679 }
11680 else
11681 /* Caller assumes we cannot fail. */
11682 gcc_assert (use_rsqrt_p (mode));
11683
11684 machine_mode mmsk = mode_for_int_vector (mode).require ();
11685 rtx xmsk = gen_reg_rtx (mmsk);
11686 if (!recp)
11687 /* When calculating the approximate square root, compare the
11688 argument with 0.0 and create a mask. */
11689 emit_insn (gen_rtx_SET (xmsk,
11690 gen_rtx_NEG (mmsk,
11691 gen_rtx_EQ (mmsk, src,
11692 CONST0_RTX (mode)))));
11693
11694 /* Estimate the approximate reciprocal square root. */
11695 rtx xdst = gen_reg_rtx (mode);
11696 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11697
11698 /* Iterate over the series twice for SF and thrice for DF. */
11699 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11700
11701 /* Optionally iterate over the series once less for faster performance
11702 while sacrificing some accuracy. */
11703 if ((recp && flag_mrecip_low_precision_sqrt)
11704 || (!recp && flag_mlow_precision_sqrt))
11705 iterations--;
11706
11707 /* Iterate over the series to calculate the approximate reciprocal square
11708 root. */
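  /* Roughly, each pass of the loop below implements one Newton-Raphson step
     for 1/sqrt (src):

	x2   = xdst * xdst
	x1   = FRSQRTS (src, x2)	-- i.e. (3 - src * xdst^2) / 2
	xdst = xdst * x1

     so xdst converges towards 1/sqrt (src).  For the non-reciprocal case the
     result is later multiplied by src, using sqrt (src) == src / sqrt (src).  */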
11709 rtx x1 = gen_reg_rtx (mode);
11710 while (iterations--)
11711 {
11712 rtx x2 = gen_reg_rtx (mode);
11713 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11714
11715 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11716
11717 if (iterations > 0)
11718 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11719 }
11720
11721 if (!recp)
11722 {
11723 /* Qualify the approximate reciprocal square root when the argument is
11724 0.0 by squashing the intermediary result to 0.0. */
11725 rtx xtmp = gen_reg_rtx (mmsk);
11726 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11727 gen_rtx_SUBREG (mmsk, xdst, 0)));
11728 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11729
11730 /* Calculate the approximate square root. */
11731 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11732 }
11733
11734 /* Finalize the approximation. */
11735 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11736
11737 return true;
11738 }
11739
11740 /* Emit the instruction sequence to compute the approximation for the division
11741 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11742
11743 bool
11744 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11745 {
11746 machine_mode mode = GET_MODE (quo);
11747
11748 if (GET_MODE_INNER (mode) == HFmode)
11749 return false;
11750
11751 bool use_approx_division_p = (flag_mlow_precision_div
11752 || (aarch64_tune_params.approx_modes->division
11753 & AARCH64_APPROX_MODE (mode)));
11754
11755 if (!flag_finite_math_only
11756 || flag_trapping_math
11757 || !flag_unsafe_math_optimizations
11758 || optimize_function_for_size_p (cfun)
11759 || !use_approx_division_p)
11760 return false;
11761
11762 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11763 return false;
11764
11765 /* Estimate the approximate reciprocal. */
11766 rtx xrcp = gen_reg_rtx (mode);
11767 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11768
11769 /* Iterate over the series twice for SF and thrice for DF. */
11770 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11771
11772 /* Optionally iterate over the series once less for faster performance,
11773 while sacrificing some accuracy. */
11774 if (flag_mlow_precision_div)
11775 iterations--;
11776
11777 /* Iterate over the series to calculate the approximate reciprocal. */
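  /* Roughly, each pass of the loop below implements one Newton-Raphson step
     for 1/den:

	xtmp = FRECPS (xrcp, den)	-- i.e. 2 - xrcp * den
	xrcp = xrcp * xtmp

     so xrcp converges towards 1/den, which is finally multiplied by NUM to
     approximate the quotient.  */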
11778 rtx xtmp = gen_reg_rtx (mode);
11779 while (iterations--)
11780 {
11781 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11782
11783 if (iterations > 0)
11784 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11785 }
11786
11787 if (num != CONST1_RTX (mode))
11788 {
11789 /* As the approximate reciprocal of DEN is already calculated, only
11790 calculate the approximate division when NUM is not 1.0. */
11791 rtx xnum = force_reg (mode, num);
11792 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11793 }
11794
11795 /* Finalize the approximation. */
11796 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11797 return true;
11798 }
11799
11800 /* Return the number of instructions that can be issued per cycle. */
11801 static int
11802 aarch64_sched_issue_rate (void)
11803 {
11804 return aarch64_tune_params.issue_rate;
11805 }
11806
11807 static int
11808 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11809 {
11810 int issue_rate = aarch64_sched_issue_rate ();
11811
11812 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11813 }
11814
11815
11816 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11817 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11818 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11819
11820 static int
11821 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11822 int ready_index)
11823 {
11824 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11825 }
11826
11827
11828 /* Vectorizer cost model target hooks. */
11829
11830 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11831 static int
11832 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11833 tree vectype,
11834 int misalign ATTRIBUTE_UNUSED)
11835 {
11836 unsigned elements;
11837 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11838 bool fp = false;
11839
11840 if (vectype != NULL)
11841 fp = FLOAT_TYPE_P (vectype);
11842
11843 switch (type_of_cost)
11844 {
11845 case scalar_stmt:
11846 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11847
11848 case scalar_load:
11849 return costs->scalar_load_cost;
11850
11851 case scalar_store:
11852 return costs->scalar_store_cost;
11853
11854 case vector_stmt:
11855 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11856
11857 case vector_load:
11858 return costs->vec_align_load_cost;
11859
11860 case vector_store:
11861 return costs->vec_store_cost;
11862
11863 case vec_to_scalar:
11864 return costs->vec_to_scalar_cost;
11865
11866 case scalar_to_vec:
11867 return costs->scalar_to_vec_cost;
11868
11869 case unaligned_load:
11870 case vector_gather_load:
11871 return costs->vec_unalign_load_cost;
11872
11873 case unaligned_store:
11874 case vector_scatter_store:
11875 return costs->vec_unalign_store_cost;
11876
11877 case cond_branch_taken:
11878 return costs->cond_taken_branch_cost;
11879
11880 case cond_branch_not_taken:
11881 return costs->cond_not_taken_branch_cost;
11882
11883 case vec_perm:
11884 return costs->vec_permute_cost;
11885
11886 case vec_promote_demote:
11887 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11888
11889 case vec_construct:
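      /* Rough heuristic: about one operation per pair of elements plus one,
	 e.g. a 4-element vector construction is costed as 4/2 + 1 = 3.  */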
11890 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11891 return elements / 2 + 1;
11892
11893 default:
11894 gcc_unreachable ();
11895 }
11896 }
11897
11898 /* Implement targetm.vectorize.add_stmt_cost. */
11899 static unsigned
11900 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11901 struct _stmt_vec_info *stmt_info, int misalign,
11902 enum vect_cost_model_location where)
11903 {
11904 unsigned *cost = (unsigned *) data;
11905 unsigned retval = 0;
11906
11907 if (flag_vect_cost_model)
11908 {
11909 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11910 int stmt_cost =
11911 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11912
11913 /* Statements in an inner loop relative to the loop being
11914 vectorized are weighted more heavily. The value here is
11915 arbitrary and could potentially be improved with analysis. */
11916 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11917 count *= 50; /* FIXME */
11918
11919 retval = (unsigned) (count * stmt_cost);
11920 cost[where] += retval;
11921 }
11922
11923 return retval;
11924 }
11925
11926 static void initialize_aarch64_code_model (struct gcc_options *);
11927
11928 /* Parse the TO_PARSE string and put the architecture struct that it
11929 selects into RES and the architectural features into ISA_FLAGS.
11930 Return an aarch64_parse_opt_result describing the parse result.
11931 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11932 When the TO_PARSE string contains an invalid extension,
11933 a copy of the string is created and stored to INVALID_EXTENSION. */
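/* For illustration, with a hypothetical TO_PARSE such as "armv8.2-a+fp16"
   the string is split at the first '+': "armv8.2-a" is looked up in
   all_architectures and "+fp16" is handed to aarch64_parse_extension.  */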
11934
11935 static enum aarch64_parse_opt_result
11936 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11937 uint64_t *isa_flags, std::string *invalid_extension)
11938 {
11939 const char *ext;
11940 const struct processor *arch;
11941 size_t len;
11942
11943 ext = strchr (to_parse, '+');
11944
11945 if (ext != NULL)
11946 len = ext - to_parse;
11947 else
11948 len = strlen (to_parse);
11949
11950 if (len == 0)
11951 return AARCH64_PARSE_MISSING_ARG;
11952
11953
11954 /* Loop through the list of supported ARCHes to find a match. */
11955 for (arch = all_architectures; arch->name != NULL; arch++)
11956 {
11957 if (strlen (arch->name) == len
11958 && strncmp (arch->name, to_parse, len) == 0)
11959 {
11960 uint64_t isa_temp = arch->flags;
11961
11962 if (ext != NULL)
11963 {
11964 /* TO_PARSE string contains at least one extension. */
11965 enum aarch64_parse_opt_result ext_res
11966 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11967
11968 if (ext_res != AARCH64_PARSE_OK)
11969 return ext_res;
11970 }
11971 /* Extension parsing was successful. Confirm the result
11972 arch and ISA flags. */
11973 *res = arch;
11974 *isa_flags = isa_temp;
11975 return AARCH64_PARSE_OK;
11976 }
11977 }
11978
11979 /* ARCH name not found in list. */
11980 return AARCH64_PARSE_INVALID_ARG;
11981 }
11982
11983 /* Parse the TO_PARSE string and put the CPU struct that it selects into
11984 RES and the ISA flags into ISA_FLAGS. Return an aarch64_parse_opt_result
11985 describing the parse result. If there is an error parsing, RES and
11986 ISA_FLAGS are left unchanged.
11987 When the TO_PARSE string contains an invalid extension,
11988 a copy of the string is created and stored to INVALID_EXTENSION. */
11989
11990 static enum aarch64_parse_opt_result
11991 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11992 uint64_t *isa_flags, std::string *invalid_extension)
11993 {
11994 const char *ext;
11995 const struct processor *cpu;
11996 size_t len;
11997
11998 ext = strchr (to_parse, '+');
11999
12000 if (ext != NULL)
12001 len = ext - to_parse;
12002 else
12003 len = strlen (to_parse);
12004
12005 if (len == 0)
12006 return AARCH64_PARSE_MISSING_ARG;
12007
12008
12009 /* Loop through the list of supported CPUs to find a match. */
12010 for (cpu = all_cores; cpu->name != NULL; cpu++)
12011 {
12012 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12013 {
12014 uint64_t isa_temp = cpu->flags;
12015
12016
12017 if (ext != NULL)
12018 {
12019 /* TO_PARSE string contains at least one extension. */
12020 enum aarch64_parse_opt_result ext_res
12021 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12022
12023 if (ext_res != AARCH64_PARSE_OK)
12024 return ext_res;
12025 }
12026 /* Extension parsing was successful. Confirm the result
12027 cpu and ISA flags. */
12028 *res = cpu;
12029 *isa_flags = isa_temp;
12030 return AARCH64_PARSE_OK;
12031 }
12032 }
12033
12034 /* CPU name not found in list. */
12035 return AARCH64_PARSE_INVALID_ARG;
12036 }
12037
12038 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12039 Return an aarch64_parse_opt_result describing the parse result.
12040 If the parsing fails the RES does not change. */
12041
12042 static enum aarch64_parse_opt_result
12043 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12044 {
12045 const struct processor *cpu;
12046
12047 /* Loop through the list of supported CPUs to find a match. */
12048 for (cpu = all_cores; cpu->name != NULL; cpu++)
12049 {
12050 if (strcmp (cpu->name, to_parse) == 0)
12051 {
12052 *res = cpu;
12053 return AARCH64_PARSE_OK;
12054 }
12055 }
12056
12057 /* CPU name not found in list. */
12058 return AARCH64_PARSE_INVALID_ARG;
12059 }
12060
12061 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12062 described in FLAG. If it is, return the index bit for that fusion type.
12063 If not, error (printing OPTION_NAME) and return zero. */
12064
12065 static unsigned int
12066 aarch64_parse_one_option_token (const char *token,
12067 size_t length,
12068 const struct aarch64_flag_desc *flag,
12069 const char *option_name)
12070 {
12071 for (; flag->name != NULL; flag++)
12072 {
12073 if (length == strlen (flag->name)
12074 && !strncmp (flag->name, token, length))
12075 return flag->flag;
12076 }
12077
12078 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12079 return 0;
12080 }
12081
12082 /* Parse OPTION which is a comma-separated list of flags to enable.
12083 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12084 default state we inherit from the CPU tuning structures. OPTION_NAME
12085 gives the top-level option we are parsing in the -moverride string,
12086 for use in error messages. */
12087
12088 static unsigned int
12089 aarch64_parse_boolean_options (const char *option,
12090 const struct aarch64_flag_desc *flags,
12091 unsigned int initial_state,
12092 const char *option_name)
12093 {
12094 const char separator = '.';
12095 const char* specs = option;
12096 const char* ntoken = option;
12097 unsigned int found_flags = initial_state;
12098
12099 while ((ntoken = strchr (specs, separator)))
12100 {
12101 size_t token_length = ntoken - specs;
12102 unsigned token_ops = aarch64_parse_one_option_token (specs,
12103 token_length,
12104 flags,
12105 option_name);
12106 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12107 in the token stream, reset the supported operations. So:
12108
12109 adrp+add.cmp+branch.none.adrp+add
12110
12111 would have the result of turning on only adrp+add fusion. */
12112 if (!token_ops)
12113 found_flags = 0;
12114
12115 found_flags |= token_ops;
12116 specs = ++ntoken;
12117 }
12118
12119 /* If the string ended with a trailing separator, report an error. */
12120 if (!(*specs))
12121 {
12122 error ("%s string ill-formed", option_name);
12123 return 0;
12124 }
12125
12126 /* We still have one more token to parse. */
12127 size_t token_length = strlen (specs);
12128 unsigned token_ops = aarch64_parse_one_option_token (specs,
12129 token_length,
12130 flags,
12131 option_name);
12132 if (!token_ops)
12133 found_flags = 0;
12134
12135 found_flags |= token_ops;
12136 return found_flags;
12137 }
12138
12139 /* Support for overriding instruction fusion. */
12140
12141 static void
12142 aarch64_parse_fuse_string (const char *fuse_string,
12143 struct tune_params *tune)
12144 {
12145 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12146 aarch64_fusible_pairs,
12147 tune->fusible_ops,
12148 "fuse=");
12149 }
12150
12151 /* Support for overriding other tuning flags. */
12152
12153 static void
12154 aarch64_parse_tune_string (const char *tune_string,
12155 struct tune_params *tune)
12156 {
12157 tune->extra_tuning_flags
12158 = aarch64_parse_boolean_options (tune_string,
12159 aarch64_tuning_flags,
12160 tune->extra_tuning_flags,
12161 "tune=");
12162 }
12163
12164 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12165 Accept the valid SVE vector widths allowed by
12166 aarch64_sve_vector_bits_enum and use it to override sve_width
12167 in TUNE. */
12168
12169 static void
12170 aarch64_parse_sve_width_string (const char *tune_string,
12171 struct tune_params *tune)
12172 {
12173 int width = -1;
12174
12175 int n = sscanf (tune_string, "%d", &width);
12176 if (n == EOF)
12177 {
12178 error ("invalid format for sve_width");
12179 return;
12180 }
12181 switch (width)
12182 {
12183 case SVE_128:
12184 case SVE_256:
12185 case SVE_512:
12186 case SVE_1024:
12187 case SVE_2048:
12188 break;
12189 default:
12190 error ("invalid sve_width value: %d", width);
12191 }
12192 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12193 }
12194
12195 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12196 we understand. If it is, extract the option string and hand it off to
12197 the appropriate function. */
12198
12199 void
12200 aarch64_parse_one_override_token (const char* token,
12201 size_t length,
12202 struct tune_params *tune)
12203 {
12204 const struct aarch64_tuning_override_function *fn
12205 = aarch64_tuning_override_functions;
12206
12207 const char *option_part = strchr (token, '=');
12208 if (!option_part)
12209 {
12210 error ("tuning string missing in option (%s)", token);
12211 return;
12212 }
12213
12214 /* Get the length of the option name. */
12215 length = option_part - token;
12216 /* Skip the '=' to get to the option string. */
12217 option_part++;
12218
12219 for (; fn->name != NULL; fn++)
12220 {
12221 if (!strncmp (fn->name, token, length))
12222 {
12223 fn->parse_override (option_part, tune);
12224 return;
12225 }
12226 }
12227
12228 error ("unknown tuning option (%s)", token);
12229 return;
12230 }
12231
12232 /* Validate and clamp the TLS size based on the code model in OPTS. */
12233
12234 static void
12235 initialize_aarch64_tls_size (struct gcc_options *opts)
12236 {
12237 if (aarch64_tls_size == 0)
12238 aarch64_tls_size = 24;
12239
12240 switch (opts->x_aarch64_cmodel_var)
12241 {
12242 case AARCH64_CMODEL_TINY:
12243 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12244 needs two instructions to address, so we clamp the size to 24 bits. */
12245 if (aarch64_tls_size > 24)
12246 aarch64_tls_size = 24;
12247 break;
12248 case AARCH64_CMODEL_SMALL:
12249 /* The maximum TLS size allowed under small is 4G. */
12250 if (aarch64_tls_size > 32)
12251 aarch64_tls_size = 32;
12252 break;
12253 case AARCH64_CMODEL_LARGE:
12254 /* The maximum TLS size allowed under large is 16E.
12255 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12256 if (aarch64_tls_size > 48)
12257 aarch64_tls_size = 48;
12258 break;
12259 default:
12260 gcc_unreachable ();
12261 }
12262
12263 return;
12264 }
12265
12266 /* Parse STRING looking for options in the format:
12267 string :: option:string
12268 option :: name=substring
12269 name :: {a-z}
12270 substring :: defined by option. */
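/* For illustration, a -moverride string such as
   "fuse=adrp+add.cmp+branch:sve_width=256" (values illustrative) is split at
   ':' into two options; "fuse=..." is then parsed as '.'-separated boolean
   flags and "sve_width=256" by aarch64_parse_sve_width_string.  */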
12271
12272 static void
12273 aarch64_parse_override_string (const char* input_string,
12274 struct tune_params* tune)
12275 {
12276 const char separator = ':';
12277 size_t string_length = strlen (input_string) + 1;
12278 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12279 char *string = string_root;
12280 strncpy (string, input_string, string_length);
12281 string[string_length - 1] = '\0';
12282
12283 char* ntoken = string;
12284
12285 while ((ntoken = strchr (string, separator)))
12286 {
12287 size_t token_length = ntoken - string;
12288 /* NUL-terminate this substring so it can be treated as a string. */
12289 *ntoken = '\0';
12290 aarch64_parse_one_override_token (string, token_length, tune);
12291 string = ++ntoken;
12292 }
12293
12294 /* One last option to parse. */
12295 aarch64_parse_one_override_token (string, strlen (string), tune);
12296 free (string_root);
12297 }
12298
12299
12300 static void
12301 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12302 {
12303 if (accepted_branch_protection_string)
12304 {
12305 opts->x_aarch64_branch_protection_string
12306 = xstrdup (accepted_branch_protection_string);
12307 }
12308
12309 /* PR 70044: We have to be careful about being called multiple times for the
12310 same function. This means all changes should be repeatable. */
12311
12312 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12313 Disable the frame pointer flag so the mid-end will not use a frame
12314 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12315 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12316 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12317 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12318 if (opts->x_flag_omit_frame_pointer == 0)
12319 opts->x_flag_omit_frame_pointer = 2;
12320
12321 /* If not optimizing for size, set the default
12322 alignment to what the target wants. */
12323 if (!opts->x_optimize_size)
12324 {
12325 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12326 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12327 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12328 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12329 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12330 opts->x_str_align_functions = aarch64_tune_params.function_align;
12331 }
12332
12333 /* We default to no pc-relative literal loads. */
12334
12335 aarch64_pcrelative_literal_loads = false;
12336
12337 /* If -mpc-relative-literal-loads is set on the command line, this
12338 implies that the user asked for PC relative literal loads. */
12339 if (opts->x_pcrelative_literal_loads == 1)
12340 aarch64_pcrelative_literal_loads = true;
12341
12342 /* In the tiny memory model it makes no sense to disallow PC relative
12343 literal pool loads. */
12344 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12345 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12346 aarch64_pcrelative_literal_loads = true;
12347
12348 /* When enabling the lower precision Newton series for the square root, also
12349 enable it for the reciprocal square root, since the latter is an
12350 intermediary step for the former. */
12351 if (flag_mlow_precision_sqrt)
12352 flag_mrecip_low_precision_sqrt = true;
12353 }
12354
12355 /* 'Unpack' the internal tuning structs and update the options
12356 in OPTS. The caller must have set up selected_tune and selected_arch
12357 as all the other target-specific codegen decisions are
12358 derived from them. */
12359
12360 void
12361 aarch64_override_options_internal (struct gcc_options *opts)
12362 {
12363 aarch64_tune_flags = selected_tune->flags;
12364 aarch64_tune = selected_tune->sched_core;
12365 /* Make a copy of the tuning parameters attached to the core, which
12366 we may later overwrite. */
12367 aarch64_tune_params = *(selected_tune->tune);
12368 aarch64_architecture_version = selected_arch->architecture_version;
12369
12370 if (opts->x_aarch64_override_tune_string)
12371 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12372 &aarch64_tune_params);
12373
12374 /* This target defaults to strict volatile bitfields. */
12375 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12376 opts->x_flag_strict_volatile_bitfields = 1;
12377
12378 if (aarch64_stack_protector_guard == SSP_GLOBAL
12379 && opts->x_aarch64_stack_protector_guard_offset_str)
12380 {
12381 error ("incompatible options %<-mstack-protector-guard=global%> and "
12382 "%<-mstack-protector-guard-offset=%s%>",
12383 aarch64_stack_protector_guard_offset_str);
12384 }
12385
12386 if (aarch64_stack_protector_guard == SSP_SYSREG
12387 && !(opts->x_aarch64_stack_protector_guard_offset_str
12388 && opts->x_aarch64_stack_protector_guard_reg_str))
12389 {
12390 error ("both %<-mstack-protector-guard-offset%> and "
12391 "%<-mstack-protector-guard-reg%> must be used "
12392 "with %<-mstack-protector-guard=sysreg%>");
12393 }
12394
12395 if (opts->x_aarch64_stack_protector_guard_reg_str)
12396 {
12397 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12398 error ("specify a system register with a small string length");
12399 }
12400
12401 if (opts->x_aarch64_stack_protector_guard_offset_str)
12402 {
12403 char *end;
12404 const char *str = aarch64_stack_protector_guard_offset_str;
12405 errno = 0;
12406 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12407 if (!*str || *end || errno)
12408 error ("%qs is not a valid offset in %qs", str,
12409 "-mstack-protector-guard-offset=");
12410 aarch64_stack_protector_guard_offset = offs;
12411 }
12412
12413 initialize_aarch64_code_model (opts);
12414 initialize_aarch64_tls_size (opts);
12415
12416 int queue_depth = 0;
12417 switch (aarch64_tune_params.autoprefetcher_model)
12418 {
12419 case tune_params::AUTOPREFETCHER_OFF:
12420 queue_depth = -1;
12421 break;
12422 case tune_params::AUTOPREFETCHER_WEAK:
12423 queue_depth = 0;
12424 break;
12425 case tune_params::AUTOPREFETCHER_STRONG:
12426 queue_depth = max_insn_queue_index + 1;
12427 break;
12428 default:
12429 gcc_unreachable ();
12430 }
12431
12432 /* We don't mind passing in global_options_set here as we don't use
12433 the *options_set structs anyway. */
12434 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12435 queue_depth,
12436 opts->x_param_values,
12437 global_options_set.x_param_values);
12438
12439 /* Set up parameters to be used in the prefetching algorithm. Do not
12440 override the defaults unless we are tuning for a core we have
12441 researched values for. */
12442 if (aarch64_tune_params.prefetch->num_slots > 0)
12443 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12444 aarch64_tune_params.prefetch->num_slots,
12445 opts->x_param_values,
12446 global_options_set.x_param_values);
12447 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12448 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12449 aarch64_tune_params.prefetch->l1_cache_size,
12450 opts->x_param_values,
12451 global_options_set.x_param_values);
12452 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12453 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12454 aarch64_tune_params.prefetch->l1_cache_line_size,
12455 opts->x_param_values,
12456 global_options_set.x_param_values);
12457 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12458 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12459 aarch64_tune_params.prefetch->l2_cache_size,
12460 opts->x_param_values,
12461 global_options_set.x_param_values);
12462 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12463 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12464 0,
12465 opts->x_param_values,
12466 global_options_set.x_param_values);
12467 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12468 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12469 aarch64_tune_params.prefetch->minimum_stride,
12470 opts->x_param_values,
12471 global_options_set.x_param_values);
12472
12473 /* Use the alternative scheduling-pressure algorithm by default. */
12474 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12475 opts->x_param_values,
12476 global_options_set.x_param_values);
12477
12478 /* If the user hasn't changed it via configure then set the default to 64 KB
12479 for the backend. */
12480 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12481 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12482 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12483 opts->x_param_values,
12484 global_options_set.x_param_values);
12485
12486 /* Validate the guard size. */
12487 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12488
12489 /* Enforce that the probing interval is the same size as the guard size
12490 so the mid-end does the right thing. */
12491 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12492 guard_size,
12493 opts->x_param_values,
12494 global_options_set.x_param_values);
12495
12496 /* The maybe_set calls won't update the value if the user has explicitly set
12497 one. Which means we need to validate that probing interval and guard size
12498 are equal. */
12499 int probe_interval
12500 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12501 if (guard_size != probe_interval)
12502 error ("stack clash guard size %<%d%> must be equal to probing interval "
12503 "%<%d%>", guard_size, probe_interval);
12504
12505 /* Enable software prefetching at the specified optimization level for
12506 CPUs that have prefetch tuning data, unless the user has requested
12507 otherwise or we are optimizing for size. */
12508 if (opts->x_flag_prefetch_loop_arrays < 0
12509 && !opts->x_optimize_size
12510 && aarch64_tune_params.prefetch->default_opt_level >= 0
12511 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12512 opts->x_flag_prefetch_loop_arrays = 1;
12513
12514 if (opts->x_aarch64_arch_string == NULL)
12515 opts->x_aarch64_arch_string = selected_arch->name;
12516 if (opts->x_aarch64_cpu_string == NULL)
12517 opts->x_aarch64_cpu_string = selected_cpu->name;
12518 if (opts->x_aarch64_tune_string == NULL)
12519 opts->x_aarch64_tune_string = selected_tune->name;
12520
12521 aarch64_override_options_after_change_1 (opts);
12522 }
12523
12524 /* Print a hint with a suggestion for a core or architecture name that
12525 most closely resembles what the user passed in STR. ARCH is true if
12526 the user is asking for an architecture name. ARCH is false if the user
12527 is asking for a core name. */
12528
12529 static void
12530 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12531 {
12532 auto_vec<const char *> candidates;
12533 const struct processor *entry = arch ? all_architectures : all_cores;
12534 for (; entry->name != NULL; entry++)
12535 candidates.safe_push (entry->name);
12536
12537 #ifdef HAVE_LOCAL_CPU_DETECT
12538 /* Add also "native" as possible value. */
12539 if (arch)
12540 candidates.safe_push ("native");
12541 #endif
12542
12543 char *s;
12544 const char *hint = candidates_list_and_hint (str, s, candidates);
12545 if (hint)
12546 inform (input_location, "valid arguments are: %s;"
12547 " did you mean %qs?", s, hint);
12548 else
12549 inform (input_location, "valid arguments are: %s", s);
12550
12551 XDELETEVEC (s);
12552 }
12553
12554 /* Print a hint with a suggestion for a core name that most closely resembles
12555 what the user passed in STR. */
12556
12557 inline static void
12558 aarch64_print_hint_for_core (const char *str)
12559 {
12560 aarch64_print_hint_for_core_or_arch (str, false);
12561 }
12562
12563 /* Print a hint with a suggestion for an architecture name that most closely
12564 resembles what the user passed in STR. */
12565
12566 inline static void
12567 aarch64_print_hint_for_arch (const char *str)
12568 {
12569 aarch64_print_hint_for_core_or_arch (str, true);
12570 }
12571
12572
12573 /* Print a hint with a suggestion for an extension name
12574 that most closely resembles what the user passed in STR. */
12575
12576 void
12577 aarch64_print_hint_for_extensions (const std::string &str)
12578 {
12579 auto_vec<const char *> candidates;
12580 aarch64_get_all_extension_candidates (&candidates);
12581 char *s;
12582 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12583 if (hint)
12584 inform (input_location, "valid arguments are: %s;"
12585 " did you mean %qs?", s, hint);
12586 else
12587 inform (input_location, "valid arguments are: %s", s);
12588
12589 XDELETEVEC (s);
12590 }
12591
12592 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12593 specified in STR and throw errors if appropriate. Put the results, if
12594 they are valid, in RES and ISA_FLAGS. Return whether the option is
12595 valid. */
12596
12597 static bool
12598 aarch64_validate_mcpu (const char *str, const struct processor **res,
12599 uint64_t *isa_flags)
12600 {
12601 std::string invalid_extension;
12602 enum aarch64_parse_opt_result parse_res
12603 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12604
12605 if (parse_res == AARCH64_PARSE_OK)
12606 return true;
12607
12608 switch (parse_res)
12609 {
12610 case AARCH64_PARSE_MISSING_ARG:
12611 error ("missing cpu name in %<-mcpu=%s%>", str);
12612 break;
12613 case AARCH64_PARSE_INVALID_ARG:
12614 error ("unknown value %qs for %<-mcpu%>", str);
12615 aarch64_print_hint_for_core (str);
12616 break;
12617 case AARCH64_PARSE_INVALID_FEATURE:
12618 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12619 invalid_extension.c_str (), str);
12620 aarch64_print_hint_for_extensions (invalid_extension);
12621 break;
12622 default:
12623 gcc_unreachable ();
12624 }
12625
12626 return false;
12627 }
12628
12629 /* Parses CONST_STR for branch protection features specified in
12630 aarch64_branch_protect_types, and set any global variables required. Returns
12631 the parsing result and assigns LAST_STR to the last processed token from
12632 CONST_STR so that it can be used for error reporting. */
12633
12634 static enum aarch64_parse_opt_result
12635 aarch64_parse_branch_protection (const char *const_str, char **last_str)
12637 {
12638 char *str_root = xstrdup (const_str);
12639 char* token_save = NULL;
12640 char *str = strtok_r (str_root, "+", &token_save);
12641 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12642 if (!str)
12643 res = AARCH64_PARSE_MISSING_ARG;
12644 else
12645 {
12646 char *next_str = strtok_r (NULL, "+", &token_save);
12647 /* Reset the branch protection features to their defaults. */
12648 aarch64_handle_no_branch_protection (NULL, NULL);
12649
12650 while (str && res == AARCH64_PARSE_OK)
12651 {
12652 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12653 bool found = false;
12654 /* Search for this type. */
12655 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12656 {
12657 if (strcmp (str, type->name) == 0)
12658 {
12659 found = true;
12660 res = type->handler (str, next_str);
12661 str = next_str;
12662 next_str = strtok_r (NULL, "+", &token_save);
12663 }
12664 else
12665 type++;
12666 }
12667 if (found && res == AARCH64_PARSE_OK)
12668 {
12669 bool found_subtype = true;
12670 /* Loop through each token until we find one that isn't a
12671 subtype. */
12672 while (found_subtype)
12673 {
12674 found_subtype = false;
12675 const aarch64_branch_protect_type *subtype = type->subtypes;
12676 /* Search for the subtype. */
12677 while (str && subtype && subtype->name && !found_subtype
12678 && res == AARCH64_PARSE_OK)
12679 {
12680 if (strcmp (str, subtype->name) == 0)
12681 {
12682 found_subtype = true;
12683 res = subtype->handler (str, next_str);
12684 str = next_str;
12685 next_str = strtok_r (NULL, "+", &token_save);
12686 }
12687 else
12688 subtype++;
12689 }
12690 }
12691 }
12692 else if (!found)
12693 res = AARCH64_PARSE_INVALID_ARG;
12694 }
12695 }
12696 /* Copy the last processed token into the argument to pass it back.
12697 Used by option and attribute validation to print the offending token. */
12698 if (last_str)
12699 {
12700 if (str) strcpy (*last_str, str);
12701 else *last_str = NULL;
12702 }
12703 if (res == AARCH64_PARSE_OK)
12704 {
12705 /* If needed, alloc the accepted string then copy in const_str.
12706 Used by override_option_after_change_1. */
12707 if (!accepted_branch_protection_string)
12708 accepted_branch_protection_string = (char *) xmalloc (
12709 BRANCH_PROTECT_STR_MAX
12710 + 1);
12711 strncpy (accepted_branch_protection_string, const_str,
12712 BRANCH_PROTECT_STR_MAX + 1);
12713 /* Forcibly null-terminate. */
12714 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12715 }
12716 return res;
12717 }
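/* Illustrative walk-through of the parser above, using the usual entries of
   aarch64_branch_protect_types:

     -mbranch-protection=pac-ret+leaf+bti

   splits on '+' into "pac-ret", "leaf" and "bti"; "pac-ret" matches a
   top-level type, "leaf" matches one of its subtypes, and "bti" is then
   matched as another top-level type.  An unknown token such as "foo" yields
   AARCH64_PARSE_INVALID_ARG with *LAST_STR set to "foo" for diagnostics.  */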
12718
12719 static bool
12720 aarch64_validate_mbranch_protection (const char *const_str)
12721 {
12722 char *str = (char *) xmalloc (strlen (const_str) + 1);
12723 enum aarch64_parse_opt_result res =
12724 aarch64_parse_branch_protection (const_str, &str);
12725 if (res == AARCH64_PARSE_INVALID_ARG)
12726 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12727 else if (res == AARCH64_PARSE_MISSING_ARG)
12728 error ("missing argument for %<-mbranch-protection=%>");
12729 free (str);
12730 return res == AARCH64_PARSE_OK;
12731 }
12732
12733 /* Validate a command-line -march option. Parse the arch and extensions
12734 (if any) specified in STR and throw errors if appropriate. Put the
12735 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12736 option is valid. */
12737
12738 static bool
12739 aarch64_validate_march (const char *str, const struct processor **res,
12740 uint64_t *isa_flags)
12741 {
12742 std::string invalid_extension;
12743 enum aarch64_parse_opt_result parse_res
12744 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12745
12746 if (parse_res == AARCH64_PARSE_OK)
12747 return true;
12748
12749 switch (parse_res)
12750 {
12751 case AARCH64_PARSE_MISSING_ARG:
12752 error ("missing arch name in %<-march=%s%>", str);
12753 break;
12754 case AARCH64_PARSE_INVALID_ARG:
12755 error ("unknown value %qs for %<-march%>", str);
12756 aarch64_print_hint_for_arch (str);
12757 break;
12758 case AARCH64_PARSE_INVALID_FEATURE:
12759 error ("invalid feature modifier %qs in %<-march=%s%>",
12760 invalid_extension.c_str (), str);
12761 aarch64_print_hint_for_extensions (invalid_extension);
12762 break;
12763 default:
12764 gcc_unreachable ();
12765 }
12766
12767 return false;
12768 }
12769
12770 /* Validate a command-line -mtune option. Parse the cpu
12771 specified in STR and throw errors if appropriate. Put the
12772 result, if it is valid, in RES. Return whether the option is
12773 valid. */
12774
12775 static bool
12776 aarch64_validate_mtune (const char *str, const struct processor **res)
12777 {
12778 enum aarch64_parse_opt_result parse_res
12779 = aarch64_parse_tune (str, res);
12780
12781 if (parse_res == AARCH64_PARSE_OK)
12782 return true;
12783
12784 switch (parse_res)
12785 {
12786 case AARCH64_PARSE_MISSING_ARG:
12787 error ("missing cpu name in %<-mtune=%s%>", str);
12788 break;
12789 case AARCH64_PARSE_INVALID_ARG:
12790 error ("unknown value %qs for %<-mtune%>", str);
12791 aarch64_print_hint_for_core (str);
12792 break;
12793 default:
12794 gcc_unreachable ();
12795 }
12796 return false;
12797 }
12798
12799 /* Return the CPU corresponding to the enum CPU.
12800 If it doesn't specify a cpu, return the default. */
12801
12802 static const struct processor *
12803 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12804 {
12805 if (cpu != aarch64_none)
12806 return &all_cores[cpu];
12807
12808 /* The & 0x3f is to extract the bottom 6 bits that encode the
12809 default cpu as selected by the --with-cpu GCC configure option
12810 in config.gcc.
12811 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12812 flags mechanism should be reworked to make it more sane. */
12813 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12814 }
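/* Sketch of the TARGET_CPU_DEFAULT encoding assumed here and in
   aarch64_override_options: conceptually

     TARGET_CPU_DEFAULT == (default_isa_flags << 6) | default_cpu_ident

   so "& 0x3f" recovers the configure-time cpu ident and ">> 6" recovers its
   default ISA flags.  */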
12815
12816 /* Return the architecture corresponding to the enum ARCH.
12817 If it doesn't specify a valid architecture, return the default. */
12818
12819 static const struct processor *
12820 aarch64_get_arch (enum aarch64_arch arch)
12821 {
12822 if (arch != aarch64_no_arch)
12823 return &all_architectures[arch];
12824
12825 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12826
12827 return &all_architectures[cpu->arch];
12828 }
12829
12830 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12831
12832 static poly_uint16
12833 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12834 {
12835 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12836 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12837 deciding which .md file patterns to use and when deciding whether
12838 something is a legitimate address or constant. */
12839 if (value == SVE_SCALABLE || value == SVE_128)
12840 return poly_uint16 (2, 2);
12841 else
12842 return (int) value / 64;
12843 }
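/* Worked examples of the conversion above (VG is the number of 64-bit
   granules in an SVE vector):

     -msve-vector-bits=scalable -> poly_uint16 (2, 2)  (length-agnostic)
     -msve-vector-bits=128      -> poly_uint16 (2, 2)  (see comment above)
     -msve-vector-bits=256      -> 4   (256 / 64)
     -msve-vector-bits=2048     -> 32  (2048 / 64)  */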
12844
12845 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12846 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12847 tuning structs. In particular it must set selected_tune and
12848 aarch64_isa_flags that define the available ISA features and tuning
12849 decisions. It must also set selected_arch as this will be used to
12850 output the .arch asm tags for each function. */
12851
12852 static void
12853 aarch64_override_options (void)
12854 {
12855 uint64_t cpu_isa = 0;
12856 uint64_t arch_isa = 0;
12857 aarch64_isa_flags = 0;
12858
12859 bool valid_cpu = true;
12860 bool valid_tune = true;
12861 bool valid_arch = true;
12862
12863 selected_cpu = NULL;
12864 selected_arch = NULL;
12865 selected_tune = NULL;
12866
12867 if (aarch64_branch_protection_string)
12868 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12869
12870 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12871 If either of -march or -mtune is given, they override their
12872 respective component of -mcpu. */
12873 if (aarch64_cpu_string)
12874 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12875 &cpu_isa);
12876
12877 if (aarch64_arch_string)
12878 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12879 &arch_isa);
12880
12881 if (aarch64_tune_string)
12882 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12883
12884 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12885 SUBTARGET_OVERRIDE_OPTIONS;
12886 #endif
12887
12888 /* If the user did not specify a processor, choose the default
12889 one for them. This will be the CPU set during configuration using
12890 --with-cpu, otherwise it is "generic". */
12891 if (!selected_cpu)
12892 {
12893 if (selected_arch)
12894 {
12895 selected_cpu = &all_cores[selected_arch->ident];
12896 aarch64_isa_flags = arch_isa;
12897 explicit_arch = selected_arch->arch;
12898 }
12899 else
12900 {
12901 /* Get default configure-time CPU. */
12902 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12903 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12904 }
12905
12906 if (selected_tune)
12907 explicit_tune_core = selected_tune->ident;
12908 }
12909 /* If both -mcpu and -march are specified check that they are architecturally
12910 compatible, warn if they're not and prefer the -march ISA flags. */
12911 else if (selected_arch)
12912 {
12913 if (selected_arch->arch != selected_cpu->arch)
12914 {
12915 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12916 all_architectures[selected_cpu->arch].name,
12917 selected_arch->name);
12918 }
12919 aarch64_isa_flags = arch_isa;
12920 explicit_arch = selected_arch->arch;
12921 explicit_tune_core = selected_tune ? selected_tune->ident
12922 : selected_cpu->ident;
12923 }
12924 else
12925 {
12926 /* -mcpu but no -march. */
12927 aarch64_isa_flags = cpu_isa;
12928 explicit_tune_core = selected_tune ? selected_tune->ident
12929 : selected_cpu->ident;
12930 gcc_assert (selected_cpu);
12931 selected_arch = &all_architectures[selected_cpu->arch];
12932 explicit_arch = selected_arch->arch;
12933 }
12934
12935 /* Set the arch as well, as we will need it when outputting
12936 the .arch directive in assembly. */
12937 if (!selected_arch)
12938 {
12939 gcc_assert (selected_cpu);
12940 selected_arch = &all_architectures[selected_cpu->arch];
12941 }
12942
12943 if (!selected_tune)
12944 selected_tune = selected_cpu;
12945
12946 if (aarch64_enable_bti == 2)
12947 {
12948 #ifdef TARGET_ENABLE_BTI
12949 aarch64_enable_bti = 1;
12950 #else
12951 aarch64_enable_bti = 0;
12952 #endif
12953 }
12954
12955 /* Return address signing is currently not supported for ILP32 targets. For
12956 LP64 targets use the configured option in the absence of a command-line
12957 option for -mbranch-protection. */
12958 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12959 {
12960 #ifdef TARGET_ENABLE_PAC_RET
12961 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12962 #else
12963 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12964 #endif
12965 }
12966
12967 #ifndef HAVE_AS_MABI_OPTION
12968 /* The compiler may have been configured with 2.23.* binutils, which does
12969 not have support for ILP32. */
12970 if (TARGET_ILP32)
12971 error ("assembler does not support %<-mabi=ilp32%>");
12972 #endif
12973
12974 /* Convert -msve-vector-bits to a VG count. */
12975 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12976
12977 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12978 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12979
12980 /* Make sure we properly set up the explicit options. */
12981 if ((aarch64_cpu_string && valid_cpu)
12982 || (aarch64_tune_string && valid_tune))
12983 gcc_assert (explicit_tune_core != aarch64_none);
12984
12985 if ((aarch64_cpu_string && valid_cpu)
12986 || (aarch64_arch_string && valid_arch))
12987 gcc_assert (explicit_arch != aarch64_no_arch);
12988
12989 /* The pass to insert speculation tracking runs before
12990 shrink-wrapping and the latter does not know how to update the
12991 tracking status. So disable it in this case. */
12992 if (aarch64_track_speculation)
12993 flag_shrink_wrap = 0;
12994
12995 aarch64_override_options_internal (&global_options);
12996
12997 /* Save these options as the default ones in case we push and pop them later
12998 while processing functions with potential target attributes. */
12999 target_option_default_node = target_option_current_node
13000 = build_target_option_node (&global_options);
13001 }
13002
13003 /* Implement targetm.override_options_after_change. */
13004
13005 static void
13006 aarch64_override_options_after_change (void)
13007 {
13008 aarch64_override_options_after_change_1 (&global_options);
13009 }
13010
13011 static struct machine_function *
13012 aarch64_init_machine_status (void)
13013 {
13014 struct machine_function *machine;
13015 machine = ggc_cleared_alloc<machine_function> ();
13016 return machine;
13017 }
13018
13019 void
13020 aarch64_init_expanders (void)
13021 {
13022 init_machine_status = aarch64_init_machine_status;
13023 }
13024
13025 /* Set aarch64_cmodel from the selected code model and PIC level. */
13026 static void
13027 initialize_aarch64_code_model (struct gcc_options *opts)
13028 {
13029 if (opts->x_flag_pic)
13030 {
13031 switch (opts->x_aarch64_cmodel_var)
13032 {
13033 case AARCH64_CMODEL_TINY:
13034 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13035 break;
13036 case AARCH64_CMODEL_SMALL:
13037 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13038 aarch64_cmodel = (flag_pic == 2
13039 ? AARCH64_CMODEL_SMALL_PIC
13040 : AARCH64_CMODEL_SMALL_SPIC);
13041 #else
13042 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13043 #endif
13044 break;
13045 case AARCH64_CMODEL_LARGE:
13046 sorry ("code model %qs with %<-f%s%>", "large",
13047 opts->x_flag_pic > 1 ? "PIC" : "pic");
13048 break;
13049 default:
13050 gcc_unreachable ();
13051 }
13052 }
13053 else
13054 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13055 }
13056
13057 /* Implement TARGET_OPTION_SAVE. */
13058
13059 static void
13060 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13061 {
13062 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13063 ptr->x_aarch64_branch_protection_string
13064 = opts->x_aarch64_branch_protection_string;
13065 }
13066
13067 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13068 using the information saved in PTR. */
13069
13070 static void
13071 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13072 {
13073 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13074 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13075 opts->x_explicit_arch = ptr->x_explicit_arch;
13076 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13077 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13078 opts->x_aarch64_branch_protection_string
13079 = ptr->x_aarch64_branch_protection_string;
13080 if (opts->x_aarch64_branch_protection_string)
13081 {
13082 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13083 NULL);
13084 }
13085
13086 aarch64_override_options_internal (opts);
13087 }
13088
13089 /* Implement TARGET_OPTION_PRINT. */
13090
13091 static void
13092 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13093 {
13094 const struct processor *cpu
13095 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13096 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13097 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13098 std::string extension
13099 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13100
13101 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13102 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13103 arch->name, extension.c_str ());
13104 }
13105
13106 static GTY(()) tree aarch64_previous_fndecl;
13107
13108 void
13109 aarch64_reset_previous_fndecl (void)
13110 {
13111 aarch64_previous_fndecl = NULL;
13112 }
13113
13114 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13115 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13116 make sure optab availability predicates are recomputed when necessary. */
13117
13118 void
13119 aarch64_save_restore_target_globals (tree new_tree)
13120 {
13121 if (TREE_TARGET_GLOBALS (new_tree))
13122 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13123 else if (new_tree == target_option_default_node)
13124 restore_target_globals (&default_target_globals);
13125 else
13126 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13127 }
13128
13129 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13130 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13131 of the function, if such exists. This function may be called multiple
13132 times on a single function so use aarch64_previous_fndecl to avoid
13133 setting up identical state. */
13134
13135 static void
13136 aarch64_set_current_function (tree fndecl)
13137 {
13138 if (!fndecl || fndecl == aarch64_previous_fndecl)
13139 return;
13140
13141 tree old_tree = (aarch64_previous_fndecl
13142 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13143 : NULL_TREE);
13144
13145 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13146
13147 /* If current function has no attributes but the previous one did,
13148 use the default node. */
13149 if (!new_tree && old_tree)
13150 new_tree = target_option_default_node;
13151
13152 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13153 the default have been handled by aarch64_save_restore_target_globals from
13154 aarch64_pragma_target_parse. */
13155 if (old_tree == new_tree)
13156 return;
13157
13158 aarch64_previous_fndecl = fndecl;
13159
13160 /* First set the target options. */
13161 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13162
13163 aarch64_save_restore_target_globals (new_tree);
13164 }
13165
13166 /* Enum describing the various ways we can handle attributes.
13167 In many cases we can reuse the generic option handling machinery. */
13168
13169 enum aarch64_attr_opt_type
13170 {
13171 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13172 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13173 aarch64_attr_enum, /* Attribute sets an enum variable. */
13174 aarch64_attr_custom /* Attribute requires a custom handling function. */
13175 };
13176
13177 /* All the information needed to handle a target attribute.
13178 NAME is the name of the attribute.
13179 ATTR_TYPE specifies the type of behavior of the attribute as described
13180 in the definition of enum aarch64_attr_opt_type.
13181 ALLOW_NEG is true if the attribute supports a "no-" form.
13182 HANDLER is the function that takes the attribute string as an argument.
13183 It is needed only when ATTR_TYPE is aarch64_attr_custom.
13184 OPT_NUM is the enum specifying the option that the attribute modifies.
13185 This is needed for attributes that mirror the behavior of a command-line
13186 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
13187 or aarch64_attr_enum. */
13188
13189 struct aarch64_attribute_info
13190 {
13191 const char *name;
13192 enum aarch64_attr_opt_type attr_type;
13193 bool allow_neg;
13194 bool (*handler) (const char *);
13195 enum opt_code opt_num;
13196 };
13197
13198 /* Handle the ARCH_STR argument to the arch= target attribute. */
13199
13200 static bool
13201 aarch64_handle_attr_arch (const char *str)
13202 {
13203 const struct processor *tmp_arch = NULL;
13204 std::string invalid_extension;
13205 enum aarch64_parse_opt_result parse_res
13206 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13207
13208 if (parse_res == AARCH64_PARSE_OK)
13209 {
13210 gcc_assert (tmp_arch);
13211 selected_arch = tmp_arch;
13212 explicit_arch = selected_arch->arch;
13213 return true;
13214 }
13215
13216 switch (parse_res)
13217 {
13218 case AARCH64_PARSE_MISSING_ARG:
13219 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13220 break;
13221 case AARCH64_PARSE_INVALID_ARG:
13222 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13223 aarch64_print_hint_for_arch (str);
13224 break;
13225 case AARCH64_PARSE_INVALID_FEATURE:
13226 error ("invalid feature modifier %s of value (\"%s\") in "
13227 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13228 aarch64_print_hint_for_extensions (invalid_extension);
13229 break;
13230 default:
13231 gcc_unreachable ();
13232 }
13233
13234 return false;
13235 }
13236
13237 /* Handle the argument CPU_STR to the cpu= target attribute. */
13238
13239 static bool
13240 aarch64_handle_attr_cpu (const char *str)
13241 {
13242 const struct processor *tmp_cpu = NULL;
13243 std::string invalid_extension;
13244 enum aarch64_parse_opt_result parse_res
13245 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13246
13247 if (parse_res == AARCH64_PARSE_OK)
13248 {
13249 gcc_assert (tmp_cpu);
13250 selected_tune = tmp_cpu;
13251 explicit_tune_core = selected_tune->ident;
13252
13253 selected_arch = &all_architectures[tmp_cpu->arch];
13254 explicit_arch = selected_arch->arch;
13255 return true;
13256 }
13257
13258 switch (parse_res)
13259 {
13260 case AARCH64_PARSE_MISSING_ARG:
13261 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13262 break;
13263 case AARCH64_PARSE_INVALID_ARG:
13264 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13265 aarch64_print_hint_for_core (str);
13266 break;
13267 case AARCH64_PARSE_INVALID_FEATURE:
13268 error ("invalid feature modifier %s of value (\"%s\") in "
13269 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13270 aarch64_print_hint_for_extensions (invalid_extension);
13271 break;
13272 default:
13273 gcc_unreachable ();
13274 }
13275
13276 return false;
13277 }
13278
13279 /* Handle the argument STR to the branch-protection= attribute. */
13280
13281 static bool
13282 aarch64_handle_attr_branch_protection (const char* str)
13283 {
13284 char *err_str = (char *) xmalloc (strlen (str) + 1);
13285 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13286 &err_str);
13287 bool success = false;
13288 switch (res)
13289 {
13290 case AARCH64_PARSE_MISSING_ARG:
13291 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13292 " attribute");
13293 break;
13294 case AARCH64_PARSE_INVALID_ARG:
13295 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13296 "=\")%> pragma or attribute", err_str);
13297 break;
13298 case AARCH64_PARSE_OK:
13299 success = true;
13300 /* Fall through. */
13301 case AARCH64_PARSE_INVALID_FEATURE:
13302 break;
13303 default:
13304 gcc_unreachable ();
13305 }
13306 free (err_str);
13307 return success;
13308 }
13309
13310 /* Handle the argument STR to the tune= target attribute. */
13311
13312 static bool
13313 aarch64_handle_attr_tune (const char *str)
13314 {
13315 const struct processor *tmp_tune = NULL;
13316 enum aarch64_parse_opt_result parse_res
13317 = aarch64_parse_tune (str, &tmp_tune);
13318
13319 if (parse_res == AARCH64_PARSE_OK)
13320 {
13321 gcc_assert (tmp_tune);
13322 selected_tune = tmp_tune;
13323 explicit_tune_core = selected_tune->ident;
13324 return true;
13325 }
13326
13327 switch (parse_res)
13328 {
13329 case AARCH64_PARSE_INVALID_ARG:
13330 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13331 aarch64_print_hint_for_core (str);
13332 break;
13333 default:
13334 gcc_unreachable ();
13335 }
13336
13337 return false;
13338 }
13339
13340 /* Parse an architecture extensions target attribute string specified in STR.
13341 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13342 if successful. Update aarch64_isa_flags to reflect the ISA features
13343 modified. */
13344
13345 static bool
13346 aarch64_handle_attr_isa_flags (char *str)
13347 {
13348 enum aarch64_parse_opt_result parse_res;
13349 uint64_t isa_flags = aarch64_isa_flags;
13350
13351 /* We allow "+nothing" in the beginning to clear out all architectural
13352 features if the user wants to handpick specific features. */
13353 if (strncmp ("+nothing", str, 8) == 0)
13354 {
13355 isa_flags = 0;
13356 str += 8;
13357 }
13358
13359 std::string invalid_extension;
13360 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13361
13362 if (parse_res == AARCH64_PARSE_OK)
13363 {
13364 aarch64_isa_flags = isa_flags;
13365 return true;
13366 }
13367
13368 switch (parse_res)
13369 {
13370 case AARCH64_PARSE_MISSING_ARG:
13371 error ("missing value in %<target()%> pragma or attribute");
13372 break;
13373
13374 case AARCH64_PARSE_INVALID_FEATURE:
13375 error ("invalid feature modifier %s of value (\"%s\") in "
13376 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13377 break;
13378
13379 default:
13380 gcc_unreachable ();
13381 }
13382
13383 return false;
13384 }
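/* Illustrative use of the "+..." form handled above (a sketch, not taken
   from the testsuite):

     __attribute__ ((target ("+nothing+fp")))
     void fp_only (void);

   "+nothing" first clears aarch64_isa_flags and "+fp" then enables only the
   floating-point extension and whatever it implies.  */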
13385
13386 /* The target attributes that we support. On top of these we also support just
13387 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13388 handled explicitly in aarch64_process_one_target_attr. */
13389
13390 static const struct aarch64_attribute_info aarch64_attributes[] =
13391 {
13392 { "general-regs-only", aarch64_attr_mask, false, NULL,
13393 OPT_mgeneral_regs_only },
13394 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13395 OPT_mfix_cortex_a53_835769 },
13396 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13397 OPT_mfix_cortex_a53_843419 },
13398 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13399 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13400 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13401 OPT_momit_leaf_frame_pointer },
13402 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13403 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13404 OPT_march_ },
13405 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13406 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13407 OPT_mtune_ },
13408 { "branch-protection", aarch64_attr_custom, false,
13409 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13410 { "sign-return-address", aarch64_attr_enum, false, NULL,
13411 OPT_msign_return_address_ },
13412 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13413 };
13414
13415 /* Parse ARG_STR which contains the definition of one target attribute.
13416 Show appropriate errors if any or return true if the attribute is valid. */
13417
13418 static bool
13419 aarch64_process_one_target_attr (char *arg_str)
13420 {
13421 bool invert = false;
13422
13423 size_t len = strlen (arg_str);
13424
13425 if (len == 0)
13426 {
13427 error ("malformed %<target()%> pragma or attribute");
13428 return false;
13429 }
13430
13431 char *str_to_check = (char *) alloca (len + 1);
13432 strcpy (str_to_check, arg_str);
13433
13434 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13435 It is easier to detect and handle it explicitly here rather than going
13436 through the machinery for the rest of the target attributes in this
13437 function. */
13438 if (*str_to_check == '+')
13439 return aarch64_handle_attr_isa_flags (str_to_check);
13440
13441 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13442 {
13443 invert = true;
13444 str_to_check += 3;
13445 }
13446 char *arg = strchr (str_to_check, '=');
13447
13448 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13449 and point ARG to "foo". */
13450 if (arg)
13451 {
13452 *arg = '\0';
13453 arg++;
13454 }
13455 const struct aarch64_attribute_info *p_attr;
13456 bool found = false;
13457 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13458 {
13459 /* If the names don't match up, or the user has given an argument
13460 to an attribute that doesn't accept one, or didn't give an argument
13461 to an attribute that expects one, fail to match. */
13462 if (strcmp (str_to_check, p_attr->name) != 0)
13463 continue;
13464
13465 found = true;
13466 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13467 || p_attr->attr_type == aarch64_attr_enum;
13468
13469 if (attr_need_arg_p ^ (arg != NULL))
13470 {
13471 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13472 return false;
13473 }
13474
13475 /* If the name matches but the attribute does not allow "no-" versions
13476 then we can't match. */
13477 if (invert && !p_attr->allow_neg)
13478 {
13479 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13480 return false;
13481 }
13482
13483 switch (p_attr->attr_type)
13484 {
13485 /* Has a custom handler registered.
13486 For example, cpu=, arch=, tune=. */
13487 case aarch64_attr_custom:
13488 gcc_assert (p_attr->handler);
13489 if (!p_attr->handler (arg))
13490 return false;
13491 break;
13492
13493 /* Either set or unset a boolean option. */
13494 case aarch64_attr_bool:
13495 {
13496 struct cl_decoded_option decoded;
13497
13498 generate_option (p_attr->opt_num, NULL, !invert,
13499 CL_TARGET, &decoded);
13500 aarch64_handle_option (&global_options, &global_options_set,
13501 &decoded, input_location);
13502 break;
13503 }
13504 /* Set or unset a bit in the target_flags. aarch64_handle_option
13505 should know what mask to apply given the option number. */
13506 case aarch64_attr_mask:
13507 {
13508 struct cl_decoded_option decoded;
13509 /* We only need to specify the option number.
13510 aarch64_handle_option will know which mask to apply. */
13511 decoded.opt_index = p_attr->opt_num;
13512 decoded.value = !invert;
13513 aarch64_handle_option (&global_options, &global_options_set,
13514 &decoded, input_location);
13515 break;
13516 }
13517 /* Use the option setting machinery to set an option to an enum. */
13518 case aarch64_attr_enum:
13519 {
13520 gcc_assert (arg);
13521 bool valid;
13522 int value;
13523 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13524 &value, CL_TARGET);
13525 if (valid)
13526 {
13527 set_option (&global_options, NULL, p_attr->opt_num, value,
13528 NULL, DK_UNSPECIFIED, input_location,
13529 global_dc);
13530 }
13531 else
13532 {
13533 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13534 }
13535 break;
13536 }
13537 default:
13538 gcc_unreachable ();
13539 }
13540 }
13541
13542 /* If we reached here we either have found an attribute and validated
13543 it or didn't match any. If we matched an attribute but its arguments
13544 were malformed we will have returned false already. */
13545 return found;
13546 }
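/* Illustrative examples of how the routine above decomposes ARG_STR:

     "no-strict-align"  -> INVERT = true, name "strict-align", no argument
     "cmodel=small"     -> name "cmodel", ARG = "small" (aarch64_attr_enum)
     "arch=armv8-a+crc" -> name "arch", ARG = "armv8-a+crc" (custom handler)
     "+crc"             -> handed straight to aarch64_handle_attr_isa_flags  */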
13547
13548 /* Count how many times the character C appears in
13549 NULL-terminated string STR. */
13550
13551 static unsigned int
13552 num_occurences_in_str (char c, char *str)
13553 {
13554 unsigned int res = 0;
13555 while (*str != '\0')
13556 {
13557 if (*str == c)
13558 res++;
13559
13560 str++;
13561 }
13562
13563 return res;
13564 }
13565
13566 /* Parse the tree in ARGS that contains the target attribute information
13567 and update the global target options space. */
13568
13569 bool
13570 aarch64_process_target_attr (tree args)
13571 {
13572 if (TREE_CODE (args) == TREE_LIST)
13573 {
13574 do
13575 {
13576 tree head = TREE_VALUE (args);
13577 if (head)
13578 {
13579 if (!aarch64_process_target_attr (head))
13580 return false;
13581 }
13582 args = TREE_CHAIN (args);
13583 } while (args);
13584
13585 return true;
13586 }
13587
13588 if (TREE_CODE (args) != STRING_CST)
13589 {
13590 error ("attribute %<target%> argument not a string");
13591 return false;
13592 }
13593
13594 size_t len = strlen (TREE_STRING_POINTER (args));
13595 char *str_to_check = (char *) alloca (len + 1);
13596 strcpy (str_to_check, TREE_STRING_POINTER (args));
13597
13598 if (len == 0)
13599 {
13600 error ("malformed %<target()%> pragma or attribute");
13601 return false;
13602 }
13603
13604 /* Used to catch empty entries between commas, e.g.
13605 attribute ((target ("attr1,,attr2"))). */
13606 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13607
13608 /* Handle multiple target attributes separated by ','. */
13609 char *token = strtok_r (str_to_check, ",", &str_to_check);
13610
13611 unsigned int num_attrs = 0;
13612 while (token)
13613 {
13614 num_attrs++;
13615 if (!aarch64_process_one_target_attr (token))
13616 {
13617 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13618 return false;
13619 }
13620
13621 token = strtok_r (NULL, ",", &str_to_check);
13622 }
13623
13624 if (num_attrs != num_commas + 1)
13625 {
13626 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13627 return false;
13628 }
13629
13630 return true;
13631 }
13632
13633 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13634 process attribute ((target ("..."))). */
13635
13636 static bool
13637 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13638 {
13639 struct cl_target_option cur_target;
13640 bool ret;
13641 tree old_optimize;
13642 tree new_target, new_optimize;
13643 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13644
13645 /* If what we're processing is the current pragma string then the
13646 target option node is already stored in target_option_current_node
13647 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13648 having to re-parse the string. This is especially useful to keep
13649 arm_neon.h compile times down since that header contains a lot
13650 of intrinsics enclosed in pragmas. */
13651 if (!existing_target && args == current_target_pragma)
13652 {
13653 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13654 return true;
13655 }
13656 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13657
13658 old_optimize = build_optimization_node (&global_options);
13659 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13660
13661 /* If the function changed the optimization levels as well as setting
13662 target options, start with the optimizations specified. */
13663 if (func_optimize && func_optimize != old_optimize)
13664 cl_optimization_restore (&global_options,
13665 TREE_OPTIMIZATION (func_optimize));
13666
13667 /* Save the current target options to restore at the end. */
13668 cl_target_option_save (&cur_target, &global_options);
13669
13670 /* If fndecl already has some target attributes applied to it, unpack
13671 them so that we add this attribute on top of them, rather than
13672 overwriting them. */
13673 if (existing_target)
13674 {
13675 struct cl_target_option *existing_options
13676 = TREE_TARGET_OPTION (existing_target);
13677
13678 if (existing_options)
13679 cl_target_option_restore (&global_options, existing_options);
13680 }
13681 else
13682 cl_target_option_restore (&global_options,
13683 TREE_TARGET_OPTION (target_option_current_node));
13684
13685 ret = aarch64_process_target_attr (args);
13686
13687 /* Set up any additional state. */
13688 if (ret)
13689 {
13690 aarch64_override_options_internal (&global_options);
13691 /* Initialize SIMD builtins if we haven't already.
13692 Set current_target_pragma to NULL for the duration so that
13693 the builtin initialization code doesn't try to tag the functions
13694 being built with the attributes specified by any current pragma, thus
13695 going into an infinite recursion. */
13696 if (TARGET_SIMD)
13697 {
13698 tree saved_current_target_pragma = current_target_pragma;
13699 current_target_pragma = NULL;
13700 aarch64_init_simd_builtins ();
13701 current_target_pragma = saved_current_target_pragma;
13702 }
13703 new_target = build_target_option_node (&global_options);
13704 }
13705 else
13706 new_target = NULL;
13707
13708 new_optimize = build_optimization_node (&global_options);
13709
13710 if (fndecl && ret)
13711 {
13712 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13713
13714 if (old_optimize != new_optimize)
13715 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13716 }
13717
13718 cl_target_option_restore (&global_options, &cur_target);
13719
13720 if (old_optimize != new_optimize)
13721 cl_optimization_restore (&global_options,
13722 TREE_OPTIMIZATION (old_optimize));
13723 return ret;
13724 }
13725
13726 /* Helper for aarch64_can_inline_p. CALLER and CALLEE are tri-bool option
13727 values (yes, no, don't care), DONT_CARE is the "don't care" value and DEF
13728 the option's default; return true if inlining should be allowed. */
13729
13730 static bool
13731 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13732 int dont_care, int def)
13733 {
13734 /* If the callee doesn't care, always allow inlining. */
13735 if (callee == dont_care)
13736 return true;
13737
13738 /* If the caller doesn't care, always allow inlining. */
13739 if (caller == dont_care)
13740 return true;
13741
13742 /* Otherwise, allow inlining if either the callee and caller values
13743 agree, or if the callee is using the default value. */
13744 return (callee == caller || callee == def);
13745 }
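/* Worked example for the helper above with DONT_CARE = 2 and DEF = 1 (the
   values used for -momit-leaf-frame-pointer below): caller = 0, callee = 2
   allows inlining (the callee does not care); caller = 0, callee = 1 also
   allows it (the callee uses the default); caller = 1, callee = 0 rejects
   it.  */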
13746
13747 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13748 to inline CALLEE into CALLER based on target-specific info.
13749 Make sure that the caller and callee have compatible architectural
13750 features. Then go through the other possible target attributes
13751 and see if they can block inlining. Try not to reject always_inline
13752 callees unless they are incompatible architecturally. */
13753
13754 static bool
13755 aarch64_can_inline_p (tree caller, tree callee)
13756 {
13757 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13758 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13759
13760 struct cl_target_option *caller_opts
13761 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13762 : target_option_default_node);
13763
13764 struct cl_target_option *callee_opts
13765 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13766 : target_option_default_node);
13767
13768 /* Callee's ISA flags should be a subset of the caller's. */
13769 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13770 != callee_opts->x_aarch64_isa_flags)
13771 return false;
13772
13773 /* Allow non-strict aligned functions inlining into strict
13774 aligned ones. */
13775 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13776 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13777 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13778 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13779 return false;
13780
13781 bool always_inline = lookup_attribute ("always_inline",
13782 DECL_ATTRIBUTES (callee));
13783
13784 /* If the architectural features match up and the callee is always_inline
13785 then the other attributes don't matter. */
13786 if (always_inline)
13787 return true;
13788
13789 if (caller_opts->x_aarch64_cmodel_var
13790 != callee_opts->x_aarch64_cmodel_var)
13791 return false;
13792
13793 if (caller_opts->x_aarch64_tls_dialect
13794 != callee_opts->x_aarch64_tls_dialect)
13795 return false;
13796
13797 /* Honour explicit requests to work around errata. */
13798 if (!aarch64_tribools_ok_for_inlining_p (
13799 caller_opts->x_aarch64_fix_a53_err835769,
13800 callee_opts->x_aarch64_fix_a53_err835769,
13801 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13802 return false;
13803
13804 if (!aarch64_tribools_ok_for_inlining_p (
13805 caller_opts->x_aarch64_fix_a53_err843419,
13806 callee_opts->x_aarch64_fix_a53_err843419,
13807 2, TARGET_FIX_ERR_A53_843419))
13808 return false;
13809
13810 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13811 caller and callee and they don't match up, reject inlining. */
13812 if (!aarch64_tribools_ok_for_inlining_p (
13813 caller_opts->x_flag_omit_leaf_frame_pointer,
13814 callee_opts->x_flag_omit_leaf_frame_pointer,
13815 2, 1))
13816 return false;
13817
13818 /* If the callee has specific tuning overrides, respect them. */
13819 if (callee_opts->x_aarch64_override_tune_string != NULL
13820 && caller_opts->x_aarch64_override_tune_string == NULL)
13821 return false;
13822
13823 /* If the user specified tuning override strings for the
13824 caller and callee and they don't match up, reject inlining.
13825 We just do a string compare here, we don't analyze the meaning
13826 of the string, as it would be too costly for little gain. */
13827 if (callee_opts->x_aarch64_override_tune_string
13828 && caller_opts->x_aarch64_override_tune_string
13829 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13830 caller_opts->x_aarch64_override_tune_string) != 0))
13831 return false;
13832
13833 return true;
13834 }
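/* Illustrative consequence of the ISA-subset check above (a sketch assuming
   a translation unit otherwise compiled without SVE):

     __attribute__ ((target ("+sve")))
     static int callee (void) { return 0; }
     int caller (void) { return callee (); }

   The callee's ISA flags include SVE while the caller's do not, so the
   subset test fails and the call is never inlined, even if the callee were
   marked always_inline.  */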
13835
13836 /* Return true if SYMBOL_REF X binds locally. */
13837
13838 static bool
13839 aarch64_symbol_binds_local_p (const_rtx x)
13840 {
13841 return (SYMBOL_REF_DECL (x)
13842 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13843 : SYMBOL_REF_LOCAL_P (x));
13844 }
13845
13846 /* Return true if SYMBOL_REF X is thread-local. */
13847 static bool
13848 aarch64_tls_symbol_p (rtx x)
13849 {
13850 if (! TARGET_HAVE_TLS)
13851 return false;
13852
13853 if (GET_CODE (x) != SYMBOL_REF)
13854 return false;
13855
13856 return SYMBOL_REF_TLS_MODEL (x) != 0;
13857 }
13858
13859 /* Classify a TLS symbol into one of the TLS kinds. */
13860 enum aarch64_symbol_type
13861 aarch64_classify_tls_symbol (rtx x)
13862 {
13863 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13864
13865 switch (tls_kind)
13866 {
13867 case TLS_MODEL_GLOBAL_DYNAMIC:
13868 case TLS_MODEL_LOCAL_DYNAMIC:
13869 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13870
13871 case TLS_MODEL_INITIAL_EXEC:
13872 switch (aarch64_cmodel)
13873 {
13874 case AARCH64_CMODEL_TINY:
13875 case AARCH64_CMODEL_TINY_PIC:
13876 return SYMBOL_TINY_TLSIE;
13877 default:
13878 return SYMBOL_SMALL_TLSIE;
13879 }
13880
13881 case TLS_MODEL_LOCAL_EXEC:
13882 if (aarch64_tls_size == 12)
13883 return SYMBOL_TLSLE12;
13884 else if (aarch64_tls_size == 24)
13885 return SYMBOL_TLSLE24;
13886 else if (aarch64_tls_size == 32)
13887 return SYMBOL_TLSLE32;
13888 else if (aarch64_tls_size == 48)
13889 return SYMBOL_TLSLE48;
13890 else
13891 gcc_unreachable ();
13892
13893 case TLS_MODEL_EMULATED:
13894 case TLS_MODEL_NONE:
13895 return SYMBOL_FORCE_TO_MEM;
13896
13897 default:
13898 gcc_unreachable ();
13899 }
13900 }
13901
13902 /* Return the correct method for accessing X + OFFSET, where X is either
13903 a SYMBOL_REF or LABEL_REF. */
13904
13905 enum aarch64_symbol_type
13906 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13907 {
13908 if (GET_CODE (x) == LABEL_REF)
13909 {
13910 switch (aarch64_cmodel)
13911 {
13912 case AARCH64_CMODEL_LARGE:
13913 return SYMBOL_FORCE_TO_MEM;
13914
13915 case AARCH64_CMODEL_TINY_PIC:
13916 case AARCH64_CMODEL_TINY:
13917 return SYMBOL_TINY_ABSOLUTE;
13918
13919 case AARCH64_CMODEL_SMALL_SPIC:
13920 case AARCH64_CMODEL_SMALL_PIC:
13921 case AARCH64_CMODEL_SMALL:
13922 return SYMBOL_SMALL_ABSOLUTE;
13923
13924 default:
13925 gcc_unreachable ();
13926 }
13927 }
13928
13929 if (GET_CODE (x) == SYMBOL_REF)
13930 {
13931 if (aarch64_tls_symbol_p (x))
13932 return aarch64_classify_tls_symbol (x);
13933
13934 switch (aarch64_cmodel)
13935 {
13936 case AARCH64_CMODEL_TINY:
13937 /* When we retrieve symbol + offset address, we have to make sure
13938 the offset does not cause overflow of the final address. But
13939 we have no way of knowing the address of symbol at compile time
13940 so we can't accurately say if the distance between the PC and
13941 symbol + offset is outside the addressable range of +/-1M in the
13942 TINY code model. So we rely on images not being greater than
13943 1M and cap the offset at 1M and anything beyond 1M will have to
13944 be loaded using an alternative mechanism. Furthermore if the
13945 symbol is a weak reference to something that isn't known to
13946 resolve to a symbol in this module, then force to memory. */
13947 if ((SYMBOL_REF_WEAK (x)
13948 && !aarch64_symbol_binds_local_p (x))
13949 || !IN_RANGE (offset, -1048575, 1048575))
13950 return SYMBOL_FORCE_TO_MEM;
13951 return SYMBOL_TINY_ABSOLUTE;
13952
13953 case AARCH64_CMODEL_SMALL:
13954 /* Same reasoning as the tiny code model, but the offset cap here is
13955 4G. */
13956 if ((SYMBOL_REF_WEAK (x)
13957 && !aarch64_symbol_binds_local_p (x))
13958 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13959 HOST_WIDE_INT_C (4294967264)))
13960 return SYMBOL_FORCE_TO_MEM;
13961 return SYMBOL_SMALL_ABSOLUTE;
13962
13963 case AARCH64_CMODEL_TINY_PIC:
13964 if (!aarch64_symbol_binds_local_p (x))
13965 return SYMBOL_TINY_GOT;
13966 return SYMBOL_TINY_ABSOLUTE;
13967
13968 case AARCH64_CMODEL_SMALL_SPIC:
13969 case AARCH64_CMODEL_SMALL_PIC:
13970 if (!aarch64_symbol_binds_local_p (x))
13971 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13972 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13973 return SYMBOL_SMALL_ABSOLUTE;
13974
13975 case AARCH64_CMODEL_LARGE:
13976 /* This is alright even in PIC code as the constant
13977 pool reference is always PC relative and within
13978 the same translation unit. */
13979 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13980 return SYMBOL_SMALL_ABSOLUTE;
13981 else
13982 return SYMBOL_FORCE_TO_MEM;
13983
13984 default:
13985 gcc_unreachable ();
13986 }
13987 }
13988
13989 /* By default push everything into the constant pool. */
13990 return SYMBOL_FORCE_TO_MEM;
13991 }
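/* Worked example of the offset capping above: under -mcmodel=tiny a
   reference such as "ext_array + 0x200000" (2 MB past the symbol) lies
   outside the +/-1M window checked by IN_RANGE, so it is classified as
   SYMBOL_FORCE_TO_MEM and loaded from the constant pool rather than
   addressed directly relative to the PC.  */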
13992
13993 bool
13994 aarch64_constant_address_p (rtx x)
13995 {
13996 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13997 }
13998
13999 bool
14000 aarch64_legitimate_pic_operand_p (rtx x)
14001 {
14002 if (GET_CODE (x) == SYMBOL_REF
14003 || (GET_CODE (x) == CONST
14004 && GET_CODE (XEXP (x, 0)) == PLUS
14005 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14006 return false;
14007
14008 return true;
14009 }
14010
14011 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14012 that should be rematerialized rather than spilled. */
14013
14014 static bool
14015 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14016 {
14017 /* Support CSE and rematerialization of common constants. */
14018 if (CONST_INT_P (x)
14019 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14020 || GET_CODE (x) == CONST_VECTOR)
14021 return true;
14022
14023 /* Do not allow vector struct mode constants for Advanced SIMD.
14024 We could support 0 and -1 easily, but they need support in
14025 aarch64-simd.md. */
14026 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14027 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14028 return false;
14029
14030 /* Only accept variable-length vector constants if they can be
14031 handled directly.
14032
14033 ??? It would be possible to handle rematerialization of other
14034 constants via secondary reloads. */
14035 if (vec_flags & VEC_ANY_SVE)
14036 return aarch64_simd_valid_immediate (x, NULL);
14037
14038 if (GET_CODE (x) == HIGH)
14039 x = XEXP (x, 0);
14040
14041 /* Accept polynomial constants that can be calculated by using the
14042 destination of a move as the sole temporary. Constants that
14043 require a second temporary cannot be rematerialized (they can't be
14044 forced to memory and also aren't legitimate constants). */
14045 poly_int64 offset;
14046 if (poly_int_rtx_p (x, &offset))
14047 return aarch64_offset_temporaries (false, offset) <= 1;
14048
14049 /* If an offset is being added to something else, we need to allow the
14050 base to be moved into the destination register, meaning that there
14051 are no free temporaries for the offset. */
14052 x = strip_offset (x, &offset);
14053 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14054 return false;
14055
14056 /* Do not allow const (plus (anchor_symbol, const_int)). */
14057 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14058 return false;
14059
14060 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14061 so spilling them is better than rematerialization. */
14062 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14063 return true;
14064
14065 /* Label references are always constant. */
14066 if (GET_CODE (x) == LABEL_REF)
14067 return true;
14068
14069 return false;
14070 }
14071
14072 rtx
14073 aarch64_load_tp (rtx target)
14074 {
14075 if (!target
14076 || GET_MODE (target) != Pmode
14077 || !register_operand (target, Pmode))
14078 target = gen_reg_rtx (Pmode);
14079
14080 /* Can return in any reg. */
14081 emit_insn (gen_aarch64_load_tp_hard (target));
14082 return target;
14083 }
14084
14085 /* On AAPCS systems, this is the "struct __va_list". */
14086 static GTY(()) tree va_list_type;
14087
14088 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14089 Return the type to use as __builtin_va_list.
14090
14091 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14092
14093 struct __va_list
14094 {
14095 void *__stack;
14096 void *__gr_top;
14097 void *__vr_top;
14098 int __gr_offs;
14099 int __vr_offs;
14100 }; */
14101
14102 static tree
14103 aarch64_build_builtin_va_list (void)
14104 {
14105 tree va_list_name;
14106 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14107
14108 /* Create the type. */
14109 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14110 /* Give it the required name. */
14111 va_list_name = build_decl (BUILTINS_LOCATION,
14112 TYPE_DECL,
14113 get_identifier ("__va_list"),
14114 va_list_type);
14115 DECL_ARTIFICIAL (va_list_name) = 1;
14116 TYPE_NAME (va_list_type) = va_list_name;
14117 TYPE_STUB_DECL (va_list_type) = va_list_name;
14118
14119 /* Create the fields. */
14120 f_stack = build_decl (BUILTINS_LOCATION,
14121 FIELD_DECL, get_identifier ("__stack"),
14122 ptr_type_node);
14123 f_grtop = build_decl (BUILTINS_LOCATION,
14124 FIELD_DECL, get_identifier ("__gr_top"),
14125 ptr_type_node);
14126 f_vrtop = build_decl (BUILTINS_LOCATION,
14127 FIELD_DECL, get_identifier ("__vr_top"),
14128 ptr_type_node);
14129 f_groff = build_decl (BUILTINS_LOCATION,
14130 FIELD_DECL, get_identifier ("__gr_offs"),
14131 integer_type_node);
14132 f_vroff = build_decl (BUILTINS_LOCATION,
14133 FIELD_DECL, get_identifier ("__vr_offs"),
14134 integer_type_node);
14135
14136 /* Tell the tree-stdarg pass about our internal offset fields.
14137 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14138 purposes, to identify whether the code is updating the va_list internal
14139 offset fields in an irregular way. */
14140 va_list_gpr_counter_field = f_groff;
14141 va_list_fpr_counter_field = f_vroff;
14142
14143 DECL_ARTIFICIAL (f_stack) = 1;
14144 DECL_ARTIFICIAL (f_grtop) = 1;
14145 DECL_ARTIFICIAL (f_vrtop) = 1;
14146 DECL_ARTIFICIAL (f_groff) = 1;
14147 DECL_ARTIFICIAL (f_vroff) = 1;
14148
14149 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14150 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14151 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14152 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14153 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14154
14155 TYPE_FIELDS (va_list_type) = f_stack;
14156 DECL_CHAIN (f_stack) = f_grtop;
14157 DECL_CHAIN (f_grtop) = f_vrtop;
14158 DECL_CHAIN (f_vrtop) = f_groff;
14159 DECL_CHAIN (f_groff) = f_vroff;
14160
14161 /* Compute its layout. */
14162 layout_type (va_list_type);
14163
14164 return va_list_type;
14165 }
14166
14167 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14168 static void
14169 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14170 {
14171 const CUMULATIVE_ARGS *cum;
14172 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14173 tree stack, grtop, vrtop, groff, vroff;
14174 tree t;
14175 int gr_save_area_size = cfun->va_list_gpr_size;
14176 int vr_save_area_size = cfun->va_list_fpr_size;
14177 int vr_offset;
14178
14179 cum = &crtl->args.info;
14180 if (cfun->va_list_gpr_size)
14181 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14182 cfun->va_list_gpr_size);
14183 if (cfun->va_list_fpr_size)
14184 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14185 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14186
14187 if (!TARGET_FLOAT)
14188 {
14189 gcc_assert (cum->aapcs_nvrn == 0);
14190 vr_save_area_size = 0;
14191 }
14192
14193 f_stack = TYPE_FIELDS (va_list_type_node);
14194 f_grtop = DECL_CHAIN (f_stack);
14195 f_vrtop = DECL_CHAIN (f_grtop);
14196 f_groff = DECL_CHAIN (f_vrtop);
14197 f_vroff = DECL_CHAIN (f_groff);
14198
14199 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14200 NULL_TREE);
14201 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14202 NULL_TREE);
14203 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14204 NULL_TREE);
14205 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14206 NULL_TREE);
14207 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14208 NULL_TREE);
14209
14210 /* Emit code to initialize STACK, which points to the next varargs stack
14211 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14212 by named arguments. STACK is 8-byte aligned. */
14213 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14214 if (cum->aapcs_stack_size > 0)
14215 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14216 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14217 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14218
14219 /* Emit code to initialize GRTOP, the top of the GR save area.
14220 virtual_incoming_args_rtx should have been 16-byte aligned. */
14221 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14222 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14223 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14224
14225 /* Emit code to initialize VRTOP, the top of the VR save area.
14226 This address is gr_save_area_bytes below GRTOP, rounded
14227 down to the next 16-byte boundary. */
14228 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14229 vr_offset = ROUND_UP (gr_save_area_size,
14230 STACK_BOUNDARY / BITS_PER_UNIT);
14231
14232 if (vr_offset)
14233 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14234 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14235 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14236
14237 /* Emit code to initialize GROFF, the offset from GRTOP of the
14238 next GPR argument. */
14239 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14240 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14241 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14242
14243 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14244 of the next VR argument. */
14245 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14246 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14247 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14248 }
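
/* As a rough illustration only (hand-written pseudo C, not the exact trees
   built above): for a callee whose named arguments consumed NCRN of the 8
   general argument registers, NVRN of the 8 vector argument registers and
   STK bytes of stack, and assuming tree-stdarg does not shrink the save
   areas, the code emitted here behaves approximately like

     ap.__stack = incoming_args + STK;                // next stack argument
     ap.__gr_top = incoming_args;                     // end of the GR save area
     ap.__vr_top = incoming_args
                   - ROUND_UP ((8 - NCRN) * 8, 16);   // end of the VR save area
     ap.__gr_offs = -(8 - NCRN) * 8;
     ap.__vr_offs = -(8 - NVRN) * 16;

   where incoming_args stands for virtual_incoming_args_rtx and the save
   areas themselves are filled in by aarch64_setup_incoming_varargs below.  */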
14249
14250 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14251
14252 static tree
14253 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14254 gimple_seq *post_p ATTRIBUTE_UNUSED)
14255 {
14256 tree addr;
14257 bool indirect_p;
14258 bool is_ha; /* is HFA or HVA. */
14259 bool dw_align; /* double-word align. */
14260 machine_mode ag_mode = VOIDmode;
14261 int nregs;
14262 machine_mode mode;
14263
14264 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14265 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14266 HOST_WIDE_INT size, rsize, adjust, align;
14267 tree t, u, cond1, cond2;
14268
14269 indirect_p = pass_va_arg_by_reference (type);
14270 if (indirect_p)
14271 type = build_pointer_type (type);
14272
14273 mode = TYPE_MODE (type);
14274
14275 f_stack = TYPE_FIELDS (va_list_type_node);
14276 f_grtop = DECL_CHAIN (f_stack);
14277 f_vrtop = DECL_CHAIN (f_grtop);
14278 f_groff = DECL_CHAIN (f_vrtop);
14279 f_vroff = DECL_CHAIN (f_groff);
14280
14281 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14282 f_stack, NULL_TREE);
14283 size = int_size_in_bytes (type);
14284
14285 bool abi_break;
14286 align
14287 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14288
14289 dw_align = false;
14290 adjust = 0;
14291 if (aarch64_vfp_is_call_or_return_candidate (mode,
14292 type,
14293 &ag_mode,
14294 &nregs,
14295 &is_ha))
14296 {
14297 /* No frontends can create types with variable-sized modes, so we
14298 shouldn't be asked to pass or return them. */
14299 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14300
14301 /* TYPE passed in fp/simd registers. */
14302 if (!TARGET_FLOAT)
14303 aarch64_err_no_fpadvsimd (mode);
14304
14305 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14306 unshare_expr (valist), f_vrtop, NULL_TREE);
14307 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14308 unshare_expr (valist), f_vroff, NULL_TREE);
14309
14310 rsize = nregs * UNITS_PER_VREG;
14311
14312 if (is_ha)
14313 {
14314 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14315 adjust = UNITS_PER_VREG - ag_size;
14316 }
14317 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14318 && size < UNITS_PER_VREG)
14319 {
14320 adjust = UNITS_PER_VREG - size;
14321 }
14322 }
14323 else
14324 {
14325 /* TYPE passed in general registers. */
14326 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14327 unshare_expr (valist), f_grtop, NULL_TREE);
14328 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14329 unshare_expr (valist), f_groff, NULL_TREE);
14330 rsize = ROUND_UP (size, UNITS_PER_WORD);
14331 nregs = rsize / UNITS_PER_WORD;
14332
14333 if (align > 8)
14334 {
14335 if (abi_break && warn_psabi)
14336 inform (input_location, "parameter passing for argument of type "
14337 "%qT changed in GCC 9.1", type);
14338 dw_align = true;
14339 }
14340
14341 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14342 && size < UNITS_PER_WORD)
14343 {
14344 adjust = UNITS_PER_WORD - size;
14345 }
14346 }
14347
14348 /* Get a local temporary for the field value. */
14349 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14350
14351 /* Emit code to branch if off >= 0. */
14352 t = build2 (GE_EXPR, boolean_type_node, off,
14353 build_int_cst (TREE_TYPE (off), 0));
14354 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14355
14356 if (dw_align)
14357 {
14358 /* Emit: offs = (offs + 15) & -16. */
14359 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14360 build_int_cst (TREE_TYPE (off), 15));
14361 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14362 build_int_cst (TREE_TYPE (off), -16));
14363 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14364 }
14365 else
14366 roundup = NULL;
14367
14368 /* Update ap.__[g|v]r_offs */
14369 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14370 build_int_cst (TREE_TYPE (off), rsize));
14371 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14372
14373 /* String up. */
14374 if (roundup)
14375 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14376
14377 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14378 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14379 build_int_cst (TREE_TYPE (f_off), 0));
14380 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14381
14382 /* String up: make sure the assignment happens before the use. */
14383 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14384 COND_EXPR_ELSE (cond1) = t;
14385
14386 /* Prepare the trees handling the argument that is passed on the stack;
14387 the top level node will store in ON_STACK. */
14388 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14389 if (align > 8)
14390 {
14391 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
14392 t = fold_build_pointer_plus_hwi (arg, 15);
14393 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14394 build_int_cst (TREE_TYPE (t), -16));
14395 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14396 }
14397 else
14398 roundup = NULL;
14399 /* Advance ap.__stack */
14400 t = fold_build_pointer_plus_hwi (arg, size + 7);
14401 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14402 build_int_cst (TREE_TYPE (t), -8));
14403 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14404 /* String up roundup and advance. */
14405 if (roundup)
14406 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14407 /* String up with arg */
14408 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14409 /* Big-endianness related address adjustment. */
14410 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14411 && size < UNITS_PER_WORD)
14412 {
14413 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14414 size_int (UNITS_PER_WORD - size));
14415 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14416 }
14417
14418 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14419 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14420
14421 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14422 t = off;
14423 if (adjust)
14424 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14425 build_int_cst (TREE_TYPE (off), adjust));
14426
14427 t = fold_convert (sizetype, t);
14428 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14429
14430 if (is_ha)
14431 {
14432 /* type ha; // treat as "struct {ftype field[n];}"
14433 ... [computing offs]
14434 for (i = 0; i < nregs; ++i, offs += 16)
14435 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14436 return ha; */
14437 int i;
14438 tree tmp_ha, field_t, field_ptr_t;
14439
14440 /* Declare a local variable. */
14441 tmp_ha = create_tmp_var_raw (type, "ha");
14442 gimple_add_tmp_var (tmp_ha);
14443
14444 /* Establish the base type. */
14445 switch (ag_mode)
14446 {
14447 case E_SFmode:
14448 field_t = float_type_node;
14449 field_ptr_t = float_ptr_type_node;
14450 break;
14451 case E_DFmode:
14452 field_t = double_type_node;
14453 field_ptr_t = double_ptr_type_node;
14454 break;
14455 case E_TFmode:
14456 field_t = long_double_type_node;
14457 field_ptr_t = long_double_ptr_type_node;
14458 break;
14459 case E_HFmode:
14460 field_t = aarch64_fp16_type_node;
14461 field_ptr_t = aarch64_fp16_ptr_type_node;
14462 break;
14463 case E_V2SImode:
14464 case E_V4SImode:
14465 {
14466 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14467 field_t = build_vector_type_for_mode (innertype, ag_mode);
14468 field_ptr_t = build_pointer_type (field_t);
14469 }
14470 break;
14471 default:
14472 gcc_assert (0);
14473 }
14474
14475 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14476 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14477 addr = t;
14478 t = fold_convert (field_ptr_t, addr);
14479 t = build2 (MODIFY_EXPR, field_t,
14480 build1 (INDIRECT_REF, field_t, tmp_ha),
14481 build1 (INDIRECT_REF, field_t, t));
14482
14483 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14484 for (i = 1; i < nregs; ++i)
14485 {
14486 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14487 u = fold_convert (field_ptr_t, addr);
14488 u = build2 (MODIFY_EXPR, field_t,
14489 build2 (MEM_REF, field_t, tmp_ha,
14490 build_int_cst (field_ptr_t,
14491 (i *
14492 int_size_in_bytes (field_t)))),
14493 build1 (INDIRECT_REF, field_t, u));
14494 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14495 }
14496
14497 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14498 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14499 }
14500
14501 COND_EXPR_ELSE (cond2) = t;
14502 addr = fold_convert (build_pointer_type (type), cond1);
14503 addr = build_va_arg_indirect_ref (addr);
14504
14505 if (indirect_p)
14506 addr = build_va_arg_indirect_ref (addr);
14507
14508 return addr;
14509 }
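
/* For orientation, the trees built above implement (roughly) the standard
   AAPCS64 va_arg algorithm, shown here as hand-written pseudo C for a
   general-register candidate; the FP/SIMD case is the same with __vr_top,
   __vr_offs and 16-byte register slots:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                      // GR save area already exhausted
     ap.__gr_offs = off + rsize;           // claim the register slot(s)
     if (ap.__gr_offs > 0)
       goto on_stack;                      // would overflow the save area
     addr = ap.__gr_top + off;             // read from the register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8;  // advance past the stack slot
   done:
     result = *(type *) addr;

   Double-word alignment, the big-endian adjustments and the homogeneous
   aggregate copy are layered on top of this basic shape.  */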
14510
14511 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14512
14513 static void
14514 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14515 const function_arg_info &arg,
14516 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14517 {
14518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14519 CUMULATIVE_ARGS local_cum;
14520 int gr_saved = cfun->va_list_gpr_size;
14521 int vr_saved = cfun->va_list_fpr_size;
14522
14523 /* The caller has advanced CUM up to, but not beyond, the last named
14524 argument. Advance a local copy of CUM past the last "real" named
14525 argument, to find out how many registers are left over. */
14526 local_cum = *cum;
14527 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14528
14529 /* Find out how many registers we need to save.
14530 Honor tree-stdarg analysis results. */
14531 if (cfun->va_list_gpr_size)
14532 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14533 cfun->va_list_gpr_size / UNITS_PER_WORD);
14534 if (cfun->va_list_fpr_size)
14535 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14536 cfun->va_list_fpr_size / UNITS_PER_VREG);
14537
14538 if (!TARGET_FLOAT)
14539 {
14540 gcc_assert (local_cum.aapcs_nvrn == 0);
14541 vr_saved = 0;
14542 }
14543
14544 if (!no_rtl)
14545 {
14546 if (gr_saved > 0)
14547 {
14548 rtx ptr, mem;
14549
14550 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14551 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14552 - gr_saved * UNITS_PER_WORD);
14553 mem = gen_frame_mem (BLKmode, ptr);
14554 set_mem_alias_set (mem, get_varargs_alias_set ());
14555
14556 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14557 mem, gr_saved);
14558 }
14559 if (vr_saved > 0)
14560 {
14561 /* We can't use move_block_from_reg, because it will use
14562 the wrong mode, storing D regs only. */
14563 machine_mode mode = TImode;
14564 int off, i, vr_start;
14565
14566 /* Set OFF to the offset from virtual_incoming_args_rtx of
14567 the first vector register. The VR save area lies below
14568 the GR one, and is aligned to 16 bytes. */
14569 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14570 STACK_BOUNDARY / BITS_PER_UNIT);
14571 off -= vr_saved * UNITS_PER_VREG;
14572
14573 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14574 for (i = 0; i < vr_saved; ++i)
14575 {
14576 rtx ptr, mem;
14577
14578 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14579 mem = gen_frame_mem (mode, ptr);
14580 set_mem_alias_set (mem, get_varargs_alias_set ());
14581 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14582 off += UNITS_PER_VREG;
14583 }
14584 }
14585 }
14586
14587 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14588 any complication of having crtl->args.pretend_args_size changed. */
14589 cfun->machine->frame.saved_varargs_size
14590 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14591 STACK_BOUNDARY / BITS_PER_UNIT)
14592 + vr_saved * UNITS_PER_VREG);
14593 }
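
/* The anonymous-argument save area built above sits immediately below the
   incoming stack arguments.  Roughly, with offsets taken relative to
   virtual_incoming_args_rtx, GR = 8 * gr_saved, GR16 = ROUND_UP (GR, 16)
   and VR = 16 * vr_saved:

       [0, ...)              incoming stack arguments
       [-GR, 0)              x(8 - gr_saved) .. x7        (GR save area)
       [-GR16 - VR, -GR16)   q(8 - vr_saved) .. q7        (VR save area)

   saved_varargs_size records the total (GR16 + VR) so that the prologue
   can allocate the space.  */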
14594
14595 static void
14596 aarch64_conditional_register_usage (void)
14597 {
14598 int i;
14599 if (!TARGET_FLOAT)
14600 {
14601 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14602 {
14603 fixed_regs[i] = 1;
14604 call_used_regs[i] = 1;
14605 }
14606 }
14607 if (!TARGET_SVE)
14608 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14609 {
14610 fixed_regs[i] = 1;
14611 call_used_regs[i] = 1;
14612 }
14613
14614 /* When tracking speculation, we need a couple of call-clobbered registers
14615 to track the speculation state. It would be nice to just use
14616 IP0 and IP1, but currently there are numerous places that just
14617 assume these registers are free for other uses (e.g. pointer
14618 authentication). */
14619 if (aarch64_track_speculation)
14620 {
14621 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14622 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14623 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14624 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14625 }
14626 }
14627
14628 /* Walk down the type tree of TYPE counting consecutive base elements.
14629 If *MODEP is VOIDmode, then set it to the first valid floating point
14630 type. If a non-floating point type is found, or if a floating point
14631 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14632 otherwise return the count in the sub-tree. */
14633 static int
14634 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14635 {
14636 machine_mode mode;
14637 HOST_WIDE_INT size;
14638
14639 switch (TREE_CODE (type))
14640 {
14641 case REAL_TYPE:
14642 mode = TYPE_MODE (type);
14643 if (mode != DFmode && mode != SFmode
14644 && mode != TFmode && mode != HFmode)
14645 return -1;
14646
14647 if (*modep == VOIDmode)
14648 *modep = mode;
14649
14650 if (*modep == mode)
14651 return 1;
14652
14653 break;
14654
14655 case COMPLEX_TYPE:
14656 mode = TYPE_MODE (TREE_TYPE (type));
14657 if (mode != DFmode && mode != SFmode
14658 && mode != TFmode && mode != HFmode)
14659 return -1;
14660
14661 if (*modep == VOIDmode)
14662 *modep = mode;
14663
14664 if (*modep == mode)
14665 return 2;
14666
14667 break;
14668
14669 case VECTOR_TYPE:
14670 /* Use V2SImode and V4SImode as representatives of all 64-bit
14671 and 128-bit vector types. */
14672 size = int_size_in_bytes (type);
14673 switch (size)
14674 {
14675 case 8:
14676 mode = V2SImode;
14677 break;
14678 case 16:
14679 mode = V4SImode;
14680 break;
14681 default:
14682 return -1;
14683 }
14684
14685 if (*modep == VOIDmode)
14686 *modep = mode;
14687
14688 /* Vector modes are considered to be opaque: two vectors are
14689 equivalent for the purposes of being homogeneous aggregates
14690 if they are the same size. */
14691 if (*modep == mode)
14692 return 1;
14693
14694 break;
14695
14696 case ARRAY_TYPE:
14697 {
14698 int count;
14699 tree index = TYPE_DOMAIN (type);
14700
14701 /* Can't handle incomplete types or sizes that are not
14702 fixed. */
14703 if (!COMPLETE_TYPE_P (type)
14704 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14705 return -1;
14706
14707 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14708 if (count == -1
14709 || !index
14710 || !TYPE_MAX_VALUE (index)
14711 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14712 || !TYPE_MIN_VALUE (index)
14713 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14714 || count < 0)
14715 return -1;
14716
14717 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14718 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14719
14720 /* There must be no padding. */
14721 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14722 count * GET_MODE_BITSIZE (*modep)))
14723 return -1;
14724
14725 return count;
14726 }
14727
14728 case RECORD_TYPE:
14729 {
14730 int count = 0;
14731 int sub_count;
14732 tree field;
14733
14734 /* Can't handle incomplete types or sizes that are not
14735 fixed. */
14736 if (!COMPLETE_TYPE_P (type)
14737 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14738 return -1;
14739
14740 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14741 {
14742 if (TREE_CODE (field) != FIELD_DECL)
14743 continue;
14744
14745 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14746 if (sub_count < 0)
14747 return -1;
14748 count += sub_count;
14749 }
14750
14751 /* There must be no padding. */
14752 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14753 count * GET_MODE_BITSIZE (*modep)))
14754 return -1;
14755
14756 return count;
14757 }
14758
14759 case UNION_TYPE:
14760 case QUAL_UNION_TYPE:
14761 {
14762 /* These aren't very interesting except in a degenerate case. */
14763 int count = 0;
14764 int sub_count;
14765 tree field;
14766
14767 /* Can't handle incomplete types or sizes that are not
14768 fixed. */
14769 if (!COMPLETE_TYPE_P (type)
14770 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14771 return -1;
14772
14773 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14774 {
14775 if (TREE_CODE (field) != FIELD_DECL)
14776 continue;
14777
14778 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14779 if (sub_count < 0)
14780 return -1;
14781 count = count > sub_count ? count : sub_count;
14782 }
14783
14784 /* There must be no padding. */
14785 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14786 count * GET_MODE_BITSIZE (*modep)))
14787 return -1;
14788
14789 return count;
14790 }
14791
14792 default:
14793 break;
14794 }
14795
14796 return -1;
14797 }
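
/* Some illustrative results, assuming the usual type/mode mappings:

     struct { float x, y, z; }       ->  3, *modep == SFmode
     _Complex double                 ->  2, *modep == DFmode
     double[4]                       ->  4, *modep == DFmode
     struct { float32x4_t a, b; }    ->  2, *modep == V4SImode (all 128-bit
                                         vectors map to the V4SI stand-in)
     struct { double d; float f; }   -> -1  (mixed element modes)
     struct { float f; int i; }      -> -1  (non-floating-point member)  */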
14798
14799 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14800 type as described in AAPCS64 \S 4.1.2.
14801
14802 See the comment above aarch64_composite_type_p for the notes on MODE. */
14803
14804 static bool
14805 aarch64_short_vector_p (const_tree type,
14806 machine_mode mode)
14807 {
14808 poly_int64 size = -1;
14809
14810 if (type && TREE_CODE (type) == VECTOR_TYPE)
14811 size = int_size_in_bytes (type);
14812 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14813 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14814 size = GET_MODE_SIZE (mode);
14815
14816 return known_eq (size, 8) || known_eq (size, 16);
14817 }
14818
14819 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14820 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14821 array types. The C99 floating-point complex types are also considered
14822 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14823 types, which are GCC extensions and out of the scope of AAPCS64, are
14824 treated as composite types here as well.
14825
14826 Note that MODE itself is not sufficient in determining whether a type
14827 is such a composite type or not. This is because
14828 stor-layout.c:compute_record_mode may have already changed the MODE
14829 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14830 structure with only one field may have its MODE set to the mode of the
14831 field. Also an integer mode whose size matches the size of the
14832 RECORD_TYPE type may be used to substitute the original mode
14833 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14834 solely relied on. */
14835
14836 static bool
14837 aarch64_composite_type_p (const_tree type,
14838 machine_mode mode)
14839 {
14840 if (aarch64_short_vector_p (type, mode))
14841 return false;
14842
14843 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14844 return true;
14845
14846 if (mode == BLKmode
14847 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14848 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14849 return true;
14850
14851 return false;
14852 }
14853
14854 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14855 shall be passed or returned in simd/fp register(s) (provided these
14856 parameter passing registers are available).
14857
14858 Upon successful return, *COUNT returns the number of needed registers,
14859 *BASE_MODE returns the mode of the individual register and, when IS_HA
14860 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14861 floating-point aggregate or a homogeneous short-vector aggregate. */
14862
14863 static bool
14864 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14865 const_tree type,
14866 machine_mode *base_mode,
14867 int *count,
14868 bool *is_ha)
14869 {
14870 machine_mode new_mode = VOIDmode;
14871 bool composite_p = aarch64_composite_type_p (type, mode);
14872
14873 if (is_ha != NULL) *is_ha = false;
14874
14875 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14876 || aarch64_short_vector_p (type, mode))
14877 {
14878 *count = 1;
14879 new_mode = mode;
14880 }
14881 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14882 {
14883 if (is_ha != NULL) *is_ha = true;
14884 *count = 2;
14885 new_mode = GET_MODE_INNER (mode);
14886 }
14887 else if (type && composite_p)
14888 {
14889 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14890
14891 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14892 {
14893 if (is_ha != NULL) *is_ha = true;
14894 *count = ag_count;
14895 }
14896 else
14897 return false;
14898 }
14899 else
14900 return false;
14901
14902 *base_mode = new_mode;
14903 return true;
14904 }
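
/* A few examples of the resulting classification (illustrative only):

     double                      ->  true,  *count = 1, *base_mode = DFmode
     _Complex float              ->  true,  *count = 2, *base_mode = SFmode,
                                     *is_ha = true
     struct { float f[4]; }      ->  true,  *count = 4, *base_mode = SFmode,
                                     *is_ha = true   (an HFA)
     struct { float f[5]; }      ->  false  (exceeds HA_MAX_NUM_FLDS)
     struct { double d; int i; } ->  false  (not homogeneous)  */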
14905
14906 /* Implement TARGET_STRUCT_VALUE_RTX. */
14907
14908 static rtx
14909 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14910 int incoming ATTRIBUTE_UNUSED)
14911 {
14912 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14913 }
14914
14915 /* Implements target hook vector_mode_supported_p. */
14916 static bool
14917 aarch64_vector_mode_supported_p (machine_mode mode)
14918 {
14919 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14920 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14921 }
14922
14923 /* Return the full-width SVE vector mode for element mode MODE, if one
14924 exists. */
14925 opt_machine_mode
14926 aarch64_full_sve_mode (scalar_mode mode)
14927 {
14928 switch (mode)
14929 {
14930 case E_DFmode:
14931 return VNx2DFmode;
14932 case E_SFmode:
14933 return VNx4SFmode;
14934 case E_HFmode:
14935 return VNx8HFmode;
14936 case E_DImode:
14937 return VNx2DImode;
14938 case E_SImode:
14939 return VNx4SImode;
14940 case E_HImode:
14941 return VNx8HImode;
14942 case E_QImode:
14943 return VNx16QImode;
14944 default:
14945 return opt_machine_mode ();
14946 }
14947 }
14948
14949 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14950 if it exists. */
14951 opt_machine_mode
14952 aarch64_vq_mode (scalar_mode mode)
14953 {
14954 switch (mode)
14955 {
14956 case E_DFmode:
14957 return V2DFmode;
14958 case E_SFmode:
14959 return V4SFmode;
14960 case E_HFmode:
14961 return V8HFmode;
14962 case E_SImode:
14963 return V4SImode;
14964 case E_HImode:
14965 return V8HImode;
14966 case E_QImode:
14967 return V16QImode;
14968 case E_DImode:
14969 return V2DImode;
14970 default:
14971 return opt_machine_mode ();
14972 }
14973 }
14974
14975 /* Return the appropriate SIMD container mode
14976 for MODE within a vector of WIDTH bits. */
14977 static machine_mode
14978 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14979 {
14980 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14981 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14982
14983 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14984 if (TARGET_SIMD)
14985 {
14986 if (known_eq (width, 128))
14987 return aarch64_vq_mode (mode).else_mode (word_mode);
14988 else
14989 switch (mode)
14990 {
14991 case E_SFmode:
14992 return V2SFmode;
14993 case E_HFmode:
14994 return V4HFmode;
14995 case E_SImode:
14996 return V2SImode;
14997 case E_HImode:
14998 return V4HImode;
14999 case E_QImode:
15000 return V8QImode;
15001 default:
15002 break;
15003 }
15004 }
15005 return word_mode;
15006 }
15007
15008 /* Return the preferred SIMD mode for MODE: a full SVE vector if SVE is enabled, otherwise the 128-bit container. */
15009 static machine_mode
15010 aarch64_preferred_simd_mode (scalar_mode mode)
15011 {
15012 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15013 return aarch64_simd_container_mode (mode, bits);
15014 }
15015
15016 /* Return a list of possible vector sizes for the vectorizer
15017 to iterate over. */
15018 static void
15019 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15020 {
15021 if (TARGET_SVE)
15022 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15023 sizes->safe_push (16);
15024 sizes->safe_push (8);
15025 }
15026
15027 /* Implement TARGET_MANGLE_TYPE. */
15028
15029 static const char *
15030 aarch64_mangle_type (const_tree type)
15031 {
15032 /* The AArch64 ABI documents say that "__va_list" has to be
15033 mangled as if it is in the "std" namespace. */
15034 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15035 return "St9__va_list";
15036
15037 /* Half-precision float. */
15038 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15039 return "Dh";
15040
15041 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15042 builtin types. */
15043 if (TYPE_NAME (type) != NULL)
15044 return aarch64_mangle_builtin_type (type);
15045
15046 /* Use the default mangling. */
15047 return NULL;
15048 }
15049
15050 /* Find the first rtx_insn before insn that will generate an assembly
15051 instruction. */
15052
15053 static rtx_insn *
15054 aarch64_prev_real_insn (rtx_insn *insn)
15055 {
15056 if (!insn)
15057 return NULL;
15058
15059 do
15060 {
15061 insn = prev_real_insn (insn);
15062 }
15063 while (insn && recog_memoized (insn) < 0);
15064
15065 return insn;
15066 }
15067
15068 static bool
15069 is_madd_op (enum attr_type t1)
15070 {
15071 unsigned int i;
15072 /* A number of these may be AArch32 only. */
15073 enum attr_type mlatypes[] = {
15074 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15075 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15076 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15077 };
15078
15079 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15080 {
15081 if (t1 == mlatypes[i])
15082 return true;
15083 }
15084
15085 return false;
15086 }
15087
15088 /* Check if there is a register dependency between a load and the insn
15089 for which we hold recog_data. */
15090
15091 static bool
15092 dep_between_memop_and_curr (rtx memop)
15093 {
15094 rtx load_reg;
15095 int opno;
15096
15097 gcc_assert (GET_CODE (memop) == SET);
15098
15099 if (!REG_P (SET_DEST (memop)))
15100 return false;
15101
15102 load_reg = SET_DEST (memop);
15103 for (opno = 1; opno < recog_data.n_operands; opno++)
15104 {
15105 rtx operand = recog_data.operand[opno];
15106 if (REG_P (operand)
15107 && reg_overlap_mentioned_p (load_reg, operand))
15108 return true;
15109
15110 }
15111 return false;
15112 }
15113
15114
15115 /* When working around the Cortex-A53 erratum 835769,
15116 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15117 instruction and has a preceding memory instruction such that a NOP
15118 should be inserted between them. */
15119
15120 bool
15121 aarch64_madd_needs_nop (rtx_insn* insn)
15122 {
15123 enum attr_type attr_type;
15124 rtx_insn *prev;
15125 rtx body;
15126
15127 if (!TARGET_FIX_ERR_A53_835769)
15128 return false;
15129
15130 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15131 return false;
15132
15133 attr_type = get_attr_type (insn);
15134 if (!is_madd_op (attr_type))
15135 return false;
15136
15137 prev = aarch64_prev_real_insn (insn);
15138 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15139 Restore recog state to INSN to avoid state corruption. */
15140 extract_constrain_insn_cached (insn);
15141
15142 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15143 return false;
15144
15145 body = single_set (prev);
15146
15147 /* If the previous insn is a memory op and there is no dependency between
15148 it and the DImode madd, emit a NOP between them. If body is NULL then we
15149 have a complex memory operation, probably a load/store pair.
15150 Be conservative for now and emit a NOP. */
15151 if (GET_MODE (recog_data.operand[0]) == DImode
15152 && (!body || !dep_between_memop_and_curr (body)))
15153 return true;
15154
15155 return false;
15156
15157 }
15158
15159
15160 /* Implement FINAL_PRESCAN_INSN. */
15161
15162 void
15163 aarch64_final_prescan_insn (rtx_insn *insn)
15164 {
15165 if (aarch64_madd_needs_nop (insn))
15166 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15167 }
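
/* For example, with -mfix-cortex-a53-835769 a sequence such as

       ldr  x2, [x1, #8]
       madd x0, x3, x4, x5

   is printed as

       ldr  x2, [x1, #8]
       nop  // between mem op and mult-accumulate
       madd x0, x3, x4, x5

   (the register choice here is arbitrary; only the adjacency of a memory
   access and a 64-bit multiply-accumulate matters).  */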
15168
15169
15170 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15171 instruction. */
15172
15173 bool
15174 aarch64_sve_index_immediate_p (rtx base_or_step)
15175 {
15176 return (CONST_INT_P (base_or_step)
15177 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15178 }
15179
15180 /* Return true if X is a valid immediate for the SVE ADD and SUB
15181 instructions. Negate X first if NEGATE_P is true. */
15182
15183 bool
15184 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15185 {
15186 rtx elt;
15187
15188 if (!const_vec_duplicate_p (x, &elt)
15189 || !CONST_INT_P (elt))
15190 return false;
15191
15192 HOST_WIDE_INT val = INTVAL (elt);
15193 if (negate_p)
15194 val = -val;
15195 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15196
15197 if (val & 0xff)
15198 return IN_RANGE (val, 0, 0xff);
15199 return IN_RANGE (val, 0, 0xff00);
15200 }
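
/* For example, for a .s ADD (values shown after any negation):

     #0 .. #255                 valid   (8-bit immediate, no shift)
     #256, #512, ... #65280     valid   (multiples of 256, i.e. #imm8, LSL #8)
     #257                       invalid (needs both byte positions)
     #65536                     invalid (does not fit either form)  */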
15201
15202 /* Return true if X is a valid immediate operand for an SVE logical
15203 instruction such as AND. */
15204
15205 bool
15206 aarch64_sve_bitmask_immediate_p (rtx x)
15207 {
15208 rtx elt;
15209
15210 return (const_vec_duplicate_p (x, &elt)
15211 && CONST_INT_P (elt)
15212 && aarch64_bitmask_imm (INTVAL (elt),
15213 GET_MODE_INNER (GET_MODE (x))));
15214 }
15215
15216 /* Return true if X is a valid immediate for the SVE DUP and CPY
15217 instructions. */
15218
15219 bool
15220 aarch64_sve_dup_immediate_p (rtx x)
15221 {
15222 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15223 if (!CONST_INT_P (x))
15224 return false;
15225
15226 HOST_WIDE_INT val = INTVAL (x);
15227 if (val & 0xff)
15228 return IN_RANGE (val, -0x80, 0x7f);
15229 return IN_RANGE (val, -0x8000, 0x7f00);
15230 }
15231
15232 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15233 SIGNED_P says whether the operand is signed rather than unsigned. */
15234
15235 bool
15236 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15237 {
15238 rtx elt;
15239
15240 return (const_vec_duplicate_p (x, &elt)
15241 && CONST_INT_P (elt)
15242 && (signed_p
15243 ? IN_RANGE (INTVAL (elt), -16, 15)
15244 : IN_RANGE (INTVAL (elt), 0, 127)));
15245 }
15246
15247 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15248 instruction. Negate X first if NEGATE_P is true. */
15249
15250 bool
15251 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15252 {
15253 rtx elt;
15254 REAL_VALUE_TYPE r;
15255
15256 if (!const_vec_duplicate_p (x, &elt)
15257 || GET_CODE (elt) != CONST_DOUBLE)
15258 return false;
15259
15260 r = *CONST_DOUBLE_REAL_VALUE (elt);
15261
15262 if (negate_p)
15263 r = real_value_negate (&r);
15264
15265 if (real_equal (&r, &dconst1))
15266 return true;
15267 if (real_equal (&r, &dconsthalf))
15268 return true;
15269 return false;
15270 }
15271
15272 /* Return true if X is a valid immediate operand for an SVE FMUL
15273 instruction. */
15274
15275 bool
15276 aarch64_sve_float_mul_immediate_p (rtx x)
15277 {
15278 rtx elt;
15279
15280 return (const_vec_duplicate_p (x, &elt)
15281 && GET_CODE (elt) == CONST_DOUBLE
15282 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15283 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15284 }
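
/* So the only floating-point immediates these predicated SVE forms keep are

       fadd z0.s, p0/m, z0.s, #0.5       fadd z0.s, p0/m, z0.s, #1.0
       fmul z0.s, p0/m, z0.s, #0.5       fmul z0.s, p0/m, z0.s, #2.0

   (plus the fsub equivalents); any other constant has to be broadcast into
   a vector register first.  */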
15285
15286 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15287 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15288 is nonnull, use it to describe valid immediates. */
15289 static bool
15290 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15291 simd_immediate_info *info,
15292 enum simd_immediate_check which,
15293 simd_immediate_info::insn_type insn)
15294 {
15295 /* Try a 4-byte immediate with LSL. */
15296 for (unsigned int shift = 0; shift < 32; shift += 8)
15297 if ((val32 & (0xff << shift)) == val32)
15298 {
15299 if (info)
15300 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15301 simd_immediate_info::LSL, shift);
15302 return true;
15303 }
15304
15305 /* Try a 2-byte immediate with LSL. */
15306 unsigned int imm16 = val32 & 0xffff;
15307 if (imm16 == (val32 >> 16))
15308 for (unsigned int shift = 0; shift < 16; shift += 8)
15309 if ((imm16 & (0xff << shift)) == imm16)
15310 {
15311 if (info)
15312 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15313 simd_immediate_info::LSL, shift);
15314 return true;
15315 }
15316
15317 /* Try a 4-byte immediate with MSL, except for cases that MVN
15318 can handle. */
15319 if (which == AARCH64_CHECK_MOV)
15320 for (unsigned int shift = 8; shift < 24; shift += 8)
15321 {
15322 unsigned int low = (1 << shift) - 1;
15323 if (((val32 & (0xff << shift)) | low) == val32)
15324 {
15325 if (info)
15326 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15327 simd_immediate_info::MSL, shift);
15328 return true;
15329 }
15330 }
15331
15332 return false;
15333 }
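
/* A few concrete 32-bit patterns and the encodings they allow (illustrative;
   WHICH and INSN select between the MOVI/MVNI and ORR/BIC forms):

     0x000000ab   ->  MOVI Vd.4S, #0xab
     0x0000ab00   ->  MOVI Vd.4S, #0xab, LSL #8
     0x00ab00ab   ->  MOVI Vd.8H, #0xab              (2-byte replication)
     0x0000abff   ->  MOVI Vd.4S, #0xab, MSL #8      (ones shifted in below)
     0x12345678   ->  not representable by this routine.  */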
15334
15335 /* Return true if replicating VAL64 is a valid immediate for the
15336 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15337 use it to describe valid immediates. */
15338 static bool
15339 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15340 simd_immediate_info *info,
15341 enum simd_immediate_check which)
15342 {
15343 unsigned int val32 = val64 & 0xffffffff;
15344 unsigned int val16 = val64 & 0xffff;
15345 unsigned int val8 = val64 & 0xff;
15346
15347 if (val32 == (val64 >> 32))
15348 {
15349 if ((which & AARCH64_CHECK_ORR) != 0
15350 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15351 simd_immediate_info::MOV))
15352 return true;
15353
15354 if ((which & AARCH64_CHECK_BIC) != 0
15355 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15356 simd_immediate_info::MVN))
15357 return true;
15358
15359 /* Try using a replicated byte. */
15360 if (which == AARCH64_CHECK_MOV
15361 && val16 == (val32 >> 16)
15362 && val8 == (val16 >> 8))
15363 {
15364 if (info)
15365 *info = simd_immediate_info (QImode, val8);
15366 return true;
15367 }
15368 }
15369
15370 /* Try using a bit-to-bytemask. */
15371 if (which == AARCH64_CHECK_MOV)
15372 {
15373 unsigned int i;
15374 for (i = 0; i < 64; i += 8)
15375 {
15376 unsigned char byte = (val64 >> i) & 0xff;
15377 if (byte != 0 && byte != 0xff)
15378 break;
15379 }
15380 if (i == 64)
15381 {
15382 if (info)
15383 *info = simd_immediate_info (DImode, val64);
15384 return true;
15385 }
15386 }
15387 return false;
15388 }
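
/* Beyond the replicated 32-bit forms handled above, the two 64-bit cases
   look like (illustrative):

     0xabababababababab  ->  MOVI Vd.16B, #0xab       (replicated byte)
     0x00ff0000ffff00ff  ->  MOVI Vd.2D, #...         (bit-to-bytemask: every
                                                       byte is 0x00 or 0xff)
     0x0123456789abcdef  ->  not a valid immediate; such constants are
                             materialized some other way.  */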
15389
15390 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15391 instruction. If INFO is nonnull, use it to describe valid immediates. */
15392
15393 static bool
15394 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15395 simd_immediate_info *info)
15396 {
15397 scalar_int_mode mode = DImode;
15398 unsigned int val32 = val64 & 0xffffffff;
15399 if (val32 == (val64 >> 32))
15400 {
15401 mode = SImode;
15402 unsigned int val16 = val32 & 0xffff;
15403 if (val16 == (val32 >> 16))
15404 {
15405 mode = HImode;
15406 unsigned int val8 = val16 & 0xff;
15407 if (val8 == (val16 >> 8))
15408 mode = QImode;
15409 }
15410 }
15411 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15412 if (IN_RANGE (val, -0x80, 0x7f))
15413 {
15414 /* DUP with no shift. */
15415 if (info)
15416 *info = simd_immediate_info (mode, val);
15417 return true;
15418 }
15419 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15420 {
15421 /* DUP with LSL #8. */
15422 if (info)
15423 *info = simd_immediate_info (mode, val);
15424 return true;
15425 }
15426 if (aarch64_bitmask_imm (val64, mode))
15427 {
15428 /* DUPM. */
15429 if (info)
15430 *info = simd_immediate_info (mode, val);
15431 return true;
15432 }
15433 return false;
15434 }
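
/* Examples of the three forms, for a VNx4SI (.s) constant (illustrative):

     all elements 0x0000007f  ->  DUP  z0.s, #127
     all elements 0x00001100  ->  DUP  z0.s, #17, LSL #8
     all elements 0x0000ffff  ->  DUPM z0.s, #0xffff    (bitmask immediate)
     all elements 0x12345678  ->  not valid here; loaded by other means.  */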
15435
15436 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15437 it to describe valid immediates. */
15438
15439 static bool
15440 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15441 {
15442 if (x == CONST0_RTX (GET_MODE (x)))
15443 {
15444 if (info)
15445 *info = simd_immediate_info (DImode, 0);
15446 return true;
15447 }
15448
15449 /* Analyze the value as a VNx16BImode. This should be relatively
15450 efficient, since rtx_vector_builder has enough built-in capacity
15451 to store all VLA predicate constants without needing the heap. */
15452 rtx_vector_builder builder;
15453 if (!aarch64_get_sve_pred_bits (builder, x))
15454 return false;
15455
15456 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15457 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15458 {
15459 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15460 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15461 if (pattern != AARCH64_NUM_SVPATTERNS)
15462 {
15463 if (info)
15464 {
15465 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15466 *info = simd_immediate_info (int_mode, pattern);
15467 }
15468 return true;
15469 }
15470 }
15471 return false;
15472 }
15473
15474 /* Return true if OP is a valid SIMD immediate for the operation
15475 described by WHICH. If INFO is nonnull, use it to describe valid
15476 immediates. */
15477 bool
15478 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15479 enum simd_immediate_check which)
15480 {
15481 machine_mode mode = GET_MODE (op);
15482 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15483 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15484 return false;
15485
15486 if (vec_flags & VEC_SVE_PRED)
15487 return aarch64_sve_pred_valid_immediate (op, info);
15488
15489 scalar_mode elt_mode = GET_MODE_INNER (mode);
15490 rtx base, step;
15491 unsigned int n_elts;
15492 if (GET_CODE (op) == CONST_VECTOR
15493 && CONST_VECTOR_DUPLICATE_P (op))
15494 n_elts = CONST_VECTOR_NPATTERNS (op);
15495 else if ((vec_flags & VEC_SVE_DATA)
15496 && const_vec_series_p (op, &base, &step))
15497 {
15498 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15499 if (!aarch64_sve_index_immediate_p (base)
15500 || !aarch64_sve_index_immediate_p (step))
15501 return false;
15502
15503 if (info)
15504 *info = simd_immediate_info (elt_mode, base, step);
15505 return true;
15506 }
15507 else if (GET_CODE (op) == CONST_VECTOR
15508 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15509 /* N_ELTS set above. */;
15510 else
15511 return false;
15512
15513 scalar_float_mode elt_float_mode;
15514 if (n_elts == 1
15515 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15516 {
15517 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15518 if (aarch64_float_const_zero_rtx_p (elt)
15519 || aarch64_float_const_representable_p (elt))
15520 {
15521 if (info)
15522 *info = simd_immediate_info (elt_float_mode, elt);
15523 return true;
15524 }
15525 }
15526
15527 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15528 if (elt_size > 8)
15529 return false;
15530
15531 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15532
15533 /* Expand the vector constant out into a byte vector, with the least
15534 significant byte of the register first. */
15535 auto_vec<unsigned char, 16> bytes;
15536 bytes.reserve (n_elts * elt_size);
15537 for (unsigned int i = 0; i < n_elts; i++)
15538 {
15539 /* The vector is provided in gcc endian-neutral fashion.
15540 For aarch64_be Advanced SIMD, it must be laid out in the vector
15541 register in reverse order. */
15542 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15543 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15544
15545 if (elt_mode != elt_int_mode)
15546 elt = gen_lowpart (elt_int_mode, elt);
15547
15548 if (!CONST_INT_P (elt))
15549 return false;
15550
15551 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15552 for (unsigned int byte = 0; byte < elt_size; byte++)
15553 {
15554 bytes.quick_push (elt_val & 0xff);
15555 elt_val >>= BITS_PER_UNIT;
15556 }
15557 }
15558
15559 /* The immediate must repeat every eight bytes. */
15560 unsigned int nbytes = bytes.length ();
15561 for (unsigned i = 8; i < nbytes; ++i)
15562 if (bytes[i] != bytes[i - 8])
15563 return false;
15564
15565 /* Get the repeating 8-byte value as an integer. No endian correction
15566 is needed here because bytes is already in lsb-first order. */
15567 unsigned HOST_WIDE_INT val64 = 0;
15568 for (unsigned int i = 0; i < 8; i++)
15569 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15570 << (i * BITS_PER_UNIT));
15571
15572 if (vec_flags & VEC_SVE_DATA)
15573 return aarch64_sve_valid_immediate (val64, info);
15574 else
15575 return aarch64_advsimd_valid_immediate (val64, info, which);
15576 }
15577
15578 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15579 has a step in the range of the SVE INDEX instruction. Return the step if so,
15580 otherwise return null. */
15581 rtx
15582 aarch64_check_zero_based_sve_index_immediate (rtx x)
15583 {
15584 rtx base, step;
15585 if (const_vec_series_p (x, &base, &step)
15586 && base == const0_rtx
15587 && aarch64_sve_index_immediate_p (step))
15588 return step;
15589 return NULL_RTX;
15590 }
15591
15592 /* Check if immediate shift constants are within range. */
15593 bool
15594 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15595 {
15596 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15597 if (left)
15598 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15599 else
15600 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15601 }
15602
15603 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15604 operation of width WIDTH at bit position POS. */
15605
15606 rtx
15607 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15608 {
15609 gcc_assert (CONST_INT_P (width));
15610 gcc_assert (CONST_INT_P (pos));
15611
15612 unsigned HOST_WIDE_INT mask
15613 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15614 return GEN_INT (mask << UINTVAL (pos));
15615 }
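
/* For instance, WIDTH = 8 and POS = 16 give ((1 << 8) - 1) << 16, i.e.
   0x00ff0000, the mask selected by a zero_extract of 8 bits starting at
   bit 16.  */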
15616
15617 bool
15618 aarch64_mov_operand_p (rtx x, machine_mode mode)
15619 {
15620 if (GET_CODE (x) == HIGH
15621 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15622 return true;
15623
15624 if (CONST_INT_P (x))
15625 return true;
15626
15627 if (VECTOR_MODE_P (GET_MODE (x)))
15628 {
15629 /* Require predicate constants to be VNx16BI before RA, so that we
15630 force everything to have a canonical form. */
15631 if (!lra_in_progress
15632 && !reload_completed
15633 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15634 && GET_MODE (x) != VNx16BImode)
15635 return false;
15636
15637 return aarch64_simd_valid_immediate (x, NULL);
15638 }
15639
15640 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15641 return true;
15642
15643 if (aarch64_sve_cnt_immediate_p (x))
15644 return true;
15645
15646 return aarch64_classify_symbolic_expression (x)
15647 == SYMBOL_TINY_ABSOLUTE;
15648 }
15649
15650 /* Return a const_int vector of VAL. */
15651 rtx
15652 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15653 {
15654 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15655 return gen_const_vec_duplicate (mode, c);
15656 }
15657
15658 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15659
15660 bool
15661 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15662 {
15663 machine_mode vmode;
15664
15665 vmode = aarch64_simd_container_mode (mode, 64);
15666 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15667 return aarch64_simd_valid_immediate (op_v, NULL);
15668 }
15669
15670 /* Construct and return a PARALLEL RTX vector with elements numbering the
15671 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15672 the vector - from the perspective of the architecture. This does not
15673 line up with GCC's perspective on lane numbers, so we end up with
15674 different masks depending on our target endian-ness. The diagram
15675 below may help. We must draw the distinction when building masks
15676 which select one half of the vector. An instruction selecting
15677 architectural low-lanes for a big-endian target, must be described using
15678 a mask selecting GCC high-lanes.
15679
15680 Big-Endian Little-Endian
15681
15682 GCC 0 1 2 3 3 2 1 0
15683 | x | x | x | x | | x | x | x | x |
15684 Architecture 3 2 1 0 3 2 1 0
15685
15686 Low Mask: { 2, 3 } { 0, 1 }
15687 High Mask: { 0, 1 } { 2, 3 }
15688
15689 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15690
15691 rtx
15692 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15693 {
15694 rtvec v = rtvec_alloc (nunits / 2);
15695 int high_base = nunits / 2;
15696 int low_base = 0;
15697 int base;
15698 rtx t1;
15699 int i;
15700
15701 if (BYTES_BIG_ENDIAN)
15702 base = high ? low_base : high_base;
15703 else
15704 base = high ? high_base : low_base;
15705
15706 for (i = 0; i < nunits / 2; i++)
15707 RTVEC_ELT (v, i) = GEN_INT (base + i);
15708
15709 t1 = gen_rtx_PARALLEL (mode, v);
15710 return t1;
15711 }
15712
15713 /* Check OP for validity as a PARALLEL RTX vector with elements
15714 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15715 from the perspective of the architecture. See the diagram above
15716 aarch64_simd_vect_par_cnst_half for more details. */
15717
15718 bool
15719 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15720 bool high)
15721 {
15722 int nelts;
15723 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15724 return false;
15725
15726 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15727 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15728 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15729 int i = 0;
15730
15731 if (count_op != count_ideal)
15732 return false;
15733
15734 for (i = 0; i < count_ideal; i++)
15735 {
15736 rtx elt_op = XVECEXP (op, 0, i);
15737 rtx elt_ideal = XVECEXP (ideal, 0, i);
15738
15739 if (!CONST_INT_P (elt_op)
15740 || INTVAL (elt_ideal) != INTVAL (elt_op))
15741 return false;
15742 }
15743 return true;
15744 }
15745
15746 /* Return a PARALLEL containing NELTS elements, with element I equal
15747 to BASE + I * STEP. */
15748
15749 rtx
15750 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15751 {
15752 rtvec vec = rtvec_alloc (nelts);
15753 for (unsigned int i = 0; i < nelts; ++i)
15754 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15755 return gen_rtx_PARALLEL (VOIDmode, vec);
15756 }
15757
15758 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15759 series with step STEP. */
15760
15761 bool
15762 aarch64_stepped_int_parallel_p (rtx op, int step)
15763 {
15764 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15765 return false;
15766
15767 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15768 for (int i = 1; i < XVECLEN (op, 0); ++i)
15769 if (!CONST_INT_P (XVECEXP (op, 0, i))
15770 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15771 return false;
15772
15773 return true;
15774 }
15775
15776 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15777 HIGH (exclusive). */
15778 void
15779 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15780 const_tree exp)
15781 {
15782 HOST_WIDE_INT lane;
15783 gcc_assert (CONST_INT_P (operand));
15784 lane = INTVAL (operand);
15785
15786 if (lane < low || lane >= high)
15787 {
15788 if (exp)
15789 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15790 else
15791 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15792 }
15793 }
15794
15795 /* Perform endian correction on lane number N, which indexes a vector
15796 of mode MODE, and return the result as an SImode rtx. */
15797
15798 rtx
15799 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15800 {
15801 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15802 }
15803
15804 /* Return TRUE if OP is a valid vector addressing mode. */
15805
15806 bool
15807 aarch64_simd_mem_operand_p (rtx op)
15808 {
15809 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15810 || REG_P (XEXP (op, 0)));
15811 }
15812
15813 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15814
15815 bool
15816 aarch64_sve_ld1r_operand_p (rtx op)
15817 {
15818 struct aarch64_address_info addr;
15819 scalar_mode mode;
15820
15821 return (MEM_P (op)
15822 && is_a <scalar_mode> (GET_MODE (op), &mode)
15823 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15824 && addr.type == ADDRESS_REG_IMM
15825 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15826 }
15827
15828 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15829 bool
15830 aarch64_sve_ld1rq_operand_p (rtx op)
15831 {
15832 struct aarch64_address_info addr;
15833 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15834 if (!MEM_P (op)
15835 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15836 return false;
15837
15838 if (addr.type == ADDRESS_REG_IMM)
15839 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15840
15841 if (addr.type == ADDRESS_REG_REG)
15842 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15843
15844 return false;
15845 }
15846
15847 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15848 The conditions for STR are the same. */
15849 bool
15850 aarch64_sve_ldr_operand_p (rtx op)
15851 {
15852 struct aarch64_address_info addr;
15853
15854 return (MEM_P (op)
15855 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15856 false, ADDR_QUERY_ANY)
15857 && addr.type == ADDRESS_REG_IMM);
15858 }
15859
15860 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15861 We need to be able to access the individual pieces, so the range
15862 is different from LD[234] and ST[234]. */
15863 bool
15864 aarch64_sve_struct_memory_operand_p (rtx op)
15865 {
15866 if (!MEM_P (op))
15867 return false;
15868
15869 machine_mode mode = GET_MODE (op);
15870 struct aarch64_address_info addr;
15871 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15872 ADDR_QUERY_ANY)
15873 || addr.type != ADDRESS_REG_IMM)
15874 return false;
15875
15876 poly_int64 first = addr.const_offset;
15877 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15878 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15879 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15880 }
15881
15882 /* Emit a register copy from operands[1] to operands[0], taking care not to
15883 early-clobber source registers in the process.
15884
15885 COUNT is the number of components into which the copy needs to be
15886 decomposed. */
15887 void
15888 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15889 unsigned int count)
15890 {
15891 unsigned int i;
15892 int rdest = REGNO (operands[0]);
15893 int rsrc = REGNO (operands[1]);
15894
15895 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15896 || rdest < rsrc)
15897 for (i = 0; i < count; i++)
15898 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15899 gen_rtx_REG (mode, rsrc + i));
15900 else
15901 for (i = 0; i < count; i++)
15902 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15903 gen_rtx_REG (mode, rsrc + count - i - 1));
15904 }
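
/* For example, copying an OImode (two-vector) value from v1:v2 to v2:v3 has
   to be emitted highest register first,

       mov v3.16b, v2.16b
       mov v2.16b, v1.16b

   whereas the copy from v2:v3 to v1:v2 can be emitted in forward order; the
   loop above chooses the direction from the relative register numbers.  */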
15905
15906 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15907 one of VSTRUCT modes: OI, CI, or XI. */
15908 int
15909 aarch64_simd_attr_length_rglist (machine_mode mode)
15910 {
15911 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15912 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15913 }
15914
15915 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15916 alignment of a vector to 128 bits. SVE predicates have an alignment of
15917 16 bits. */
15918 static HOST_WIDE_INT
15919 aarch64_simd_vector_alignment (const_tree type)
15920 {
15921 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15922 be set for non-predicate vectors of booleans. Modes are the most
15923 direct way we have of identifying real SVE predicate types. */
15924 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
15925 return 16;
15926 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15927 return 128;
15928 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15929 }
15930
15931 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15932 static poly_uint64
15933 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15934 {
15935 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15936 {
15937 /* If the length of the vector is fixed, try to align to that length,
15938 otherwise don't try to align at all. */
15939 HOST_WIDE_INT result;
15940 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15941 result = TYPE_ALIGN (TREE_TYPE (type));
15942 return result;
15943 }
15944 return TYPE_ALIGN (type);
15945 }
15946
15947 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15948 static bool
15949 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15950 {
15951 if (is_packed)
15952 return false;
15953
15954 /* For fixed-length vectors, check that the vectorizer will aim for
15955 full-vector alignment. This isn't true for generic GCC vectors
15956 that are wider than the ABI maximum of 128 bits. */
15957 poly_uint64 preferred_alignment =
15958 aarch64_vectorize_preferred_vector_alignment (type);
15959 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15960 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15961 preferred_alignment))
15962 return false;
15963
15964 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15965 return true;
15966 }
15967
15968 /* Return true if the vector misalignment factor is supported by the
15969 target. */
15970 static bool
15971 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15972 const_tree type, int misalignment,
15973 bool is_packed)
15974 {
15975 if (TARGET_SIMD && STRICT_ALIGNMENT)
15976 {
15977 /* Return false if the movmisalign pattern is not supported for this mode. */
15978 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15979 return false;
15980
15981 /* Misalignment factor is unknown at compile time. */
15982 if (misalignment == -1)
15983 return false;
15984 }
15985 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15986 is_packed);
15987 }
15988
15989 /* If VALS is a vector constant that can be loaded into a register
15990 using DUP, generate instructions to do so and return an RTX to
15991 assign to the register. Otherwise return NULL_RTX. */
15992 static rtx
15993 aarch64_simd_dup_constant (rtx vals)
15994 {
15995 machine_mode mode = GET_MODE (vals);
15996 machine_mode inner_mode = GET_MODE_INNER (mode);
15997 rtx x;
15998
15999 if (!const_vec_duplicate_p (vals, &x))
16000 return NULL_RTX;
16001
16002 /* We can load this constant by using DUP and a constant in a
16003 single ARM register. This will be cheaper than a vector
16004 load. */
16005 x = copy_to_mode_reg (inner_mode, x);
16006 return gen_vec_duplicate (mode, x);
16007 }
16008
16009
16010 /* Generate code to load VALS, which is a PARALLEL containing only
16011 constants (for vec_init) or CONST_VECTOR, efficiently into a
16012 register. Returns an RTX to copy into the register, or NULL_RTX
16013 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16014 static rtx
16015 aarch64_simd_make_constant (rtx vals)
16016 {
16017 machine_mode mode = GET_MODE (vals);
16018 rtx const_dup;
16019 rtx const_vec = NULL_RTX;
16020 int n_const = 0;
16021 int i;
16022
16023 if (GET_CODE (vals) == CONST_VECTOR)
16024 const_vec = vals;
16025 else if (GET_CODE (vals) == PARALLEL)
16026 {
16027 /* A CONST_VECTOR must contain only CONST_INTs and
16028 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16029 Only store valid constants in a CONST_VECTOR. */
16030 int n_elts = XVECLEN (vals, 0);
16031 for (i = 0; i < n_elts; ++i)
16032 {
16033 rtx x = XVECEXP (vals, 0, i);
16034 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16035 n_const++;
16036 }
16037 if (n_const == n_elts)
16038 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16039 }
16040 else
16041 gcc_unreachable ();
16042
16043 if (const_vec != NULL_RTX
16044 && aarch64_simd_valid_immediate (const_vec, NULL))
16045 /* Load using MOVI/MVNI. */
16046 return const_vec;
16047 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16048 /* Loaded using DUP. */
16049 return const_dup;
16050 else if (const_vec != NULL_RTX)
16051 /* Load from constant pool. We cannot take advantage of single-cycle
16052 LD1 because we need a PC-relative addressing mode. */
16053 return const_vec;
16054 else
16055 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16056 We cannot construct an initializer. */
16057 return NULL_RTX;
16058 }
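
/* Illustrative summary (added for exposition; the exact assembly depends on
   the mode and on aarch64_simd_valid_immediate): for V4SImode the three
   outcomes above roughly correspond to

     { 1, 1, 1, 1 }                  ->  movi  v0.4s, #1        (valid immediate)
     { x, x, x, x }, x not encodable ->  mov   w0, #x
                                         dup   v0.4s, w0        (DUP path)
     { 1, 2, 3, 4 }                  ->  ldr   q0, .LCn         (constant pool)  */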
16059
16060 /* Expand a vector initialisation sequence, such that TARGET is
16061 initialised to contain VALS. */
16062
16063 void
16064 aarch64_expand_vector_init (rtx target, rtx vals)
16065 {
16066 machine_mode mode = GET_MODE (target);
16067 scalar_mode inner_mode = GET_MODE_INNER (mode);
16068 /* The number of vector elements. */
16069 int n_elts = XVECLEN (vals, 0);
16070 /* The number of vector elements which are not constant. */
16071 int n_var = 0;
16072 rtx any_const = NULL_RTX;
16073 /* The first element of vals. */
16074 rtx v0 = XVECEXP (vals, 0, 0);
16075 bool all_same = true;
16076
16077 /* This is a special vec_init<M><N> where N is not an element mode but a
16078 vector mode with half the elements of M. We expect to find two entries
16079 of mode N in VALS and we must put their concatenation into TARGET. */
16080 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16081 {
16082 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16083 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16084 rtx lo = XVECEXP (vals, 0, 0);
16085 rtx hi = XVECEXP (vals, 0, 1);
16086 machine_mode narrow_mode = GET_MODE (lo);
16087 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16088 gcc_assert (narrow_mode == GET_MODE (hi));
16089
16090 /* When we want to concatenate a half-width vector with zeroes we can
16091 use the aarch64_combinez[_be] patterns. Just make sure that the
16092 zeroes are in the right half. */
16093 if (BYTES_BIG_ENDIAN
16094 && aarch64_simd_imm_zero (lo, narrow_mode)
16095 && general_operand (hi, narrow_mode))
16096 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16097 else if (!BYTES_BIG_ENDIAN
16098 && aarch64_simd_imm_zero (hi, narrow_mode)
16099 && general_operand (lo, narrow_mode))
16100 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16101 else
16102 {
16103 /* Else create the two half-width registers and combine them. */
16104 if (!REG_P (lo))
16105 lo = force_reg (GET_MODE (lo), lo);
16106 if (!REG_P (hi))
16107 hi = force_reg (GET_MODE (hi), hi);
16108
16109 if (BYTES_BIG_ENDIAN)
16110 std::swap (lo, hi);
16111 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16112 }
16113 return;
16114 }
16115
16116 /* Count the number of variable elements to initialise. */
16117 for (int i = 0; i < n_elts; ++i)
16118 {
16119 rtx x = XVECEXP (vals, 0, i);
16120 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16121 ++n_var;
16122 else
16123 any_const = x;
16124
16125 all_same &= rtx_equal_p (x, v0);
16126 }
16127
16128 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16129 how best to handle this. */
16130 if (n_var == 0)
16131 {
16132 rtx constant = aarch64_simd_make_constant (vals);
16133 if (constant != NULL_RTX)
16134 {
16135 emit_move_insn (target, constant);
16136 return;
16137 }
16138 }
16139
16140 /* Splat a single non-constant element if we can. */
16141 if (all_same)
16142 {
16143 rtx x = copy_to_mode_reg (inner_mode, v0);
16144 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16145 return;
16146 }
16147
16148 enum insn_code icode = optab_handler (vec_set_optab, mode);
16149 gcc_assert (icode != CODE_FOR_nothing);
16150
16151 /* If there are only variable elements, try to optimize
16152 the insertion using dup for the most common element
16153 followed by insertions. */
16154
16155 /* The algorithm will fill matches[*][0] with the earliest matching element,
16156 and matches[X][1] with the count of duplicate elements (if X is the
16157 earliest element which has duplicates). */
16158
16159 if (n_var == n_elts && n_elts <= 16)
16160 {
16161 int matches[16][2] = {0};
16162 for (int i = 0; i < n_elts; i++)
16163 {
16164 for (int j = 0; j <= i; j++)
16165 {
16166 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16167 {
16168 matches[i][0] = j;
16169 matches[j][1]++;
16170 break;
16171 }
16172 }
16173 }
16174 int maxelement = 0;
16175 int maxv = 0;
16176 for (int i = 0; i < n_elts; i++)
16177 if (matches[i][1] > maxv)
16178 {
16179 maxelement = i;
16180 maxv = matches[i][1];
16181 }
16182
16183 /* Create a duplicate of the most common element, unless all elements
16184 are equally useless to us, in which case just immediately set the
16185 vector register using the first element. */
16186
16187 if (maxv == 1)
16188 {
16189 /* For vectors of two 64-bit elements, we can do even better. */
16190 if (n_elts == 2
16191 && (inner_mode == E_DImode
16192 || inner_mode == E_DFmode))
16193
16194 {
16195 rtx x0 = XVECEXP (vals, 0, 0);
16196 rtx x1 = XVECEXP (vals, 0, 1);
16197 /* Combine can pick up this case, but handling it directly
16198 here leaves clearer RTL.
16199
16200 This is load_pair_lanes<mode>, and also gives us a clean-up
16201 for store_pair_lanes<mode>. */
16202 if (memory_operand (x0, inner_mode)
16203 && memory_operand (x1, inner_mode)
16204 && !STRICT_ALIGNMENT
16205 && rtx_equal_p (XEXP (x1, 0),
16206 plus_constant (Pmode,
16207 XEXP (x0, 0),
16208 GET_MODE_SIZE (inner_mode))))
16209 {
16210 rtx t;
16211 if (inner_mode == DFmode)
16212 t = gen_load_pair_lanesdf (target, x0, x1);
16213 else
16214 t = gen_load_pair_lanesdi (target, x0, x1);
16215 emit_insn (t);
16216 return;
16217 }
16218 }
16219 /* The subreg-move sequence below will move into lane zero of the
16220 vector register. For big-endian we want that position to hold
16221 the last element of VALS. */
16222 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16223 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16224 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16225 }
16226 else
16227 {
16228 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16229 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16230 }
16231
16232 /* Insert the rest. */
16233 for (int i = 0; i < n_elts; i++)
16234 {
16235 rtx x = XVECEXP (vals, 0, i);
16236 if (matches[i][0] == maxelement)
16237 continue;
16238 x = copy_to_mode_reg (inner_mode, x);
16239 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16240 }
16241 return;
16242 }
16243
16244 /* Initialise a vector which is part-variable. We want to first try
16245 to build those lanes which are constant in the most efficient way we
16246 can. */
16247 if (n_var != n_elts)
16248 {
16249 rtx copy = copy_rtx (vals);
16250
16251 /* Load constant part of vector. We really don't care what goes into the
16252 parts we will overwrite, but we're more likely to be able to load the
16253 constant efficiently if it has fewer, larger, repeating parts
16254 (see aarch64_simd_valid_immediate). */
16255 for (int i = 0; i < n_elts; i++)
16256 {
16257 rtx x = XVECEXP (vals, 0, i);
16258 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16259 continue;
16260 rtx subst = any_const;
16261 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16262 {
16263 /* Look in the copied vector, as earlier iterations may already have made more of its elements constant. */
16264 rtx test = XVECEXP (copy, 0, i ^ bit);
16265 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16266 {
16267 subst = test;
16268 break;
16269 }
16270 }
16271 XVECEXP (copy, 0, i) = subst;
16272 }
16273 aarch64_expand_vector_init (target, copy);
16274 }
16275
16276 /* Insert the variable lanes directly. */
16277 for (int i = 0; i < n_elts; i++)
16278 {
16279 rtx x = XVECEXP (vals, 0, i);
16280 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16281 continue;
16282 x = copy_to_mode_reg (inner_mode, x);
16283 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16284 }
16285 }
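
/* Worked example (illustrative only): initialising a V4SI vector from
   { a, b, a, a }, with a and b non-constant, takes the "dup the most common
   element, then insert the rest" path above and emits roughly

     dup	v0.4s, w_a
     ins	v0.s[1], w_b

   where w_a and w_b stand for whatever registers hold a and b.  A fully
   constant vector would instead go through aarch64_simd_make_constant, and a
   fully duplicated one through the all_same DUP path.  */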
16286
16287 /* Emit RTL corresponding to:
16288 insr TARGET, ELEM. */
16289
16290 static void
16291 emit_insr (rtx target, rtx elem)
16292 {
16293 machine_mode mode = GET_MODE (target);
16294 scalar_mode elem_mode = GET_MODE_INNER (mode);
16295 elem = force_reg (elem_mode, elem);
16296
16297 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16298 gcc_assert (icode != CODE_FOR_nothing);
16299 emit_insn (GEN_FCN (icode) (target, target, elem));
16300 }
16301
16302 /* Subroutine of aarch64_sve_expand_vector_init for handling
16303 trailing constants.
16304 This function works as follows:
16305 (a) Create a new vector consisting of trailing constants.
16306 (b) Initialize TARGET with the constant vector using emit_move_insn.
16307 (c) Insert remaining elements in TARGET using insr.
16308 NELTS is the total number of elements in the original vector, while
16309 NELTS_REQD is the number of elements that are actually
16310 significant.
16311
16312 ??? The heuristic used is to do the above only if the number of constants
16313 is at least half the total number of elements. May need fine-tuning. */
16314
16315 static bool
16316 aarch64_sve_expand_vector_init_handle_trailing_constants
16317 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16318 {
16319 machine_mode mode = GET_MODE (target);
16320 scalar_mode elem_mode = GET_MODE_INNER (mode);
16321 int n_trailing_constants = 0;
16322
16323 for (int i = nelts_reqd - 1;
16324 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16325 i--)
16326 n_trailing_constants++;
16327
16328 if (n_trailing_constants >= nelts_reqd / 2)
16329 {
16330 rtx_vector_builder v (mode, 1, nelts);
16331 for (int i = 0; i < nelts; i++)
16332 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16333 rtx const_vec = v.build ();
16334 emit_move_insn (target, const_vec);
16335
16336 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16337 emit_insr (target, builder.elt (i));
16338
16339 return true;
16340 }
16341
16342 return false;
16343 }
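
/* Worked example (illustrative): for BUILDER = { a, b, 1, 2 } with
   NELTS_REQD == 4, the two trailing constants meet the "at least half"
   threshold, so TARGET is first set to a constant vector whose low elements
   are { 1, 2, ... } and the variable elements are then shifted in from the
   front:

     insr	z0.s, w_b
     insr	z0.s, w_a

   leaving { a, b, 1, 2 } in the first four elements.  Register names here
   are invented for the example.  */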
16344
16345 /* Subroutine of aarch64_sve_expand_vector_init.
16346 Works as follows:
16347 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16348 (b) Skip trailing elements from BUILDER, which are the same as
16349 element NELTS_REQD - 1.
16350 (c) Insert earlier elements in reverse order in TARGET using insr. */
16351
16352 static void
16353 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16354 const rtx_vector_builder &builder,
16355 int nelts_reqd)
16356 {
16357 machine_mode mode = GET_MODE (target);
16358 scalar_mode elem_mode = GET_MODE_INNER (mode);
16359
16360 struct expand_operand ops[2];
16361 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16362 gcc_assert (icode != CODE_FOR_nothing);
16363
16364 create_output_operand (&ops[0], target, mode);
16365 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16366 expand_insn (icode, 2, ops);
16367
16368 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16369 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16370 emit_insr (target, builder.elt (i));
16371 }
16372
16373 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16374 when all trailing elements of BUILDER are the same.
16375 This works as follows:
16376 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16377 (b) Insert remaining elements in TARGET using insr.
16378
16379 ??? The heuristic used is to do the above if the number of identical
16380 trailing elements is at least 3/4 of the total number of elements,
16381 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16382
16383 static bool
16384 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16385 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16386 {
16387 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16388 if (ndups >= (3 * nelts_reqd) / 4)
16389 {
16390 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16391 nelts_reqd - ndups + 1);
16392 return true;
16393 }
16394
16395 return false;
16396 }
16397
16398 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16399 of elements in BUILDER.
16400
16401 The function tries to initialize TARGET from BUILDER if it fits one
16402 of the special cases outlined below.
16403
16404 Failing that, the function divides BUILDER into two sub-vectors:
16405 v_even = even elements of BUILDER;
16406 v_odd = odd elements of BUILDER;
16407
16408 and recursively calls itself with v_even and v_odd.
16409
16410 if (recursive call succeeded for v_even or v_odd)
16411 TARGET = zip (v_even, v_odd)
16412
16413 The function returns true if it managed to build TARGET from BUILDER
16414 with one of the special cases, false otherwise.
16415
16416 Example: {a, 1, b, 2, c, 3, d, 4}
16417
16418 The vector gets divided into:
16419 v_even = {a, b, c, d}
16420 v_odd = {1, 2, 3, 4}
16421
16422 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16423 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16424
16425 aarch64_sve_expand_vector_init(v_even) fails since v_even matches none of
16426 the special cases and is too small to recurse, so we construct tmp1 from v_even using insr:
16427 tmp1 = dup(d)
16428 insr tmp1, c
16429 insr tmp1, b
16430 insr tmp1, a
16431
16432 And finally:
16433 TARGET = zip (tmp1, tmp2)
16434 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16435
16436 static bool
16437 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16438 int nelts, int nelts_reqd)
16439 {
16440 machine_mode mode = GET_MODE (target);
16441
16442 /* Case 1: Vector contains trailing constants. */
16443
16444 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16445 (target, builder, nelts, nelts_reqd))
16446 return true;
16447
16448 /* Case 2: Vector contains leading constants. */
16449
16450 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16451 for (int i = 0; i < nelts_reqd; i++)
16452 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16453 rev_builder.finalize ();
16454
16455 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16456 (target, rev_builder, nelts, nelts_reqd))
16457 {
16458 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16459 return true;
16460 }
16461
16462 /* Case 3: Vector contains trailing same element. */
16463
16464 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16465 (target, builder, nelts_reqd))
16466 return true;
16467
16468 /* Case 4: Vector contains leading same element. */
16469
16470 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16471 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16472 {
16473 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16474 return true;
16475 }
16476
16477 /* Avoid recursing below 4-elements.
16478 ??? The threshold 4 may need fine-tuning. */
16479
16480 if (nelts_reqd <= 4)
16481 return false;
16482
16483 rtx_vector_builder v_even (mode, 1, nelts);
16484 rtx_vector_builder v_odd (mode, 1, nelts);
16485
16486 for (int i = 0; i < nelts * 2; i += 2)
16487 {
16488 v_even.quick_push (builder.elt (i));
16489 v_odd.quick_push (builder.elt (i + 1));
16490 }
16491
16492 v_even.finalize ();
16493 v_odd.finalize ();
16494
16495 rtx tmp1 = gen_reg_rtx (mode);
16496 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16497 nelts, nelts_reqd / 2);
16498
16499 rtx tmp2 = gen_reg_rtx (mode);
16500 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16501 nelts, nelts_reqd / 2);
16502
16503 if (!did_even_p && !did_odd_p)
16504 return false;
16505
16506 /* Initialize tmp1 and/or tmp2 with INSR for whichever of v_even/v_odd did
16507 not match any of the special cases, then zip them into TARGET. */
16508
16509 if (!did_even_p)
16510 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16511
16512 if (!did_odd_p)
16513 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16514
16515 rtvec v = gen_rtvec (2, tmp1, tmp2);
16516 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16517 return true;
16518 }
16519
16520 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16521
16522 void
16523 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16524 {
16525 machine_mode mode = GET_MODE (target);
16526 int nelts = XVECLEN (vals, 0);
16527
16528 rtx_vector_builder v (mode, 1, nelts);
16529 for (int i = 0; i < nelts; i++)
16530 v.quick_push (XVECEXP (vals, 0, i));
16531 v.finalize ();
16532
16533 /* If neither of the sub-vectors of v could be initialized specially,
16534 then use INSR to insert all elements from v into TARGET.
16535 ??? This might not be optimal for vectors with large
16536 initializers like 16-element or above.
16537 For nelts < 4, it probably isn't useful to handle specially. */
16538
16539 if (nelts < 4
16540 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16541 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16542 }
16543
16544 /* Check whether VALUE is a vector constant in which every element
16545 is either a power of 2 or a negated power of 2. If so, return
16546 a constant vector of log2s, and flip CODE between PLUS and MINUS
16547 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16548
16549 static rtx
16550 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16551 {
16552 if (GET_CODE (value) != CONST_VECTOR)
16553 return NULL_RTX;
16554
16555 rtx_vector_builder builder;
16556 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16557 return NULL_RTX;
16558
16559 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16560 /* 1 if the result of the multiplication must be negated,
16561 0 if it mustn't, or -1 if we don't yet care. */
16562 int negate = -1;
16563 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16564 for (unsigned int i = 0; i < encoded_nelts; ++i)
16565 {
16566 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16567 if (!CONST_SCALAR_INT_P (elt))
16568 return NULL_RTX;
16569 rtx_mode_t val (elt, int_mode);
16570 wide_int pow2 = wi::neg (val);
16571 if (val != pow2)
16572 {
16573 /* It matters whether we negate or not. Make that choice,
16574 and make sure that it's consistent with previous elements. */
16575 if (negate == !wi::neg_p (val))
16576 return NULL_RTX;
16577 negate = wi::neg_p (val);
16578 if (!negate)
16579 pow2 = val;
16580 }
16581 /* POW2 is now the value that we want to be a power of 2. */
16582 int shift = wi::exact_log2 (pow2);
16583 if (shift < 0)
16584 return NULL_RTX;
16585 builder.quick_push (gen_int_mode (shift, int_mode));
16586 }
16587 if (negate == -1)
16588 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16589 code = PLUS;
16590 else if (negate == 1)
16591 code = code == PLUS ? MINUS : PLUS;
16592 return builder.build ();
16593 }
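
/* Worked example (illustrative): a multiplier of { 4, 4, 4, 4 } becomes the
   shift vector { 2, 2, 2, 2 } with CODE left unchanged, while { -4, -4, ... }
   also becomes { 2, 2, ... } but flips CODE (PLUS <-> MINUS), so that
   x * -4 + y can be emitted as y - (x << 2).  */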
16594
16595 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16596 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16597 operands array, in the same order as for fma_optab. Return true if
16598 the function emitted all the necessary instructions, false if the caller
16599 should generate the pattern normally with the new OPERANDS array. */
16600
16601 bool
16602 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16603 {
16604 machine_mode mode = GET_MODE (operands[0]);
16605 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16606 {
16607 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16608 NULL_RTX, true, OPTAB_DIRECT);
16609 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16610 operands[3], product, operands[0], true,
16611 OPTAB_DIRECT);
16612 return true;
16613 }
16614 operands[2] = force_reg (mode, operands[2]);
16615 return false;
16616 }
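
/* Usage sketch (illustrative; register numbers are invented): with the shift
   conversion above, an integer SVE fma of the form acc + x * 8 expands to a
   shift followed by an ordinary add rather than a MUL/MLA sequence, roughly:

     lsl	z1.s, z1.s, #3
     add	z0.s, z0.s, z1.s  */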
16617
16618 /* Likewise, but for a conditional pattern. */
16619
16620 bool
16621 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16622 {
16623 machine_mode mode = GET_MODE (operands[0]);
16624 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16625 {
16626 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16627 NULL_RTX, true, OPTAB_DIRECT);
16628 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16629 operands[4], product, operands[5]));
16630 return true;
16631 }
16632 operands[3] = force_reg (mode, operands[3]);
16633 return false;
16634 }
16635
16636 static unsigned HOST_WIDE_INT
16637 aarch64_shift_truncation_mask (machine_mode mode)
16638 {
16639 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16640 return 0;
16641 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16642 }
16643
16644 /* Select a format to encode pointers in exception handling data. */
16645 int
16646 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16647 {
16648 int type;
16649 switch (aarch64_cmodel)
16650 {
16651 case AARCH64_CMODEL_TINY:
16652 case AARCH64_CMODEL_TINY_PIC:
16653 case AARCH64_CMODEL_SMALL:
16654 case AARCH64_CMODEL_SMALL_PIC:
16655 case AARCH64_CMODEL_SMALL_SPIC:
16656 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16657 for everything. */
16658 type = DW_EH_PE_sdata4;
16659 break;
16660 default:
16661 /* No assumptions here. 8-byte relocs required. */
16662 type = DW_EH_PE_sdata8;
16663 break;
16664 }
16665 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16666 }
16667
16668 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16669
16670 static void
16671 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16672 {
16673 if (aarch64_simd_decl_p (decl))
16674 {
16675 fprintf (stream, "\t.variant_pcs\t");
16676 assemble_name (stream, name);
16677 fprintf (stream, "\n");
16678 }
16679 }
16680
16681 /* The last .arch and .tune assembly strings that we printed. */
16682 static std::string aarch64_last_printed_arch_string;
16683 static std::string aarch64_last_printed_tune_string;
16684
16685 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16686 by the function fndecl. */
16687
16688 void
16689 aarch64_declare_function_name (FILE *stream, const char* name,
16690 tree fndecl)
16691 {
16692 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16693
16694 struct cl_target_option *targ_options;
16695 if (target_parts)
16696 targ_options = TREE_TARGET_OPTION (target_parts);
16697 else
16698 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16699 gcc_assert (targ_options);
16700
16701 const struct processor *this_arch
16702 = aarch64_get_arch (targ_options->x_explicit_arch);
16703
16704 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16705 std::string extension
16706 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16707 this_arch->flags);
16708 /* Only update the assembler .arch string if it is distinct from the last
16709 such string we printed. */
16710 std::string to_print = this_arch->name + extension;
16711 if (to_print != aarch64_last_printed_arch_string)
16712 {
16713 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16714 aarch64_last_printed_arch_string = to_print;
16715 }
16716
16717 /* Print the cpu name we're tuning for in the comments; it might be
16718 useful to readers of the generated asm. Do it only when it changes
16719 from function to function and verbose assembly is requested. */
16720 const struct processor *this_tune
16721 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16722
16723 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16724 {
16725 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16726 this_tune->name);
16727 aarch64_last_printed_tune_string = this_tune->name;
16728 }
16729
16730 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16731
16732 /* Don't forget the type directive for ELF. */
16733 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16734 ASM_OUTPUT_LABEL (stream, name);
16735 }
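
/* Illustrative output (added for exposition; the exact directives depend on
   the selected architecture, PCS and -dA): for a vector-PCS function foo on
   an armv8.2-a target this hook prints something along the lines of

	.arch	armv8.2-a+fp16
	// .tune cortex-a75
	.variant_pcs	foo
	.type	foo, %function
     foo:

   where the .tune comment appears only with verbose assembly and the
   .variant_pcs line only for aarch64_vector_pcs functions.  */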
16736
16737 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16738
16739 void
16740 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16741 {
16742 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16743 const char *value = IDENTIFIER_POINTER (target);
16744 aarch64_asm_output_variant_pcs (stream, decl, name);
16745 ASM_OUTPUT_DEF (stream, name, value);
16746 }
16747
16748 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16749 function symbol references. */
16750
16751 void
16752 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16753 {
16754 default_elf_asm_output_external (stream, decl, name);
16755 aarch64_asm_output_variant_pcs (stream, decl, name);
16756 }
16757
16758 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16759 Used to output the .cfi_b_key_frame directive when signing the current
16760 function with the B key. */
16761
16762 void
16763 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16764 {
16765 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16766 && aarch64_ra_sign_key == AARCH64_KEY_B)
16767 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16768 }
16769
16770 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16771
16772 static void
16773 aarch64_start_file (void)
16774 {
16775 struct cl_target_option *default_options
16776 = TREE_TARGET_OPTION (target_option_default_node);
16777
16778 const struct processor *default_arch
16779 = aarch64_get_arch (default_options->x_explicit_arch);
16780 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16781 std::string extension
16782 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16783 default_arch->flags);
16784
16785 aarch64_last_printed_arch_string = default_arch->name + extension;
16786 aarch64_last_printed_tune_string = "";
16787 asm_fprintf (asm_out_file, "\t.arch %s\n",
16788 aarch64_last_printed_arch_string.c_str ());
16789
16790 default_file_start ();
16791 }
16792
16793 /* Emit load exclusive. */
16794
16795 static void
16796 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16797 rtx mem, rtx model_rtx)
16798 {
16799 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16800 }
16801
16802 /* Emit store exclusive. */
16803
16804 static void
16805 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16806 rtx rval, rtx mem, rtx model_rtx)
16807 {
16808 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16809 }
16810
16811 /* Mark the previous jump instruction as unlikely. */
16812
16813 static void
16814 aarch64_emit_unlikely_jump (rtx insn)
16815 {
16816 rtx_insn *jump = emit_jump_insn (insn);
16817 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16818 }
16819
16820 /* Expand a compare and swap pattern. */
16821
16822 void
16823 aarch64_expand_compare_and_swap (rtx operands[])
16824 {
16825 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16826 machine_mode mode, r_mode;
16827
16828 bval = operands[0];
16829 rval = operands[1];
16830 mem = operands[2];
16831 oldval = operands[3];
16832 newval = operands[4];
16833 is_weak = operands[5];
16834 mod_s = operands[6];
16835 mod_f = operands[7];
16836 mode = GET_MODE (mem);
16837
16838 /* Normally the succ memory model must be stronger than fail, but in the
16839 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16840 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16841 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16842 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16843 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16844
16845 r_mode = mode;
16846 if (mode == QImode || mode == HImode)
16847 {
16848 r_mode = SImode;
16849 rval = gen_reg_rtx (r_mode);
16850 }
16851
16852 if (TARGET_LSE)
16853 {
16854 /* The CAS insn requires oldval and rval overlap, but we need to
16855 have a copy of oldval saved across the operation to tell if
16856 the operation is successful. */
16857 if (reg_overlap_mentioned_p (rval, oldval))
16858 rval = copy_to_mode_reg (r_mode, oldval);
16859 else
16860 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16861
16862 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16863 newval, mod_s));
16864 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16865 }
16866 else
16867 {
16868 /* The oldval predicate varies by mode. Test it and force to reg. */
16869 insn_code code = code_for_aarch64_compare_and_swap (mode);
16870 if (!insn_data[code].operand[2].predicate (oldval, mode))
16871 oldval = force_reg (mode, oldval);
16872
16873 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16874 is_weak, mod_s, mod_f));
16875 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16876 }
16877
16878 if (r_mode != mode)
16879 rval = gen_lowpart (mode, rval);
16880 emit_move_insn (operands[1], rval);
16881
16882 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16883 emit_insn (gen_rtx_SET (bval, x));
16884 }
16885
16886 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16887 sequence implementing an atomic operation. */
16888
16889 static void
16890 aarch64_emit_post_barrier (enum memmodel model)
16891 {
16892 const enum memmodel base_model = memmodel_base (model);
16893
16894 if (is_mm_sync (model)
16895 && (base_model == MEMMODEL_ACQUIRE
16896 || base_model == MEMMODEL_ACQ_REL
16897 || base_model == MEMMODEL_SEQ_CST))
16898 {
16899 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16900 }
16901 }
16902
16903 /* Split a compare and swap pattern. */
16904
16905 void
16906 aarch64_split_compare_and_swap (rtx operands[])
16907 {
16908 rtx rval, mem, oldval, newval, scratch;
16909 machine_mode mode;
16910 bool is_weak;
16911 rtx_code_label *label1, *label2;
16912 rtx x, cond;
16913 enum memmodel model;
16914 rtx model_rtx;
16915
16916 rval = operands[0];
16917 mem = operands[1];
16918 oldval = operands[2];
16919 newval = operands[3];
16920 is_weak = (operands[4] != const0_rtx);
16921 model_rtx = operands[5];
16922 scratch = operands[7];
16923 mode = GET_MODE (mem);
16924 model = memmodel_from_int (INTVAL (model_rtx));
16925
16926 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16927 loop:
16928 .label1:
16929 LD[A]XR rval, [mem]
16930 CBNZ rval, .label2
16931 ST[L]XR scratch, newval, [mem]
16932 CBNZ scratch, .label1
16933 .label2:
16934 CMP rval, 0. */
16935 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16936
16937 label1 = NULL;
16938 if (!is_weak)
16939 {
16940 label1 = gen_label_rtx ();
16941 emit_label (label1);
16942 }
16943 label2 = gen_label_rtx ();
16944
16945 /* The initial load can be relaxed for a __sync operation since a final
16946 barrier will be emitted to stop code hoisting. */
16947 if (is_mm_sync (model))
16948 aarch64_emit_load_exclusive (mode, rval, mem,
16949 GEN_INT (MEMMODEL_RELAXED));
16950 else
16951 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16952
16953 if (strong_zero_p)
16954 {
16955 if (aarch64_track_speculation)
16956 {
16957 /* Emit an explicit compare instruction, so that we can correctly
16958 track the condition codes. */
16959 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16960 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16961 }
16962 else
16963 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16964
16965 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16966 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16967 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16968 }
16969 else
16970 {
16971 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16972 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16973 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16974 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16975 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16976 }
16977
16978 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16979
16980 if (!is_weak)
16981 {
16982 if (aarch64_track_speculation)
16983 {
16984 /* Emit an explicit compare instruction, so that we can correctly
16985 track the condition codes. */
16986 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16987 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16988 }
16989 else
16990 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16991
16992 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16993 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16994 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16995 }
16996 else
16997 {
16998 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16999 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
17000 emit_insn (gen_rtx_SET (cond, x));
17001 }
17002
17003 emit_label (label2);
17004 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
17005 to set the condition flags. If this is not used it will be removed by
17006 later passes. */
17007 if (strong_zero_p)
17008 {
17009 cond = gen_rtx_REG (CCmode, CC_REGNUM);
17010 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
17011 emit_insn (gen_rtx_SET (cond, x));
17012 }
17013 /* Emit any final barrier needed for a __sync operation. */
17014 if (is_mm_sync (model))
17015 aarch64_emit_post_barrier (model);
17016 }
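
/* Illustrative expansion (operand names invented): for a strong compare and
   swap where OLDVAL is not the constant zero, the split above produces a loop
   of roughly this shape, with the acquire/release suffixes chosen from MODEL:

     .label1:
	ld[a]xr	rval, [mem]
	cmp	rval, oldval
	b.ne	.label2
	st[l]xr	scratch, newval, [mem]
	cbnz	scratch, .label1
     .label2:  */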
17017
17018 /* Split an atomic operation. */
17019
17020 void
17021 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17022 rtx value, rtx model_rtx, rtx cond)
17023 {
17024 machine_mode mode = GET_MODE (mem);
17025 machine_mode wmode = (mode == DImode ? DImode : SImode);
17026 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17027 const bool is_sync = is_mm_sync (model);
17028 rtx_code_label *label;
17029 rtx x;
17030
17031 /* Split the atomic operation into a sequence. */
17032 label = gen_label_rtx ();
17033 emit_label (label);
17034
17035 if (new_out)
17036 new_out = gen_lowpart (wmode, new_out);
17037 if (old_out)
17038 old_out = gen_lowpart (wmode, old_out);
17039 else
17040 old_out = new_out;
17041 value = simplify_gen_subreg (wmode, value, mode, 0);
17042
17043 /* The initial load can be relaxed for a __sync operation since a final
17044 barrier will be emitted to stop code hoisting. */
17045 if (is_sync)
17046 aarch64_emit_load_exclusive (mode, old_out, mem,
17047 GEN_INT (MEMMODEL_RELAXED));
17048 else
17049 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17050
17051 switch (code)
17052 {
17053 case SET:
17054 new_out = value;
17055 break;
17056
17057 case NOT:
17058 x = gen_rtx_AND (wmode, old_out, value);
17059 emit_insn (gen_rtx_SET (new_out, x));
17060 x = gen_rtx_NOT (wmode, new_out);
17061 emit_insn (gen_rtx_SET (new_out, x));
17062 break;
17063
17064 case MINUS:
17065 if (CONST_INT_P (value))
17066 {
17067 value = GEN_INT (-INTVAL (value));
17068 code = PLUS;
17069 }
17070 /* Fall through. */
17071
17072 default:
17073 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17074 emit_insn (gen_rtx_SET (new_out, x));
17075 break;
17076 }
17077
17078 aarch64_emit_store_exclusive (mode, cond, mem,
17079 gen_lowpart (mode, new_out), model_rtx);
17080
17081 if (aarch64_track_speculation)
17082 {
17083 /* Emit an explicit compare instruction, so that we can correctly
17084 track the condition codes. */
17085 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17086 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17087 }
17088 else
17089 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17090
17091 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17092 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17093 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17094
17095 /* Emit any final barrier needed for a __sync operation. */
17096 if (is_sync)
17097 aarch64_emit_post_barrier (model);
17098 }
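
/* Illustrative expansion (operand names invented): splitting an atomic
   fetch-and-add produces a load/modify/store-exclusive retry loop of roughly
   this form, with the ordering suffixes chosen from MODEL:

     .label:
	ld[a]xr	old, [mem]
	add	new, old, value
	st[l]xr	cond, new, [mem]
	cbnz	cond, .label  */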
17099
17100 static void
17101 aarch64_init_libfuncs (void)
17102 {
17103 /* Half-precision float operations. The compiler handles all operations
17104 with NULL libfuncs by converting to SFmode. */
17105
17106 /* Conversions. */
17107 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17108 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17109
17110 /* Arithmetic. */
17111 set_optab_libfunc (add_optab, HFmode, NULL);
17112 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17113 set_optab_libfunc (smul_optab, HFmode, NULL);
17114 set_optab_libfunc (neg_optab, HFmode, NULL);
17115 set_optab_libfunc (sub_optab, HFmode, NULL);
17116
17117 /* Comparisons. */
17118 set_optab_libfunc (eq_optab, HFmode, NULL);
17119 set_optab_libfunc (ne_optab, HFmode, NULL);
17120 set_optab_libfunc (lt_optab, HFmode, NULL);
17121 set_optab_libfunc (le_optab, HFmode, NULL);
17122 set_optab_libfunc (ge_optab, HFmode, NULL);
17123 set_optab_libfunc (gt_optab, HFmode, NULL);
17124 set_optab_libfunc (unord_optab, HFmode, NULL);
17125 }
17126
17127 /* Target hook for c_mode_for_suffix. */
17128 static machine_mode
17129 aarch64_c_mode_for_suffix (char suffix)
17130 {
17131 if (suffix == 'q')
17132 return TFmode;
17133
17134 return VOIDmode;
17135 }
17136
17137 /* We can only represent floating point constants which will fit in
17138 "quarter-precision" values. These values are characterised by
17139 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17140 by:
17141
17142 (-1)^s * (n/16) * 2^r
17143
17144 Where:
17145 's' is the sign bit.
17146 'n' is an integer in the range 16 <= n <= 31.
17147 'r' is an integer in the range -3 <= r <= 4. */
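
/* Worked examples (added for illustration): 1.5 = (24/16) * 2^0,
   17.0 = (17/16) * 2^4 and 0.125 = (16/16) * 2^-3 (the smallest positive
   value) are all representable; the largest representable value is
   (31/16) * 2^4 = 31.0.  A value such as 0.1 is not representable because
   it cannot be written as (n/16) * 2^r with n in [16, 31] and r in [-3, 4],
   and 0.0 is rejected explicitly below.  */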
17148
17149 /* Return true iff X can be represented by a quarter-precision
17150 floating point immediate operand. Note, we cannot represent 0.0. */
17151 bool
17152 aarch64_float_const_representable_p (rtx x)
17153 {
17154 /* This represents our current view of how many bits
17155 make up the mantissa. */
17156 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17157 int exponent;
17158 unsigned HOST_WIDE_INT mantissa, mask;
17159 REAL_VALUE_TYPE r, m;
17160 bool fail;
17161
17162 x = unwrap_const_vec_duplicate (x);
17163 if (!CONST_DOUBLE_P (x))
17164 return false;
17165
17166 if (GET_MODE (x) == VOIDmode
17167 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17168 return false;
17169
17170 r = *CONST_DOUBLE_REAL_VALUE (x);
17171
17172 /* We cannot represent infinities, NaNs or +/-zero. We won't
17173 know if we have +zero until we analyse the mantissa, but we
17174 can reject the other invalid values. */
17175 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17176 || REAL_VALUE_MINUS_ZERO (r))
17177 return false;
17178
17179 /* Extract exponent. */
17180 r = real_value_abs (&r);
17181 exponent = REAL_EXP (&r);
17182
17183 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17184 highest (sign) bit, with a fixed binary point at bit point_pos.
17185 The low half of W holds the low part of the mantissa, the high half the high part.
17186 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17187 bits for the mantissa, this can fail (low bits will be lost). */
17188 real_ldexp (&m, &r, point_pos - exponent);
17189 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17190
17191 /* If the low part of the mantissa has bits set we cannot represent
17192 the value. */
17193 if (w.ulow () != 0)
17194 return false;
17195 /* We have rejected the lower HOST_WIDE_INT, so update our
17196 understanding of how many bits lie in the mantissa and
17197 look only at the high HOST_WIDE_INT. */
17198 mantissa = w.elt (1);
17199 point_pos -= HOST_BITS_PER_WIDE_INT;
17200
17201 /* We can only represent values with a mantissa of the form 1.xxxx. */
17202 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17203 if ((mantissa & mask) != 0)
17204 return false;
17205
17206 /* Having filtered unrepresentable values, we may now remove all
17207 but the highest 5 bits. */
17208 mantissa >>= point_pos - 5;
17209
17210 /* We cannot represent the value 0.0, so reject it. This is handled
17211 elsewhere. */
17212 if (mantissa == 0)
17213 return false;
17214
17215 /* Then, as bit 4 is always set, we can mask it off, leaving
17216 the mantissa in the range [0, 15]. */
17217 mantissa &= ~(1 << 4);
17218 gcc_assert (mantissa <= 15);
17219
17220 /* GCC internally does not use an IEEE754-like encoding (where normalized
17221 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
17222 Our mantissa values are shifted 4 places to the left relative to
17223 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17224 by 5 places to correct for GCC's representation. */
17225 exponent = 5 - exponent;
17226
17227 return (exponent >= 0 && exponent <= 7);
17228 }
17229
17230 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17231 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17232 output MOVI/MVNI, ORR or BIC immediate. */
17233 char*
17234 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17235 enum simd_immediate_check which)
17236 {
17237 bool is_valid;
17238 static char templ[40];
17239 const char *mnemonic;
17240 const char *shift_op;
17241 unsigned int lane_count = 0;
17242 char element_char;
17243
17244 struct simd_immediate_info info;
17245
17246 /* This will return true to show const_vector is legal for use as either
17247 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17248 It will also update INFO to show how the immediate should be generated.
17249 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17250 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17251 gcc_assert (is_valid);
17252
17253 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17254 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17255
17256 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17257 {
17258 gcc_assert (info.insn == simd_immediate_info::MOV
17259 && info.u.mov.shift == 0);
17260 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17261 move immediate path. */
17262 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17263 info.u.mov.value = GEN_INT (0);
17264 else
17265 {
17266 const unsigned int buf_size = 20;
17267 char float_buf[buf_size] = {'\0'};
17268 real_to_decimal_for_mode (float_buf,
17269 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17270 buf_size, buf_size, 1, info.elt_mode);
17271
17272 if (lane_count == 1)
17273 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17274 else
17275 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17276 lane_count, element_char, float_buf);
17277 return templ;
17278 }
17279 }
17280
17281 gcc_assert (CONST_INT_P (info.u.mov.value));
17282
17283 if (which == AARCH64_CHECK_MOV)
17284 {
17285 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17286 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17287 ? "msl" : "lsl");
17288 if (lane_count == 1)
17289 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17290 mnemonic, UINTVAL (info.u.mov.value));
17291 else if (info.u.mov.shift)
17292 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17293 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17294 element_char, UINTVAL (info.u.mov.value), shift_op,
17295 info.u.mov.shift);
17296 else
17297 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17298 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17299 element_char, UINTVAL (info.u.mov.value));
17300 }
17301 else
17302 {
17303 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17304 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17305 if (info.u.mov.shift)
17306 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17307 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17308 element_char, UINTVAL (info.u.mov.value), "lsl",
17309 info.u.mov.shift);
17310 else
17311 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17312 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17313 element_char, UINTVAL (info.u.mov.value));
17314 }
17315 return templ;
17316 }
17317
17318 char*
17319 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17320 {
17321
17322 /* If a floating point number was passed and we desire to use it in an
17323 integer mode, do the conversion to integer. */
17324 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17325 {
17326 unsigned HOST_WIDE_INT ival;
17327 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17328 gcc_unreachable ();
17329 immediate = gen_int_mode (ival, mode);
17330 }
17331
17332 machine_mode vmode;
17333 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
17334 a 128-bit vector mode. */
17335 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17336
17337 vmode = aarch64_simd_container_mode (mode, width);
17338 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17339 return aarch64_output_simd_mov_immediate (v_op, width);
17340 }
17341
17342 /* Return the output string to use for moving immediate CONST_VECTOR
17343 into an SVE register. */
17344
17345 char *
17346 aarch64_output_sve_mov_immediate (rtx const_vector)
17347 {
17348 static char templ[40];
17349 struct simd_immediate_info info;
17350 char element_char;
17351
17352 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17353 gcc_assert (is_valid);
17354
17355 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17356
17357 machine_mode vec_mode = GET_MODE (const_vector);
17358 if (aarch64_sve_pred_mode_p (vec_mode))
17359 {
17360 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17361 if (info.insn == simd_immediate_info::MOV)
17362 {
17363 gcc_assert (info.u.mov.value == const0_rtx);
17364 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17365 }
17366 else
17367 {
17368 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17369 unsigned int total_bytes;
17370 if (info.u.pattern == AARCH64_SV_ALL
17371 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17372 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17373 total_bytes / GET_MODE_SIZE (info.elt_mode));
17374 else
17375 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17376 svpattern_token (info.u.pattern));
17377 }
17378 return buf;
17379 }
17380
17381 if (info.insn == simd_immediate_info::INDEX)
17382 {
17383 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17384 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17385 element_char, INTVAL (info.u.index.base),
17386 INTVAL (info.u.index.step));
17387 return templ;
17388 }
17389
17390 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17391 {
17392 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17393 info.u.mov.value = GEN_INT (0);
17394 else
17395 {
17396 const int buf_size = 20;
17397 char float_buf[buf_size] = {};
17398 real_to_decimal_for_mode (float_buf,
17399 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17400 buf_size, buf_size, 1, info.elt_mode);
17401
17402 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17403 element_char, float_buf);
17404 return templ;
17405 }
17406 }
17407
17408 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17409 element_char, INTVAL (info.u.mov.value));
17410 return templ;
17411 }
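
/* Illustrative outputs (operand 0 spelled as z0/p0 for concreteness): the
   templates above produce strings such as

     mov	z0.s, #3
     index	z0.s, #0, #1
     ptrue	p0.b, vl16
     pfalse	p0.b

   for an integer splat, an INDEX constant, an all-true predicate on a
   128-bit vector, and an all-false predicate respectively.  */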
17412
17413 /* Split operands into moves from op[1] + op[2] into op[0]. */
17414
17415 void
17416 aarch64_split_combinev16qi (rtx operands[3])
17417 {
17418 unsigned int dest = REGNO (operands[0]);
17419 unsigned int src1 = REGNO (operands[1]);
17420 unsigned int src2 = REGNO (operands[2]);
17421 machine_mode halfmode = GET_MODE (operands[1]);
17422 unsigned int halfregs = REG_NREGS (operands[1]);
17423 rtx destlo, desthi;
17424
17425 gcc_assert (halfmode == V16QImode);
17426
17427 if (src1 == dest && src2 == dest + halfregs)
17428 {
17429 /* No-op move. Can't split to nothing; emit something. */
17430 emit_note (NOTE_INSN_DELETED);
17431 return;
17432 }
17433
17434 /* Preserve register attributes for variable tracking. */
17435 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17436 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17437 GET_MODE_SIZE (halfmode));
17438
17439 /* Special case of reversed high/low parts. */
17440 if (reg_overlap_mentioned_p (operands[2], destlo)
17441 && reg_overlap_mentioned_p (operands[1], desthi))
17442 {
17443 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17444 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17445 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17446 }
17447 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17448 {
17449 /* Try to avoid unnecessary moves if part of the result
17450 is in the right place already. */
17451 if (src1 != dest)
17452 emit_move_insn (destlo, operands[1]);
17453 if (src2 != dest + halfregs)
17454 emit_move_insn (desthi, operands[2]);
17455 }
17456 else
17457 {
17458 if (src2 != dest + halfregs)
17459 emit_move_insn (desthi, operands[2]);
17460 if (src1 != dest)
17461 emit_move_insn (destlo, operands[1]);
17462 }
17463 }
17464
17465 /* vec_perm support. */
17466
17467 struct expand_vec_perm_d
17468 {
17469 rtx target, op0, op1;
17470 vec_perm_indices perm;
17471 machine_mode vmode;
17472 unsigned int vec_flags;
17473 bool one_vector_p;
17474 bool testing_p;
17475 };
17476
17477 /* Generate a variable permutation. */
17478
17479 static void
17480 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17481 {
17482 machine_mode vmode = GET_MODE (target);
17483 bool one_vector_p = rtx_equal_p (op0, op1);
17484
17485 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17486 gcc_checking_assert (GET_MODE (op0) == vmode);
17487 gcc_checking_assert (GET_MODE (op1) == vmode);
17488 gcc_checking_assert (GET_MODE (sel) == vmode);
17489 gcc_checking_assert (TARGET_SIMD);
17490
17491 if (one_vector_p)
17492 {
17493 if (vmode == V8QImode)
17494 {
17495 /* Expand the argument to a V16QI mode by duplicating it. */
17496 rtx pair = gen_reg_rtx (V16QImode);
17497 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17498 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17499 }
17500 else
17501 {
17502 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17503 }
17504 }
17505 else
17506 {
17507 rtx pair;
17508
17509 if (vmode == V8QImode)
17510 {
17511 pair = gen_reg_rtx (V16QImode);
17512 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17513 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17514 }
17515 else
17516 {
17517 pair = gen_reg_rtx (OImode);
17518 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17519 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17520 }
17521 }
17522 }
17523
17524 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17525 NELT is the number of elements in the vector. */
17526
17527 void
17528 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17529 unsigned int nelt)
17530 {
17531 machine_mode vmode = GET_MODE (target);
17532 bool one_vector_p = rtx_equal_p (op0, op1);
17533 rtx mask;
17534
17535 /* The TBL instruction does not use a modulo index, so we must take care
17536 of that ourselves. */
17537 mask = aarch64_simd_gen_const_vector_dup (vmode,
17538 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17539 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17540
17541 /* For big-endian, we also need to reverse the index within the vector
17542 (but not which vector). */
17543 if (BYTES_BIG_ENDIAN)
17544 {
17545 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17546 if (!one_vector_p)
17547 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17548 sel = expand_simple_binop (vmode, XOR, sel, mask,
17549 NULL, 0, OPTAB_LIB_WIDEN);
17550 }
17551 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17552 }
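
/* Worked example (illustrative): for a two-vector V16QI permute the selector
   is first ANDed with 31 so that every index addresses one of the 32 bytes
   of { op0, op1 }; on big-endian targets it is additionally XORed with 15 so
   that the index is reversed within each input vector, matching the
   difference between GCC's element numbering and the architectural lane
   numbering, while leaving the choice of input vector unchanged.  */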
17553
17554 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17555
17556 static void
17557 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17558 {
17559 emit_insn (gen_rtx_SET (target,
17560 gen_rtx_UNSPEC (GET_MODE (target),
17561 gen_rtvec (2, op0, op1), code)));
17562 }
17563
17564 /* Expand an SVE vec_perm with the given operands. */
17565
17566 void
17567 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17568 {
17569 machine_mode data_mode = GET_MODE (target);
17570 machine_mode sel_mode = GET_MODE (sel);
17571 /* Enforced by the pattern condition. */
17572 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17573
17574 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17575 size of the two value vectors, i.e. the upper bits of the indices
17576 are effectively ignored. SVE TBL instead produces 0 for any
17577 out-of-range indices, so we need to modulo all the vec_perm indices
17578 to ensure they are all in range. */
17579 rtx sel_reg = force_reg (sel_mode, sel);
17580
17581 /* Check if the sel only references the first values vector. */
17582 if (GET_CODE (sel) == CONST_VECTOR
17583 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17584 {
17585 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17586 return;
17587 }
17588
17589 /* Check if the two values vectors are the same. */
17590 if (rtx_equal_p (op0, op1))
17591 {
17592 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17593 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17594 NULL, 0, OPTAB_DIRECT);
17595 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17596 return;
17597 }
17598
17599 /* Run a TBL for each value vector and combine the results. */
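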
17600
17601 rtx res0 = gen_reg_rtx (data_mode);
17602 rtx res1 = gen_reg_rtx (data_mode);
17603 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17604 if (GET_CODE (sel) != CONST_VECTOR
17605 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17606 {
17607 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17608 2 * nunits - 1);
17609 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17610 NULL, 0, OPTAB_DIRECT);
17611 }
17612 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17613 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17614 NULL, 0, OPTAB_DIRECT);
17615 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17616 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17617 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17618 else
17619 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17620 }
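
/* Illustrative sketch of the general two-vector case above, assuming
   for the sake of example that SEL_MODE has four elements: for
   SEL = { 1, 6, 3, 5 } the first TBL on OP0 uses SEL directly and
   yields { op0[1], 0, op0[3], 0 }, since indices 6 and 5 are out of
   range for a single vector.  The second TBL uses SEL - 4
   = { -3, 2, -1, 1 }, where the negative (i.e. very large unsigned)
   indices again yield 0, giving { 0, op1[2], 0, op1[1] }.  ORing the
   two results produces the requested permutation.  */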
17621
17622 /* Recognize patterns suitable for the TRN instructions. */
17623 static bool
17624 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17625 {
17626 HOST_WIDE_INT odd;
17627 poly_uint64 nelt = d->perm.length ();
17628 rtx out, in0, in1, x;
17629 machine_mode vmode = d->vmode;
17630
17631 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17632 return false;
17633
17634 /* Note that these are little-endian tests.
17635 We correct for big-endian later. */
17636 if (!d->perm[0].is_constant (&odd)
17637 || (odd != 0 && odd != 1)
17638 || !d->perm.series_p (0, 2, odd, 2)
17639 || !d->perm.series_p (1, 2, nelt + odd, 2))
17640 return false;
17641
17642 /* Success! */
17643 if (d->testing_p)
17644 return true;
17645
17646 in0 = d->op0;
17647 in1 = d->op1;
17648 /* We don't need a big-endian lane correction for SVE; see the comment
17649 at the head of aarch64-sve.md for details. */
17650 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17651 {
17652 x = in0, in0 = in1, in1 = x;
17653 odd = !odd;
17654 }
17655 out = d->target;
17656
17657 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17658 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17659 return true;
17660 }
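
/* For illustration, on a little-endian V4SImode vector pair TRN1
   implements the index pattern { 0, 4, 2, 6 } and TRN2 implements
   { 1, 5, 3, 7 }; the series checks above match exactly these two
   patterns.  */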
17661
17662 /* Recognize patterns suitable for the UZP instructions. */
17663 static bool
17664 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17665 {
17666 HOST_WIDE_INT odd;
17667 rtx out, in0, in1, x;
17668 machine_mode vmode = d->vmode;
17669
17670 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17671 return false;
17672
17673 /* Note that these are little-endian tests.
17674 We correct for big-endian later. */
17675 if (!d->perm[0].is_constant (&odd)
17676 || (odd != 0 && odd != 1)
17677 || !d->perm.series_p (0, 1, odd, 2))
17678 return false;
17679
17680 /* Success! */
17681 if (d->testing_p)
17682 return true;
17683
17684 in0 = d->op0;
17685 in1 = d->op1;
17686 /* We don't need a big-endian lane correction for SVE; see the comment
17687 at the head of aarch64-sve.md for details. */
17688 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17689 {
17690 x = in0, in0 = in1, in1 = x;
17691 odd = !odd;
17692 }
17693 out = d->target;
17694
17695 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17696 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17697 return true;
17698 }
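
/* For illustration, on a little-endian V4SImode vector pair UZP1
   implements the index pattern { 0, 2, 4, 6 } (the even elements of
   the concatenated inputs) and UZP2 implements { 1, 3, 5, 7 }.  */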
17699
17700 /* Recognize patterns suitable for the ZIP instructions. */
17701 static bool
17702 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17703 {
17704 unsigned int high;
17705 poly_uint64 nelt = d->perm.length ();
17706 rtx out, in0, in1, x;
17707 machine_mode vmode = d->vmode;
17708
17709 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17710 return false;
17711
17712 /* Note that these are little-endian tests.
17713 We correct for big-endian later. */
17714 poly_uint64 first = d->perm[0];
17715 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17716 || !d->perm.series_p (0, 2, first, 1)
17717 || !d->perm.series_p (1, 2, first + nelt, 1))
17718 return false;
17719 high = maybe_ne (first, 0U);
17720
17721 /* Success! */
17722 if (d->testing_p)
17723 return true;
17724
17725 in0 = d->op0;
17726 in1 = d->op1;
17727 /* We don't need a big-endian lane correction for SVE; see the comment
17728 at the head of aarch64-sve.md for details. */
17729 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17730 {
17731 x = in0, in0 = in1, in1 = x;
17732 high = !high;
17733 }
17734 out = d->target;
17735
17736 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17737 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17738 return true;
17739 }
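
/* For illustration, on a little-endian V4SImode vector pair ZIP1
   implements the index pattern { 0, 4, 1, 5 } (interleaving the low
   halves of the two inputs) and ZIP2 implements { 2, 6, 3, 7 }.  */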
17740
17741 /* Recognize patterns for the EXT insn. */
17742
17743 static bool
17744 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17745 {
17746 HOST_WIDE_INT location;
17747 rtx offset;
17748
17749 /* The first element always refers to the first vector.
17750 Check if the extracted indices are increasing by one. */
17751 if (d->vec_flags == VEC_SVE_PRED
17752 || !d->perm[0].is_constant (&location)
17753 || !d->perm.series_p (0, 1, location, 1))
17754 return false;
17755
17756 /* Success! */
17757 if (d->testing_p)
17758 return true;
17759
17760 /* The case where (location == 0) is a no-op for both big- and little-endian,
17761 and is removed by the mid-end at optimization levels -O1 and higher.
17762
17763 We don't need a big-endian lane correction for SVE; see the comment
17764 at the head of aarch64-sve.md for details. */
17765 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17766 {
17767 /* After setup, we want the high elements of the first vector (stored
17768 at the LSB end of the register), and the low elements of the second
17769 vector (stored at the MSB end of the register). So swap. */
17770 std::swap (d->op0, d->op1);
17771 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17772 to_constant () is safe since this is restricted to Advanced SIMD
17773 vectors. */
17774 location = d->perm.length ().to_constant () - location;
17775 }
17776
17777 offset = GEN_INT (location);
17778 emit_set_insn (d->target,
17779 gen_rtx_UNSPEC (d->vmode,
17780 gen_rtvec (3, d->op0, d->op1, offset),
17781 UNSPEC_EXT));
17782 return true;
17783 }
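
/* For illustration, a little-endian V8QImode permutation with indices
   { 3, 4, 5, 6, 7, 8, 9, 10 } is matched here with LOCATION == 3 and
   becomes an EXT with an immediate of 3: the top five bytes of the
   first input followed by the bottom three bytes of the second.  */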
17784
17785 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17786 within each 64-bit, 32-bit or 16-bit granule. */
17787
17788 static bool
17789 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17790 {
17791 HOST_WIDE_INT diff;
17792 unsigned int i, size, unspec;
17793 machine_mode pred_mode;
17794
17795 if (d->vec_flags == VEC_SVE_PRED
17796 || !d->one_vector_p
17797 || !d->perm[0].is_constant (&diff))
17798 return false;
17799
17800 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17801 if (size == 8)
17802 {
17803 unspec = UNSPEC_REV64;
17804 pred_mode = VNx2BImode;
17805 }
17806 else if (size == 4)
17807 {
17808 unspec = UNSPEC_REV32;
17809 pred_mode = VNx4BImode;
17810 }
17811 else if (size == 2)
17812 {
17813 unspec = UNSPEC_REV16;
17814 pred_mode = VNx8BImode;
17815 }
17816 else
17817 return false;
17818
17819 unsigned int step = diff + 1;
17820 for (i = 0; i < step; ++i)
17821 if (!d->perm.series_p (i, step, diff - i, step))
17822 return false;
17823
17824 /* Success! */
17825 if (d->testing_p)
17826 return true;
17827
17828 if (d->vec_flags == VEC_SVE_DATA)
17829 {
17830 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17831 rtx target = gen_reg_rtx (int_mode);
17832 if (BYTES_BIG_ENDIAN)
17833 /* The act of taking a subreg between INT_MODE and d->vmode
17834 is itself a reversing operation on big-endian targets;
17835 see the comment at the head of aarch64-sve.md for details.
17836 First reinterpret OP0 as INT_MODE without using a subreg
17837 and without changing the contents. */
17838 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17839 else
17840 {
17841 /* For SVE we use REV[BHW] unspecs derived from the element size
17842 of d->vmode and vector modes whose elements have SIZE bytes.
17843 This ensures that the vector modes match the predicate modes. */
17844 int unspec = aarch64_sve_rev_unspec (d->vmode);
17845 rtx pred = aarch64_ptrue_reg (pred_mode);
17846 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17847 gen_lowpart (int_mode, d->op0)));
17848 }
17849 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17850 return true;
17851 }
17852 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17853 emit_set_insn (d->target, src);
17854 return true;
17855 }
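
/* For illustration, a V8HImode permutation with indices
   { 1, 0, 3, 2, 5, 4, 7, 6 } has DIFF == 1 and SIZE == 4, so it is
   matched as REV32: each pair of 16-bit elements is reversed within
   its containing 32-bit granule.  */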
17856
17857 /* Recognize patterns for the REV insn, which reverses elements within
17858 a full vector. */
17859
17860 static bool
17861 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17862 {
17863 poly_uint64 nelt = d->perm.length ();
17864
17865 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17866 return false;
17867
17868 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17869 return false;
17870
17871 /* Success! */
17872 if (d->testing_p)
17873 return true;
17874
17875 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17876 emit_set_insn (d->target, src);
17877 return true;
17878 }
17879
17880 static bool
17881 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17882 {
17883 rtx out = d->target;
17884 rtx in0;
17885 HOST_WIDE_INT elt;
17886 machine_mode vmode = d->vmode;
17887 rtx lane;
17888
17889 if (d->vec_flags == VEC_SVE_PRED
17890 || d->perm.encoding ().encoded_nelts () != 1
17891 || !d->perm[0].is_constant (&elt))
17892 return false;
17893
17894 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17895 return false;
17896
17897 /* Success! */
17898 if (d->testing_p)
17899 return true;
17900
17901 /* The generic preparation in aarch64_expand_vec_perm_const_1
17902 swaps the operand order and the permute indices if it finds
17903 d->perm[0] to be in the second operand. Thus, we can always
17904 use d->op0 and need not do any extra arithmetic to get the
17905 correct lane number. */
17906 in0 = d->op0;
17907 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17908
17909 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17910 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17911 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17912 return true;
17913 }
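
/* For illustration, a V4SImode permutation whose encoding is the
   single repeated index { 3, 3, 3, 3 } is matched here and emitted
   as a DUP of lane 3 of the first input.  */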
17914
17915 static bool
17916 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17917 {
17918 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17919 machine_mode vmode = d->vmode;
17920
17921 /* Make sure that the indices are constant. */
17922 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17923 for (unsigned int i = 0; i < encoded_nelts; ++i)
17924 if (!d->perm[i].is_constant ())
17925 return false;
17926
17927 if (d->testing_p)
17928 return true;
17929
17930 /* Generic code will try constant permutation twice. Once with the
17931 original mode and again with the elements lowered to QImode.
17932 So wait and don't do the selector expansion ourselves. */
17933 if (vmode != V8QImode && vmode != V16QImode)
17934 return false;
17935
17936 /* to_constant is safe since this routine is specific to Advanced SIMD
17937 vectors. */
17938 unsigned int nelt = d->perm.length ().to_constant ();
17939 for (unsigned int i = 0; i < nelt; ++i)
17940 /* If big-endian and two vectors we end up with a weird mixed-endian
17941 mode on NEON. Reverse the index within each word but not the word
17942 itself. to_constant is safe because we checked is_constant above. */
17943 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17944 ? d->perm[i].to_constant () ^ (nelt - 1)
17945 : d->perm[i].to_constant ());
17946
17947 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17948 sel = force_reg (vmode, sel);
17949
17950 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17951 return true;
17952 }
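
/* For illustration, assuming a big-endian two-vector V8QImode
   permutation: an index of 2 is rewritten as 2 ^ 7 == 5 before being
   placed in the TBL selector, reversing the byte position within its
   64-bit word while keeping the choice of source vector unchanged.  */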
17953
17954 /* Try to implement D using an SVE TBL instruction. */
17955
17956 static bool
17957 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17958 {
17959 unsigned HOST_WIDE_INT nelt;
17960
17961 /* Permuting two variable-length vectors could overflow the
17962 index range. */
17963 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17964 return false;
17965
17966 if (d->testing_p)
17967 return true;
17968
17969 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17970 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17971 if (d->one_vector_p)
17972 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17973 else
17974 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17975 return true;
17976 }
17977
17978 static bool
17979 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17980 {
17981 /* The pattern matching functions above are written to look for a small
17982 number to begin the sequence (0, 1, N/2). If we begin with an index
17983 from the second operand, we can swap the operands. */
17984 poly_int64 nelt = d->perm.length ();
17985 if (known_ge (d->perm[0], nelt))
17986 {
17987 d->perm.rotate_inputs (1);
17988 std::swap (d->op0, d->op1);
17989 }
17990
17991 if ((d->vec_flags == VEC_ADVSIMD
17992 || d->vec_flags == VEC_SVE_DATA
17993 || d->vec_flags == VEC_SVE_PRED)
17994 && known_gt (nelt, 1))
17995 {
17996 if (aarch64_evpc_rev_local (d))
17997 return true;
17998 else if (aarch64_evpc_rev_global (d))
17999 return true;
18000 else if (aarch64_evpc_ext (d))
18001 return true;
18002 else if (aarch64_evpc_dup (d))
18003 return true;
18004 else if (aarch64_evpc_zip (d))
18005 return true;
18006 else if (aarch64_evpc_uzp (d))
18007 return true;
18008 else if (aarch64_evpc_trn (d))
18009 return true;
18010 if (d->vec_flags == VEC_SVE_DATA)
18011 return aarch64_evpc_sve_tbl (d);
18012 else if (d->vec_flags == VEC_ADVSIMD)
18013 return aarch64_evpc_tbl (d);
18014 }
18015 return false;
18016 }
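
/* For illustration of the preparation step above, with NELT == 4 a
   permutation of { 6, 2, 7, 3 } starts with an index from the second
   input, so the inputs are swapped and the indices are rotated to
   { 2, 6, 3, 7 }, which the ZIP2 matcher can then recognize.  */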
18017
18018 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18019
18020 static bool
18021 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18022 rtx op1, const vec_perm_indices &sel)
18023 {
18024 struct expand_vec_perm_d d;
18025
18026 /* Check whether the mask can be applied to a single vector. */
18027 if (sel.ninputs () == 1
18028 || (op0 && rtx_equal_p (op0, op1)))
18029 d.one_vector_p = true;
18030 else if (sel.all_from_input_p (0))
18031 {
18032 d.one_vector_p = true;
18033 op1 = op0;
18034 }
18035 else if (sel.all_from_input_p (1))
18036 {
18037 d.one_vector_p = true;
18038 op0 = op1;
18039 }
18040 else
18041 d.one_vector_p = false;
18042
18043 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18044 sel.nelts_per_input ());
18045 d.vmode = vmode;
18046 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18047 d.target = target;
18048 d.op0 = op0;
18049 d.op1 = op1;
18050 d.testing_p = !target;
18051
18052 if (!d.testing_p)
18053 return aarch64_expand_vec_perm_const_1 (&d);
18054
18055 rtx_insn *last = get_last_insn ();
18056 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18057 gcc_assert (last == get_last_insn ());
18058
18059 return ret;
18060 }
18061
18062 /* Generate a byte permute mask for a register of mode MODE,
18063 which has NUNITS units. */
18064
18065 rtx
18066 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18067 {
18068 /* We have to reverse each vector because we don't have
18069 a permuted load that can reverse-load according to ABI rules. */
18070 rtx mask;
18071 rtvec v = rtvec_alloc (16);
18072 unsigned int i, j;
18073 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18074
18075 gcc_assert (BYTES_BIG_ENDIAN);
18076 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18077
18078 for (i = 0; i < nunits; i++)
18079 for (j = 0; j < usize; j++)
18080 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18081 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18082 return force_reg (V16QImode, mask);
18083 }
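
/* For illustration, for V8HImode (NUNITS == 8, unit size 2) the mask
   built above is the byte sequence { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
   11, 10, 13, 12, 15, 14 }: the bytes of each 16-bit element are
   swapped while the element order itself is preserved.  */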
18084
18085 /* Expand an SVE integer comparison using the SVE equivalent of:
18086
18087 (set TARGET (CODE OP0 OP1)). */
18088
18089 void
18090 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18091 {
18092 machine_mode pred_mode = GET_MODE (target);
18093 machine_mode data_mode = GET_MODE (op0);
18094 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18095 op0, op1);
18096 if (!rtx_equal_p (target, res))
18097 emit_move_insn (target, res);
18098 }
18099
18100 /* Return the UNSPEC_COND_* code for comparison CODE. */
18101
18102 static unsigned int
18103 aarch64_unspec_cond_code (rtx_code code)
18104 {
18105 switch (code)
18106 {
18107 case NE:
18108 return UNSPEC_COND_FCMNE;
18109 case EQ:
18110 return UNSPEC_COND_FCMEQ;
18111 case LT:
18112 return UNSPEC_COND_FCMLT;
18113 case GT:
18114 return UNSPEC_COND_FCMGT;
18115 case LE:
18116 return UNSPEC_COND_FCMLE;
18117 case GE:
18118 return UNSPEC_COND_FCMGE;
18119 case UNORDERED:
18120 return UNSPEC_COND_FCMUO;
18121 default:
18122 gcc_unreachable ();
18123 }
18124 }
18125
18126 /* Emit:
18127
18128 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18129
18130 where <X> is the operation associated with comparison CODE.
18131 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18132
18133 static void
18134 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18135 bool known_ptrue_p, rtx op0, rtx op1)
18136 {
18137 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18138 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18139 gen_rtvec (4, pred, flag, op0, op1),
18140 aarch64_unspec_cond_code (code));
18141 emit_set_insn (target, unspec);
18142 }
18143
18144 /* Emit the SVE equivalent of:
18145
18146 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18147 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18148 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18149
18150 where <Xi> is the operation associated with comparison CODEi.
18151 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18152
18153 static void
18154 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18155 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18156 {
18157 machine_mode pred_mode = GET_MODE (pred);
18158 rtx tmp1 = gen_reg_rtx (pred_mode);
18159 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18160 rtx tmp2 = gen_reg_rtx (pred_mode);
18161 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18162 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18163 }
18164
18165 /* Emit the SVE equivalent of:
18166
18167 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18168 (set TARGET (not TMP))
18169
18170 where <X> is the operation associated with comparison CODE.
18171 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18172
18173 static void
18174 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18175 bool known_ptrue_p, rtx op0, rtx op1)
18176 {
18177 machine_mode pred_mode = GET_MODE (pred);
18178 rtx tmp = gen_reg_rtx (pred_mode);
18179 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18180 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18181 }
18182
18183 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18184
18185 (set TARGET (CODE OP0 OP1))
18186
18187 If CAN_INVERT_P is true, the caller can also handle inverted results;
18188 return true if the result is in fact inverted. */
18189
18190 bool
18191 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18192 rtx op0, rtx op1, bool can_invert_p)
18193 {
18194 machine_mode pred_mode = GET_MODE (target);
18195 machine_mode data_mode = GET_MODE (op0);
18196
18197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18198 switch (code)
18199 {
18200 case UNORDERED:
18201 /* UNORDERED has no immediate form. */
18202 op1 = force_reg (data_mode, op1);
18203 /* fall through */
18204 case LT:
18205 case LE:
18206 case GT:
18207 case GE:
18208 case EQ:
18209 case NE:
18210 {
18211 /* There is native support for the comparison. */
18212 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18213 return false;
18214 }
18215
18216 case LTGT:
18217 /* This is a trapping operation (LT or GT). */
18218 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18219 return false;
18220
18221 case UNEQ:
18222 if (!flag_trapping_math)
18223 {
18224 /* This would trap for signaling NaNs. */
18225 op1 = force_reg (data_mode, op1);
18226 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18227 ptrue, true, op0, op1);
18228 return false;
18229 }
18230 /* fall through */
18231 case UNLT:
18232 case UNLE:
18233 case UNGT:
18234 case UNGE:
18235 if (flag_trapping_math)
18236 {
18237 /* Work out which elements are ordered. */
18238 rtx ordered = gen_reg_rtx (pred_mode);
18239 op1 = force_reg (data_mode, op1);
18240 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18241 ptrue, true, op0, op1);
18242
18243 /* Test the opposite condition for the ordered elements,
18244 then invert the result. */
18245 if (code == UNEQ)
18246 code = NE;
18247 else
18248 code = reverse_condition_maybe_unordered (code);
18249 if (can_invert_p)
18250 {
18251 aarch64_emit_sve_fp_cond (target, code,
18252 ordered, false, op0, op1);
18253 return true;
18254 }
18255 aarch64_emit_sve_invert_fp_cond (target, code,
18256 ordered, false, op0, op1);
18257 return false;
18258 }
18259 break;
18260
18261 case ORDERED:
18262 /* ORDERED has no immediate form. */
18263 op1 = force_reg (data_mode, op1);
18264 break;
18265
18266 default:
18267 gcc_unreachable ();
18268 }
18269
18270 /* There is native support for the inverse comparison. */
18271 code = reverse_condition_maybe_unordered (code);
18272 if (can_invert_p)
18273 {
18274 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18275 return true;
18276 }
18277 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18278 return false;
18279 }
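
/* As an illustrative sketch of the trapping-math path above: UNGE is
   handled by first computing ORDERED as the inverse of an FCMUO under
   a PTRUE predicate, then testing the opposite condition (LT) under
   the ORDERED predicate, and finally inverting that result (or simply
   reporting it as inverted when CAN_INVERT_P allows).  */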
18280
18281 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18282 of the data being selected and CMP_MODE is the mode of the values being
18283 compared. */
18284
18285 void
18286 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18287 rtx *ops)
18288 {
18289 machine_mode pred_mode
18290 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18291 GET_MODE_SIZE (cmp_mode)).require ();
18292 rtx pred = gen_reg_rtx (pred_mode);
18293 if (FLOAT_MODE_P (cmp_mode))
18294 {
18295 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18296 ops[4], ops[5], true))
18297 std::swap (ops[1], ops[2]);
18298 }
18299 else
18300 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18301
18302 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18303 ops[1] = force_reg (data_mode, ops[1]);
18304 /* The "false" value can only be zero if the "true" value is a constant. */
18305 if (register_operand (ops[1], data_mode)
18306 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18307 ops[2] = force_reg (data_mode, ops[2]);
18308
18309 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18310 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18311 }
18312
18313 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18314 true. However, due to issues with register allocation it is preferable
18315 to avoid tying integer scalar and FP scalar modes. Executing integer
18316 operations in general registers is better than treating them as scalar
18317 vector operations. This reduces latency and avoids redundant int<->FP
18318 moves. So tie modes if they are either the same class, or vector modes
18319 with other vector modes, vector structs or any scalar mode. */
18320
18321 static bool
18322 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18323 {
18324 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18325 return true;
18326
18327 /* We specifically want to allow elements of "structure" modes to
18328 be tieable to the structure. This more general condition allows
18329 other rarer situations too. The reason we don't extend this to
18330 predicate modes is that there are no predicate structure modes
18331 nor any specific instructions for extracting part of a predicate
18332 register. */
18333 if (aarch64_vector_data_mode_p (mode1)
18334 && aarch64_vector_data_mode_p (mode2))
18335 return true;
18336
18337 /* Also allow any scalar modes with vectors. */
18338 if (aarch64_vector_mode_supported_p (mode1)
18339 || aarch64_vector_mode_supported_p (mode2))
18340 return true;
18341
18342 return false;
18343 }
18344
18345 /* Return a new RTX holding the result of moving POINTER forward by
18346 AMOUNT bytes. */
18347
18348 static rtx
18349 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18350 {
18351 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18352
18353 return adjust_automodify_address (pointer, GET_MODE (pointer),
18354 next, amount);
18355 }
18356
18357 /* Return a new RTX holding the result of moving POINTER forward by the
18358 size of the mode it points to. */
18359
18360 static rtx
18361 aarch64_progress_pointer (rtx pointer)
18362 {
18363 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18364 }
18365
18366 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18367 MODE bytes. */
18368
18369 static void
18370 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18371 machine_mode mode)
18372 {
18373 rtx reg = gen_reg_rtx (mode);
18374
18375 /* "Cast" the pointers to the correct mode. */
18376 *src = adjust_address (*src, mode, 0);
18377 *dst = adjust_address (*dst, mode, 0);
18378 /* Emit the memcpy. */
18379 emit_move_insn (reg, *src);
18380 emit_move_insn (*dst, reg);
18381 /* Move the pointers forward. */
18382 *src = aarch64_progress_pointer (*src);
18383 *dst = aarch64_progress_pointer (*dst);
18384 }
18385
18386 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18387 we succeed, otherwise return false. */
18388
18389 bool
18390 aarch64_expand_cpymem (rtx *operands)
18391 {
18392 int n, mode_bits;
18393 rtx dst = operands[0];
18394 rtx src = operands[1];
18395 rtx base;
18396 machine_mode cur_mode = BLKmode, next_mode;
18397 bool speed_p = !optimize_function_for_size_p (cfun);
18398
18399 /* When optimizing for size, give a better estimate of the length of a
18400 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18401 will always require an even number of instructions to do now, and each
18402 operation requires both a load and a store, so divide the max number by 2. */
18403 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18404
18405 /* We can't do anything smart if the amount to copy is not constant. */
18406 if (!CONST_INT_P (operands[2]))
18407 return false;
18408
18409 n = INTVAL (operands[2]);
18410
18411 /* Try to keep the number of instructions low. For all cases we will do at
18412 most two moves for the residual amount, since we'll always overlap the
18413 remainder. */
18414 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18415 return false;
18416
18417 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18418 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18419
18420 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18421 src = adjust_automodify_address (src, VOIDmode, base, 0);
18422
18423 /* Convert n to bits to make the rest of the code simpler. */
18424 n = n * BITS_PER_UNIT;
18425
18426 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18427 larger than TImode, but we should not use them for loads/stores here. */
18428 const int copy_limit = GET_MODE_BITSIZE (TImode);
18429
18430 while (n > 0)
18431 {
18432 /* Find the largest mode in which to do the copy without over-reading
18433 or over-writing. */
18434 opt_scalar_int_mode mode_iter;
18435 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18436 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18437 cur_mode = mode_iter.require ();
18438
18439 gcc_assert (cur_mode != BLKmode);
18440
18441 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18442 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18443
18444 n -= mode_bits;
18445
18446 /* Do certain trailing copies as overlapping if it's going to be
18447 cheaper, i.e. fewer instructions. For instance, for a 15-byte
18448 copy it's more efficient to do two overlapping 8-byte copies than
18449 8 + 6 + 1. */
18450 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18451 {
18452 next_mode = smallest_mode_for_size (n, MODE_INT);
18453 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18454 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18455 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18456 n = n_bits;
18457 }
18458 }
18459
18460 return true;
18461 }
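
/* As an illustrative walk-through of the loop above, assuming a
   15-byte constant copy: the first iteration picks DImode (64 bits,
   the largest integer mode not exceeding the remaining 120 bits and
   the TImode copy limit) and copies 8 bytes, leaving 56 bits.  Since
   the remainder is at most 8 bytes, both pointers are then moved back
   by one byte and a second, overlapping 8-byte DImode copy finishes
   the job -- the two overlapping copies mentioned in the comment
   above.  */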
18462
18463 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18464 SImode stores. Handle the case when the constant has identical
18465 bottom and top halves. This is beneficial when the two stores can be
18466 merged into an STP and we avoid synthesising potentially expensive
18467 immediates twice. Return true if such a split is possible. */
18468
18469 bool
18470 aarch64_split_dimode_const_store (rtx dst, rtx src)
18471 {
18472 rtx lo = gen_lowpart (SImode, src);
18473 rtx hi = gen_highpart_mode (SImode, DImode, src);
18474
18475 bool size_p = optimize_function_for_size_p (cfun);
18476
18477 if (!rtx_equal_p (lo, hi))
18478 return false;
18479
18480 unsigned int orig_cost
18481 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18482 unsigned int lo_cost
18483 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18484
18485 /* We want to transform:
18486 MOV x1, 49370
18487 MOVK x1, 0x140, lsl 16
18488 MOVK x1, 0xc0da, lsl 32
18489 MOVK x1, 0x140, lsl 48
18490 STR x1, [x0]
18491 into:
18492 MOV w1, 49370
18493 MOVK w1, 0x140, lsl 16
18494 STP w1, w1, [x0]
18495 So we want to perform this only when we save two instructions
18496 or more. When optimizing for size, however, accept any code size
18497 savings we can. */
18498 if (size_p && orig_cost <= lo_cost)
18499 return false;
18500
18501 if (!size_p
18502 && (orig_cost <= lo_cost + 1))
18503 return false;
18504
18505 rtx mem_lo = adjust_address (dst, SImode, 0);
18506 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18507 return false;
18508
18509 rtx tmp_reg = gen_reg_rtx (SImode);
18510 aarch64_expand_mov_immediate (tmp_reg, lo);
18511 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18512 /* Don't emit an explicit store pair as this may not be always profitable.
18513 Let the sched-fusion logic decide whether to merge them. */
18514 emit_move_insn (mem_lo, tmp_reg);
18515 emit_move_insn (mem_hi, tmp_reg);
18516
18517 return true;
18518 }
18519
18520 /* Generate RTL for a conditional branch with rtx comparison CODE in
18521 mode CC_MODE. The destination of the unlikely conditional branch
18522 is LABEL_REF. */
18523
18524 void
18525 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18526 rtx label_ref)
18527 {
18528 rtx x;
18529 x = gen_rtx_fmt_ee (code, VOIDmode,
18530 gen_rtx_REG (cc_mode, CC_REGNUM),
18531 const0_rtx);
18532
18533 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18534 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18535 pc_rtx);
18536 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18537 }
18538
18539 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18540
18541 OP1 represents the TImode destination operand 1
18542 OP2 represents the TImode destination operand 2
18543 LOW_DEST represents the low half (DImode) of TImode operand 0
18544 LOW_IN1 represents the low half (DImode) of TImode operand 1
18545 LOW_IN2 represents the low half (DImode) of TImode operand 2
18546 HIGH_DEST represents the high half (DImode) of TImode operand 0
18547 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18548 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18549
18550 void
18551 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18552 rtx *low_in1, rtx *low_in2,
18553 rtx *high_dest, rtx *high_in1,
18554 rtx *high_in2)
18555 {
18556 *low_dest = gen_reg_rtx (DImode);
18557 *low_in1 = gen_lowpart (DImode, op1);
18558 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18559 subreg_lowpart_offset (DImode, TImode));
18560 *high_dest = gen_reg_rtx (DImode);
18561 *high_in1 = gen_highpart (DImode, op1);
18562 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18563 subreg_highpart_offset (DImode, TImode));
18564 }
18565
18566 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18567
18568 This function differs from 'aarch64_addti_scratch_regs' in that
18569 OP1 can be an immediate constant (zero). We must call
18570 subreg_highpart_offset with DImode and TImode arguments, otherwise
18571 VOIDmode will be used for the const_int which generates an internal
18572 error from subreg_size_highpart_offset which does not expect a size of zero.
18573
18574 OP1 represents the TImode destination operand 1
18575 OP2 represents the TImode destination operand 2
18576 LOW_DEST represents the low half (DImode) of TImode operand 0
18577 LOW_IN1 represents the low half (DImode) of TImode operand 1
18578 LOW_IN2 represents the low half (DImode) of TImode operand 2
18579 HIGH_DEST represents the high half (DImode) of TImode operand 0
18580 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18581 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18582
18583
18584 void
18585 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18586 rtx *low_in1, rtx *low_in2,
18587 rtx *high_dest, rtx *high_in1,
18588 rtx *high_in2)
18589 {
18590 *low_dest = gen_reg_rtx (DImode);
18591 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18592 subreg_lowpart_offset (DImode, TImode));
18593
18594 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18595 subreg_lowpart_offset (DImode, TImode));
18596 *high_dest = gen_reg_rtx (DImode);
18597
18598 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18599 subreg_highpart_offset (DImode, TImode));
18600 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18601 subreg_highpart_offset (DImode, TImode));
18602 }
18603
18604 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18605
18606 OP0 represents the TImode destination operand 0
18607 LOW_DEST represents the low half (DImode) of TImode operand 0
18608 LOW_IN1 represents the low half (DImode) of TImode operand 1
18609 LOW_IN2 represents the low half (DImode) of TImode operand 2
18610 HIGH_DEST represents the high half (DImode) of TImode operand 0
18611 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18612 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18613 UNSIGNED_P is true if the operation is being performed on unsigned
18614 values. */
18615 void
18616 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18617 rtx low_in2, rtx high_dest, rtx high_in1,
18618 rtx high_in2, bool unsigned_p)
18619 {
18620 if (low_in2 == const0_rtx)
18621 {
18622 low_dest = low_in1;
18623 high_in2 = force_reg (DImode, high_in2);
18624 if (unsigned_p)
18625 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18626 else
18627 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18628 }
18629 else
18630 {
18631 if (CONST_INT_P (low_in2))
18632 {
18633 high_in2 = force_reg (DImode, high_in2);
18634 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18635 GEN_INT (-INTVAL (low_in2))));
18636 }
18637 else
18638 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18639
18640 if (unsigned_p)
18641 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18642 else
18643 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18644 }
18645
18646 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18647 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18648
18649 }
18650
18651 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18652
18653 static unsigned HOST_WIDE_INT
18654 aarch64_asan_shadow_offset (void)
18655 {
18656 if (TARGET_ILP32)
18657 return (HOST_WIDE_INT_1 << 29);
18658 else
18659 return (HOST_WIDE_INT_1 << 36);
18660 }
18661
18662 static rtx
18663 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18664 int code, tree treeop0, tree treeop1)
18665 {
18666 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18667 rtx op0, op1;
18668 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18669 insn_code icode;
18670 struct expand_operand ops[4];
18671
18672 start_sequence ();
18673 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18674
18675 op_mode = GET_MODE (op0);
18676 if (op_mode == VOIDmode)
18677 op_mode = GET_MODE (op1);
18678
18679 switch (op_mode)
18680 {
18681 case E_QImode:
18682 case E_HImode:
18683 case E_SImode:
18684 cmp_mode = SImode;
18685 icode = CODE_FOR_cmpsi;
18686 break;
18687
18688 case E_DImode:
18689 cmp_mode = DImode;
18690 icode = CODE_FOR_cmpdi;
18691 break;
18692
18693 case E_SFmode:
18694 cmp_mode = SFmode;
18695 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18696 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18697 break;
18698
18699 case E_DFmode:
18700 cmp_mode = DFmode;
18701 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18702 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18703 break;
18704
18705 default:
18706 end_sequence ();
18707 return NULL_RTX;
18708 }
18709
18710 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18711 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18712 if (!op0 || !op1)
18713 {
18714 end_sequence ();
18715 return NULL_RTX;
18716 }
18717 *prep_seq = get_insns ();
18718 end_sequence ();
18719
18720 create_fixed_operand (&ops[0], op0);
18721 create_fixed_operand (&ops[1], op1);
18722
18723 start_sequence ();
18724 if (!maybe_expand_insn (icode, 2, ops))
18725 {
18726 end_sequence ();
18727 return NULL_RTX;
18728 }
18729 *gen_seq = get_insns ();
18730 end_sequence ();
18731
18732 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18733 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18734 }
18735
18736 static rtx
18737 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18738 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18739 {
18740 rtx op0, op1, target;
18741 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18742 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18743 insn_code icode;
18744 struct expand_operand ops[6];
18745 int aarch64_cond;
18746
18747 push_to_sequence (*prep_seq);
18748 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18749
18750 op_mode = GET_MODE (op0);
18751 if (op_mode == VOIDmode)
18752 op_mode = GET_MODE (op1);
18753
18754 switch (op_mode)
18755 {
18756 case E_QImode:
18757 case E_HImode:
18758 case E_SImode:
18759 cmp_mode = SImode;
18760 icode = CODE_FOR_ccmpsi;
18761 break;
18762
18763 case E_DImode:
18764 cmp_mode = DImode;
18765 icode = CODE_FOR_ccmpdi;
18766 break;
18767
18768 case E_SFmode:
18769 cmp_mode = SFmode;
18770 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18771 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18772 break;
18773
18774 case E_DFmode:
18775 cmp_mode = DFmode;
18776 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18777 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18778 break;
18779
18780 default:
18781 end_sequence ();
18782 return NULL_RTX;
18783 }
18784
18785 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18786 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18787 if (!op0 || !op1)
18788 {
18789 end_sequence ();
18790 return NULL_RTX;
18791 }
18792 *prep_seq = get_insns ();
18793 end_sequence ();
18794
18795 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18796 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18797
18798 if (bit_code != AND)
18799 {
18800 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18801 GET_MODE (XEXP (prev, 0))),
18802 VOIDmode, XEXP (prev, 0), const0_rtx);
18803 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18804 }
18805
18806 create_fixed_operand (&ops[0], XEXP (prev, 0));
18807 create_fixed_operand (&ops[1], target);
18808 create_fixed_operand (&ops[2], op0);
18809 create_fixed_operand (&ops[3], op1);
18810 create_fixed_operand (&ops[4], prev);
18811 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18812
18813 push_to_sequence (*gen_seq);
18814 if (!maybe_expand_insn (icode, 6, ops))
18815 {
18816 end_sequence ();
18817 return NULL_RTX;
18818 }
18819
18820 *gen_seq = get_insns ();
18821 end_sequence ();
18822
18823 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18824 }
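
/* As an illustrative sketch of how the two hooks above combine,
   assuming a source condition such as "a < 0 && b == 17" on DImode
   values: aarch64_gen_ccmp_first emits the initial "cmp x0, 0" and
   aarch64_gen_ccmp_next then emits a conditional compare along the
   lines of "ccmp x1, 17, #<nzcv>, lt", so the final branch can test a
   single condition in the flags register instead of using two
   separate compare-and-branch sequences.  */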
18825
18826 #undef TARGET_GEN_CCMP_FIRST
18827 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18828
18829 #undef TARGET_GEN_CCMP_NEXT
18830 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18831
18832 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18833 instruction fusion of some sort. */
18834
18835 static bool
18836 aarch64_macro_fusion_p (void)
18837 {
18838 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18839 }
18840
18841
18842 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18843 should be kept together during scheduling. */
18844
18845 static bool
18846 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18847 {
18848 rtx set_dest;
18849 rtx prev_set = single_set (prev);
18850 rtx curr_set = single_set (curr);
18851 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18852 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18853
18854 if (!aarch64_macro_fusion_p ())
18855 return false;
18856
18857 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18858 {
18859 /* We are trying to match:
18860 prev (mov) == (set (reg r0) (const_int imm16))
18861 curr (movk) == (set (zero_extract (reg r0)
18862 (const_int 16)
18863 (const_int 16))
18864 (const_int imm16_1)) */
18865
18866 set_dest = SET_DEST (curr_set);
18867
18868 if (GET_CODE (set_dest) == ZERO_EXTRACT
18869 && CONST_INT_P (SET_SRC (curr_set))
18870 && CONST_INT_P (SET_SRC (prev_set))
18871 && CONST_INT_P (XEXP (set_dest, 2))
18872 && INTVAL (XEXP (set_dest, 2)) == 16
18873 && REG_P (XEXP (set_dest, 0))
18874 && REG_P (SET_DEST (prev_set))
18875 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18876 {
18877 return true;
18878 }
18879 }
18880
18881 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18882 {
18883
18884 /* We're trying to match:
18885 prev (adrp) == (set (reg r1)
18886 (high (symbol_ref ("SYM"))))
18887 curr (add) == (set (reg r0)
18888 (lo_sum (reg r1)
18889 (symbol_ref ("SYM"))))
18890 Note that r0 need not necessarily be the same as r1, especially
18891 during pre-regalloc scheduling. */
18892
18893 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18894 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18895 {
18896 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18897 && REG_P (XEXP (SET_SRC (curr_set), 0))
18898 && REGNO (XEXP (SET_SRC (curr_set), 0))
18899 == REGNO (SET_DEST (prev_set))
18900 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18901 XEXP (SET_SRC (curr_set), 1)))
18902 return true;
18903 }
18904 }
18905
18906 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18907 {
18908
18909 /* We're trying to match:
18910 prev (movk) == (set (zero_extract (reg r0)
18911 (const_int 16)
18912 (const_int 32))
18913 (const_int imm16_1))
18914 curr (movk) == (set (zero_extract (reg r0)
18915 (const_int 16)
18916 (const_int 48))
18917 (const_int imm16_2)) */
18918
18919 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18920 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18921 && REG_P (XEXP (SET_DEST (prev_set), 0))
18922 && REG_P (XEXP (SET_DEST (curr_set), 0))
18923 && REGNO (XEXP (SET_DEST (prev_set), 0))
18924 == REGNO (XEXP (SET_DEST (curr_set), 0))
18925 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18926 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18927 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18928 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18929 && CONST_INT_P (SET_SRC (prev_set))
18930 && CONST_INT_P (SET_SRC (curr_set)))
18931 return true;
18932
18933 }
18934 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18935 {
18936 /* We're trying to match:
18937 prev (adrp) == (set (reg r0)
18938 (high (symbol_ref ("SYM"))))
18939 curr (ldr) == (set (reg r1)
18940 (mem (lo_sum (reg r0)
18941 (symbol_ref ("SYM")))))
18942 or
18943 curr (ldr) == (set (reg r1)
18944 (zero_extend (mem
18945 (lo_sum (reg r0)
18946 (symbol_ref ("SYM")))))) */
18947 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18948 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18949 {
18950 rtx curr_src = SET_SRC (curr_set);
18951
18952 if (GET_CODE (curr_src) == ZERO_EXTEND)
18953 curr_src = XEXP (curr_src, 0);
18954
18955 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18956 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18957 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18958 == REGNO (SET_DEST (prev_set))
18959 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18960 XEXP (SET_SRC (prev_set), 0)))
18961 return true;
18962 }
18963 }
18964
18965 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18966 && any_condjump_p (curr))
18967 {
18968 unsigned int condreg1, condreg2;
18969 rtx cc_reg_1;
18970 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18971 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18972
18973 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18974 && prev
18975 && modified_in_p (cc_reg_1, prev))
18976 {
18977 enum attr_type prev_type = get_attr_type (prev);
18978
18979 /* FIXME: this misses some instructions that ThunderX considers simple
18980 arithmetic; simple shifts are also missed here. */
18981 if (prev_type == TYPE_ALUS_SREG
18982 || prev_type == TYPE_ALUS_IMM
18983 || prev_type == TYPE_LOGICS_REG
18984 || prev_type == TYPE_LOGICS_IMM)
18985 return true;
18986 }
18987 }
18988
18989 if (prev_set
18990 && curr_set
18991 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18992 && any_condjump_p (curr))
18993 {
18994 /* We're trying to match:
18995 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18996 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18997 (const_int 0))
18998 (label_ref ("SYM"))
18999 (pc)) */
19000 if (SET_DEST (curr_set) == (pc_rtx)
19001 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19002 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19003 && REG_P (SET_DEST (prev_set))
19004 && REGNO (SET_DEST (prev_set))
19005 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19006 {
19007 /* Fuse ALU operations followed by conditional branch instruction. */
19008 switch (get_attr_type (prev))
19009 {
19010 case TYPE_ALU_IMM:
19011 case TYPE_ALU_SREG:
19012 case TYPE_ADC_REG:
19013 case TYPE_ADC_IMM:
19014 case TYPE_ADCS_REG:
19015 case TYPE_ADCS_IMM:
19016 case TYPE_LOGIC_REG:
19017 case TYPE_LOGIC_IMM:
19018 case TYPE_CSEL:
19019 case TYPE_ADR:
19020 case TYPE_MOV_IMM:
19021 case TYPE_SHIFT_REG:
19022 case TYPE_SHIFT_IMM:
19023 case TYPE_BFM:
19024 case TYPE_RBIT:
19025 case TYPE_REV:
19026 case TYPE_EXTEND:
19027 return true;
19028
19029 default:;
19030 }
19031 }
19032 }
19033
19034 return false;
19035 }
19036
19037 /* Return true iff the instruction fusion described by OP is enabled. */
19038
19039 bool
19040 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19041 {
19042 return (aarch64_tune_params.fusible_ops & op) != 0;
19043 }
19044
19045 /* If MEM is in the form of [base+offset], extract the two parts
19046 of the address and set them to BASE and OFFSET, otherwise return false
19047 after clearing BASE and OFFSET. */
19048
19049 bool
19050 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19051 {
19052 rtx addr;
19053
19054 gcc_assert (MEM_P (mem));
19055
19056 addr = XEXP (mem, 0);
19057
19058 if (REG_P (addr))
19059 {
19060 *base = addr;
19061 *offset = const0_rtx;
19062 return true;
19063 }
19064
19065 if (GET_CODE (addr) == PLUS
19066 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19067 {
19068 *base = XEXP (addr, 0);
19069 *offset = XEXP (addr, 1);
19070 return true;
19071 }
19072
19073 *base = NULL_RTX;
19074 *offset = NULL_RTX;
19075
19076 return false;
19077 }
19078
19079 /* Types for scheduling fusion. */
19080 enum sched_fusion_type
19081 {
19082 SCHED_FUSION_NONE = 0,
19083 SCHED_FUSION_LD_SIGN_EXTEND,
19084 SCHED_FUSION_LD_ZERO_EXTEND,
19085 SCHED_FUSION_LD,
19086 SCHED_FUSION_ST,
19087 SCHED_FUSION_NUM
19088 };
19089
19090 /* If INSN is a load or store of address in the form of [base+offset],
19091 extract the two parts and set them to BASE and OFFSET. Return the
19092 scheduling fusion type of this INSN. */
19093
19094 static enum sched_fusion_type
19095 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19096 {
19097 rtx x, dest, src;
19098 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19099
19100 gcc_assert (INSN_P (insn));
19101 x = PATTERN (insn);
19102 if (GET_CODE (x) != SET)
19103 return SCHED_FUSION_NONE;
19104
19105 src = SET_SRC (x);
19106 dest = SET_DEST (x);
19107
19108 machine_mode dest_mode = GET_MODE (dest);
19109
19110 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19111 return SCHED_FUSION_NONE;
19112
19113 if (GET_CODE (src) == SIGN_EXTEND)
19114 {
19115 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19116 src = XEXP (src, 0);
19117 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19118 return SCHED_FUSION_NONE;
19119 }
19120 else if (GET_CODE (src) == ZERO_EXTEND)
19121 {
19122 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19123 src = XEXP (src, 0);
19124 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19125 return SCHED_FUSION_NONE;
19126 }
19127
19128 if (GET_CODE (src) == MEM && REG_P (dest))
19129 extract_base_offset_in_addr (src, base, offset);
19130 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19131 {
19132 fusion = SCHED_FUSION_ST;
19133 extract_base_offset_in_addr (dest, base, offset);
19134 }
19135 else
19136 return SCHED_FUSION_NONE;
19137
19138 if (*base == NULL_RTX || *offset == NULL_RTX)
19139 fusion = SCHED_FUSION_NONE;
19140
19141 return fusion;
19142 }
19143
19144 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19145
19146 Currently we only support fusing ldr or str instructions, so FUSION_PRI
19147 and PRI are only calculated for these instructions. For other instructions,
19148 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
19149 other instruction types can be added by returning different priorities.
19150
19151 It's important that irrelevant instructions get the largest FUSION_PRI. */
19152
19153 static void
19154 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19155 int *fusion_pri, int *pri)
19156 {
19157 int tmp, off_val;
19158 rtx base, offset;
19159 enum sched_fusion_type fusion;
19160
19161 gcc_assert (INSN_P (insn));
19162
19163 tmp = max_pri - 1;
19164 fusion = fusion_load_store (insn, &base, &offset);
19165 if (fusion == SCHED_FUSION_NONE)
19166 {
19167 *pri = tmp;
19168 *fusion_pri = tmp;
19169 return;
19170 }
19171
19172 /* Set FUSION_PRI according to fusion type and base register. */
19173 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19174
19175 /* Calculate PRI. */
19176 tmp /= 2;
19177
19178 /* INSN with smaller offset goes first. */
19179 off_val = (int)(INTVAL (offset));
19180 if (off_val >= 0)
19181 tmp -= (off_val & 0xfffff);
19182 else
19183 tmp += ((- off_val) & 0xfffff);
19184
19185 *pri = tmp;
19186 return;
19187 }
19188
19189 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19190 Adjust priority of sha1h instructions so they are scheduled before
19191 other SHA1 instructions. */
19192
19193 static int
19194 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19195 {
19196 rtx x = PATTERN (insn);
19197
19198 if (GET_CODE (x) == SET)
19199 {
19200 x = SET_SRC (x);
19201
19202 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19203 return priority + 10;
19204 }
19205
19206 return priority;
19207 }
19208
19209 /* Given OPERANDS of consecutive load/store, check if we can merge
19210 them into ldp/stp. LOAD is true if they are load instructions.
19211 MODE is the mode of memory operands. */
19212
19213 bool
19214 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19215 machine_mode mode)
19216 {
19217 HOST_WIDE_INT offval_1, offval_2, msize;
19218 enum reg_class rclass_1, rclass_2;
19219 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19220
19221 if (load)
19222 {
19223 mem_1 = operands[1];
19224 mem_2 = operands[3];
19225 reg_1 = operands[0];
19226 reg_2 = operands[2];
19227 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19228 if (REGNO (reg_1) == REGNO (reg_2))
19229 return false;
19230 }
19231 else
19232 {
19233 mem_1 = operands[0];
19234 mem_2 = operands[2];
19235 reg_1 = operands[1];
19236 reg_2 = operands[3];
19237 }
19238
19239 /* The mems cannot be volatile. */
19240 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19241 return false;
19242
19243 /* If we have SImode and slow unaligned ldp,
19244 check that the alignment is at least 8 bytes. */
19245 if (mode == SImode
19246 && (aarch64_tune_params.extra_tuning_flags
19247 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19248 && !optimize_size
19249 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19250 return false;
19251
19252 /* Check if the addresses are in the form of [base+offset]. */
19253 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19254 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19255 return false;
19256 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19257 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19258 return false;
19259
19260 /* Check if the bases are same. */
19261 if (!rtx_equal_p (base_1, base_2))
19262 return false;
19263
19264 /* The operands must be of the same size. */
19265 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19266 GET_MODE_SIZE (GET_MODE (mem_2))));
19267
19268 offval_1 = INTVAL (offset_1);
19269 offval_2 = INTVAL (offset_2);
19270 /* We should only be trying this for fixed-sized modes. There is no
19271 SVE LDP/STP instruction. */
19272 msize = GET_MODE_SIZE (mode).to_constant ();
19273 /* Check if the offsets are consecutive. */
19274 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19275 return false;
19276
19277 /* Check if the addresses are clobbered by load. */
19278 if (load)
19279 {
19280 if (reg_mentioned_p (reg_1, mem_1))
19281 return false;
19282
19283 /* In increasing order, the last load can clobber the address. */
19284 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19285 return false;
19286 }
19287
19288 /* One of the memory accesses must be a mempair operand.
19289 If it is not the first one, they need to be swapped by the
19290 peephole. */
19291 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19292 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19293 return false;
19294
19295 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19296 rclass_1 = FP_REGS;
19297 else
19298 rclass_1 = GENERAL_REGS;
19299
19300 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19301 rclass_2 = FP_REGS;
19302 else
19303 rclass_2 = GENERAL_REGS;
19304
19305 /* Check if the registers are of same class. */
19306 if (rclass_1 != rclass_2)
19307 return false;
19308
19309 return true;
19310 }
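
/* For illustration, a pair such as

     ldr x0, [x2, 8]
     ldr x1, [x2, 16]

   satisfies the checks above (same base register, consecutive 8-byte
   offsets, distinct destination registers of the same class) and can
   later be emitted as "ldp x0, x1, [x2, 8]".  */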
19311
19312 /* Given OPERANDS of consecutive load/store that can be merged,
19313 swap them if they are not in ascending order. */
19314 void
19315 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19316 {
19317 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19318 HOST_WIDE_INT offval_1, offval_2;
19319
19320 if (load)
19321 {
19322 mem_1 = operands[1];
19323 mem_2 = operands[3];
19324 }
19325 else
19326 {
19327 mem_1 = operands[0];
19328 mem_2 = operands[2];
19329 }
19330
19331 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19332 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19333
19334 offval_1 = INTVAL (offset_1);
19335 offval_2 = INTVAL (offset_2);
19336
19337 if (offval_1 > offval_2)
19338 {
19339 /* Irrespective of whether this is a load or a store,
19340 we do the same swap. */
19341 std::swap (operands[0], operands[2]);
19342 std::swap (operands[1], operands[3]);
19343 }
19344 }
19345
19346 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of
19347 comparing the two values they point to. */
19348 int
19349 aarch64_host_wide_int_compare (const void *x, const void *y)
19350 {
19351 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19352 * ((const HOST_WIDE_INT *) y));
19353 }
19354
19355 /* Taking X and Y to be pairs of RTX, each consisting of a MEM rtx and
19356 a REG rtx, compare the offsets extracted from the addresses of the
19357 two MEMs.
19358
19359 Return:
19360
19361 1 iff offset (X) > offset (Y)
19362 0 iff offset (X) == offset (Y)
19363 -1 iff offset (X) < offset (Y) */
19364 int
19365 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19366 {
19367 const rtx * operands_1 = (const rtx *) x;
19368 const rtx * operands_2 = (const rtx *) y;
19369 rtx mem_1, mem_2, base, offset_1, offset_2;
19370
19371 if (MEM_P (operands_1[0]))
19372 mem_1 = operands_1[0];
19373 else
19374 mem_1 = operands_1[1];
19375
19376 if (MEM_P (operands_2[0]))
19377 mem_2 = operands_2[0];
19378 else
19379 mem_2 = operands_2[1];
19380
19381 /* Extract the offsets. */
19382 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19383 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19384
19385 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19386
19387 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19388 }
19389
19390 /* Given OPERANDS of consecutive load/store, check if we can merge
19391 them into ldp/stp by adjusting the offset. LOAD is true if they
19392 are load instructions. MODE is the mode of memory operands.
19393
19394 Given below consecutive stores:
19395
19396 str w1, [xb, 0x100]
19397 str w1, [xb, 0x104]
19398 str w1, [xb, 0x108]
19399 str w1, [xb, 0x10c]
19400
19401 Though the offsets are out of the range supported by stp, we can
19402 still pair them after adjusting the offset, like:
19403
19404 add scratch, xb, 0x100
19405 stp w1, w1, [scratch]
19406 stp w1, w1, [scratch, 0x8]
19407
19408 The peephole patterns detecting this opportunity should guarantee
19409 the scratch register is available. */
19410
19411 bool
19412 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19413 scalar_mode mode)
19414 {
19415 const int num_insns = 4;
19416 enum reg_class rclass;
19417 HOST_WIDE_INT offvals[num_insns], msize;
19418 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19419
19420 if (load)
19421 {
19422 for (int i = 0; i < num_insns; i++)
19423 {
19424 reg[i] = operands[2 * i];
19425 mem[i] = operands[2 * i + 1];
19426
19427 gcc_assert (REG_P (reg[i]));
19428 }
19429
19430 /* Do not attempt to merge the loads if the loads clobber each other. */
19431 for (int i = 0; i < 8; i += 2)
19432 for (int j = i + 2; j < 8; j += 2)
19433 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19434 return false;
19435 }
19436 else
19437 for (int i = 0; i < num_insns; i++)
19438 {
19439 mem[i] = operands[2 * i];
19440 reg[i] = operands[2 * i + 1];
19441 }
19442
19443 /* Skip if the memory operand is already valid for ldp/stp by itself. */
19444 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19445 return false;
19446
19447 for (int i = 0; i < num_insns; i++)
19448 {
19449 /* The mems cannot be volatile. */
19450 if (MEM_VOLATILE_P (mem[i]))
19451 return false;
19452
19453 /* Check if the addresses are in the form of [base+offset]. */
19454 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19455 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19456 return false;
19457 }
19458
19459 /* Check if the registers are of the same class. */
19460 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19461 ? FP_REGS : GENERAL_REGS;
19462
19463 for (int i = 1; i < num_insns; i++)
19464 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19465 {
19466 if (rclass != FP_REGS)
19467 return false;
19468 }
19469 else
19470 {
19471 if (rclass != GENERAL_REGS)
19472 return false;
19473 }
19474
19475 /* Only the last register in the order in which they occur
19476 may be clobbered by the load. */
19477 if (rclass == GENERAL_REGS && load)
19478 for (int i = 0; i < num_insns - 1; i++)
19479 if (reg_mentioned_p (reg[i], mem[i]))
19480 return false;
19481
19482 /* Check if the bases are the same. */
19483 for (int i = 0; i < num_insns - 1; i++)
19484 if (!rtx_equal_p (base[i], base[i + 1]))
19485 return false;
19486
19487 for (int i = 0; i < num_insns; i++)
19488 offvals[i] = INTVAL (offset[i]);
19489
19490 msize = GET_MODE_SIZE (mode);
19491
19492 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19493 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19494 aarch64_host_wide_int_compare);
19495
19496 if (!(offvals[1] == offvals[0] + msize
19497 && offvals[3] == offvals[2] + msize))
19498 return false;
19499
19500 /* Check that the offsets are within range of each other. The ldp/stp
19501 instructions have 7-bit signed scaled immediate offsets, so use 0x80. */
19502 if (offvals[2] - offvals[0] >= msize * 0x80)
19503 return false;
19504
19505 /* The offsets must be aligned with respect to each other. */
19506 if (offvals[0] % msize != offvals[2] % msize)
19507 return false;
19508
19509 /* If we have SImode and slow unaligned ldp,
19510 check that the alignment is at least 8 bytes. */
19511 if (mode == SImode
19512 && (aarch64_tune_params.extra_tuning_flags
19513 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19514 && !optimize_size
19515 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19516 return false;
19517
19518 return true;
19519 }
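
/* A worked example of the checks above, assuming four SImode stores at
   offsets 0x100, 0x104, 0x108 and 0x10c from the same base (as in the
   comment before this function): 0x100 lies outside the SImode ldp/stp
   immediate range of [-256, 252], so the accesses are not skipped as
   already pairable; the sorted offsets form two consecutive pairs; and
   0x108 - 0x100 == 8 is well below msize * 0x80 == 512, so the function
   returns true.  */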
19520
19521 /* Given OPERANDS of consecutive load/store, this function pairs them
19522 into LDP/STP after adjusting the offset. It depends on the fact
19523 that the operands can be sorted so the offsets are correct for STP.
19524 MODE is the mode of memory operands. CODE is the rtl operator
19525 which should be applied to all memory operands; it is SIGN_EXTEND,
19526 ZERO_EXTEND or UNKNOWN. */
19527
19528 bool
19529 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19530 scalar_mode mode, RTX_CODE code)
19531 {
19532 rtx base, offset_1, offset_3, t1, t2;
19533 rtx mem_1, mem_2, mem_3, mem_4;
19534 rtx temp_operands[8];
19535 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19536 stp_off_upper_limit, stp_off_lower_limit, msize;
19537
19538 /* We make changes on a copy as we may still bail out. */
19539 for (int i = 0; i < 8; i ++)
19540 temp_operands[i] = operands[i];
19541
19542 /* Sort the operands. */
19543 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19544
19545 /* Copy the memory operands so that if we have to bail for some
19546 reason the original addresses are unchanged. */
19547 if (load)
19548 {
19549 mem_1 = copy_rtx (temp_operands[1]);
19550 mem_2 = copy_rtx (temp_operands[3]);
19551 mem_3 = copy_rtx (temp_operands[5]);
19552 mem_4 = copy_rtx (temp_operands[7]);
19553 }
19554 else
19555 {
19556 mem_1 = copy_rtx (temp_operands[0]);
19557 mem_2 = copy_rtx (temp_operands[2]);
19558 mem_3 = copy_rtx (temp_operands[4]);
19559 mem_4 = copy_rtx (temp_operands[6]);
19560 gcc_assert (code == UNKNOWN);
19561 }
19562
19563 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19564 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19565 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19566 && offset_3 != NULL_RTX);
19567
19568 /* Adjust the offset so it can fit in an LDP/STP instruction. */
19569 msize = GET_MODE_SIZE (mode);
19570 stp_off_upper_limit = msize * (0x40 - 1);
19571 stp_off_lower_limit = - msize * 0x40;
19572
19573 off_val_1 = INTVAL (offset_1);
19574 off_val_3 = INTVAL (offset_3);
19575
19576 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19577 if (msize <= 4)
19578 base_off = (off_val_1 + off_val_3) / 2;
19579 else
19580 /* However, due to issues with negative LDP/STP offset generation for
19581 larger modes (DF, DI and vector modes), we must not use negative
19582 addresses smaller than 9 signed unadjusted bits can store. This
19583 provides the most range in this case. */
19584 base_off = off_val_1;
19585
19586 /* Adjust the base so that it is aligned with the addresses but still
19587 optimal. */
19588 if (base_off % msize != off_val_1 % msize)
19589 /* Fix the offset, bearing in mind we want to make it bigger not
19590 smaller. */
19591 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19592 else if (msize <= 4)
19593 /* The negative range of LDP/STP is one larger than the positive range. */
19594 base_off += msize;
19595
19596 /* Check if the base offset is too big or too small. We can attempt to resolve
19597 this issue by setting it to the maximum value and seeing if the offsets
19598 still fit. */
19599 if (base_off >= 0x1000)
19600 {
19601 base_off = 0x1000 - 1;
19602 /* We must still make sure that the base offset is aligned with respect
19603 to the address, but it may not be made any bigger. */
19604 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19605 }
19606
19607 /* Likewise for the case where the base is too small. */
19608 if (base_off <= -0x1000)
19609 {
19610 base_off = -0x1000 + 1;
19611 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19612 }
19613
19614 /* Offset of the first STP/LDP. */
19615 new_off_1 = off_val_1 - base_off;
19616
19617 /* Offset of the second STP/LDP. */
19618 new_off_3 = off_val_3 - base_off;
19619
19620 /* The offsets must be within the range of the LDP/STP instructions. */
19621 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19622 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19623 return false;
19624
19625 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19626 new_off_1), true);
19627 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19628 new_off_1 + msize), true);
19629 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19630 new_off_3), true);
19631 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19632 new_off_3 + msize), true);
19633
19634 if (!aarch64_mem_pair_operand (mem_1, mode)
19635 || !aarch64_mem_pair_operand (mem_3, mode))
19636 return false;
19637
19638 if (code == ZERO_EXTEND)
19639 {
19640 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19641 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19642 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19643 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19644 }
19645 else if (code == SIGN_EXTEND)
19646 {
19647 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19648 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19649 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19650 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19651 }
19652
19653 if (load)
19654 {
19655 operands[0] = temp_operands[0];
19656 operands[1] = mem_1;
19657 operands[2] = temp_operands[2];
19658 operands[3] = mem_2;
19659 operands[4] = temp_operands[4];
19660 operands[5] = mem_3;
19661 operands[6] = temp_operands[6];
19662 operands[7] = mem_4;
19663 }
19664 else
19665 {
19666 operands[0] = mem_1;
19667 operands[1] = temp_operands[1];
19668 operands[2] = mem_2;
19669 operands[3] = temp_operands[3];
19670 operands[4] = mem_3;
19671 operands[5] = temp_operands[5];
19672 operands[6] = mem_4;
19673 operands[7] = temp_operands[7];
19674 }
19675
19676 /* Emit adjusting instruction. */
19677 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19678 /* Emit ldp/stp instructions. */
19679 t1 = gen_rtx_SET (operands[0], operands[1]);
19680 t2 = gen_rtx_SET (operands[2], operands[3]);
19681 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19682 t1 = gen_rtx_SET (operands[4], operands[5]);
19683 t2 = gen_rtx_SET (operands[6], operands[7]);
19684 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19685 return true;
19686 }
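
/* A worked example (illustrative, continuing the four SImode stores at
   offsets 0x100..0x10c described before aarch64_operands_adjust_ok_for_ldpstp):
   msize == 4, so base_off starts as (0x100 + 0x108) / 2 == 0x104 and is then
   bumped by msize to 0x108 to favour the larger negative range, giving

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]

   with both new offsets (-8 and 0) inside the SImode range of [-256, 252].
   The base chosen here differs from the simplified sequence shown in that
   earlier comment, but both are valid.  */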
19687
19688 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19689 it isn't worth branching around empty masked ops (including masked
19690 stores). */
19691
19692 static bool
19693 aarch64_empty_mask_is_expensive (unsigned)
19694 {
19695 return false;
19696 }
19697
19698 /* Return true if a pseudo register should be created and used to hold
19699 the GOT address for PIC code. */
19700
19701 bool
19702 aarch64_use_pseudo_pic_reg (void)
19703 {
19704 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19705 }
19706
19707 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19708
19709 static int
19710 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19711 {
19712 switch (XINT (x, 1))
19713 {
19714 case UNSPEC_GOTSMALLPIC:
19715 case UNSPEC_GOTSMALLPIC28K:
19716 case UNSPEC_GOTTINYPIC:
19717 return 0;
19718 default:
19719 break;
19720 }
19721
19722 return default_unspec_may_trap_p (x, flags);
19723 }
19724
19725
19726 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19727 return the log2 of that value. Otherwise return -1. */
19728
19729 int
19730 aarch64_fpconst_pow_of_2 (rtx x)
19731 {
19732 const REAL_VALUE_TYPE *r;
19733
19734 if (!CONST_DOUBLE_P (x))
19735 return -1;
19736
19737 r = CONST_DOUBLE_REAL_VALUE (x);
19738
19739 if (REAL_VALUE_NEGATIVE (*r)
19740 || REAL_VALUE_ISNAN (*r)
19741 || REAL_VALUE_ISINF (*r)
19742 || !real_isinteger (r, DFmode))
19743 return -1;
19744
19745 return exact_log2 (real_to_integer (r));
19746 }
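
/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.5,
   -4.0 and 6.0 all yield -1.  */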
19747
19748 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
19749 power of 2 (i.e. 1/2^n), return the number of fractional bits n; e.g. for
19750 x == 1/2^n return n. Otherwise return -1. */
19751
19752 int
19753 aarch64_fpconst_pow2_recip (rtx x)
19754 {
19755 REAL_VALUE_TYPE r0;
19756
19757 if (!CONST_DOUBLE_P (x))
19758 return -1;
19759
19760 r0 = *CONST_DOUBLE_REAL_VALUE (x);
19761 if (exact_real_inverse (DFmode, &r0)
19762 && !REAL_VALUE_NEGATIVE (r0))
19763 {
19764 int ret = exact_log2 (real_to_integer (&r0));
19765 if (ret >= 1 && ret <= 32)
19766 return ret;
19767 }
19768 return -1;
19769 }
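
/* For example (illustrative): 0.25 yields 2 and 0.125 yields 3, while values
   such as 1.0, 2.0 and 3.0 yield -1 (the accepted exponent range is
   1 to 32).  */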
19770
19771 /* If X is a vector of equal CONST_DOUBLE values and that value is
19772 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19773
19774 int
19775 aarch64_vec_fpconst_pow_of_2 (rtx x)
19776 {
19777 int nelts;
19778 if (GET_CODE (x) != CONST_VECTOR
19779 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19780 return -1;
19781
19782 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19783 return -1;
19784
19785 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19786 if (firstval <= 0)
19787 return -1;
19788
19789 for (int i = 1; i < nelts; i++)
19790 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19791 return -1;
19792
19793 return firstval;
19794 }
19795
19796 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19797 to float.
19798
19799 __fp16 always promotes through this hook.
19800 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19801 through the generic excess precision logic rather than here. */
19802
19803 static tree
19804 aarch64_promoted_type (const_tree t)
19805 {
19806 if (SCALAR_FLOAT_TYPE_P (t)
19807 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19808 return float_type_node;
19809
19810 return NULL_TREE;
19811 }
19812
19813 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19814
19815 static bool
19816 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19817 optimization_type opt_type)
19818 {
19819 switch (op)
19820 {
19821 case rsqrt_optab:
19822 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19823
19824 default:
19825 return true;
19826 }
19827 }
19828
19829 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19830
19831 static unsigned int
19832 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19833 int *offset)
19834 {
19835 /* Polynomial invariant 1 == (VG / 2) - 1. */
19836 gcc_assert (i == 1);
19837 *factor = 2;
19838 *offset = 1;
19839 return AARCH64_DWARF_VG;
19840 }
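
/* A worked example (assuming 256-bit SVE vectors at run time): VG is 4, so
   indeterminate 1 evaluates to 4 / 2 - 1 == 1, and a poly_int64 of (16, 16)
   -- the byte size of an SVE data vector -- evaluates to 16 + 16 * 1 == 32
   bytes in the resulting DWARF expression.  */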
19841
19842 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19843 if MODE is HFmode, and punt to the generic implementation otherwise. */
19844
19845 static bool
19846 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19847 {
19848 return (mode == HFmode
19849 ? true
19850 : default_libgcc_floating_mode_supported_p (mode));
19851 }
19852
19853 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19854 if MODE is HFmode, and punt to the generic implementation otherwise. */
19855
19856 static bool
19857 aarch64_scalar_mode_supported_p (scalar_mode mode)
19858 {
19859 return (mode == HFmode
19860 ? true
19861 : default_scalar_mode_supported_p (mode));
19862 }
19863
19864 /* Set the value of FLT_EVAL_METHOD.
19865 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19866
19867 0: evaluate all operations and constants, whose semantic type has at
19868 most the range and precision of type float, to the range and
19869 precision of float; evaluate all other operations and constants to
19870 the range and precision of the semantic type;
19871
19872 N, where _FloatN is a supported interchange floating type:
19873 evaluate all operations and constants, whose semantic type has at
19874 most the range and precision of _FloatN type, to the range and
19875 precision of the _FloatN type; evaluate all other operations and
19876 constants to the range and precision of the semantic type;
19877
19878 If we have the ARMv8.2-A extensions then we support _Float16 in native
19879 precision, so we should set this to 16. Otherwise, we support the type,
19880 but want to evaluate expressions in float precision, so set this to
19881 0. */
19882
19883 static enum flt_eval_method
19884 aarch64_excess_precision (enum excess_precision_type type)
19885 {
19886 switch (type)
19887 {
19888 case EXCESS_PRECISION_TYPE_FAST:
19889 case EXCESS_PRECISION_TYPE_STANDARD:
19890 /* We can calculate either in 16-bit range and precision or
19891 32-bit range and precision. Make that decision based on whether
19892 we have native support for the ARMv8.2-A 16-bit floating-point
19893 instructions or not. */
19894 return (TARGET_FP_F16INST
19895 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19896 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19897 case EXCESS_PRECISION_TYPE_IMPLICIT:
19898 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19899 default:
19900 gcc_unreachable ();
19901 }
19902 return FLT_EVAL_METHOD_UNPREDICTABLE;
19903 }
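
/* An illustrative C-level consequence (hypothetical user code): with
   TARGET_FP_F16INST the expression below is evaluated directly in _Float16
   precision (FLT_EVAL_METHOD == 16); without it, the operands are promoted
   to float and the result is converted back on assignment
   (FLT_EVAL_METHOD == 0):

     _Float16 x, y, z;
     z = x * y + z;  */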
19904
19905 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19906 scheduled for speculative execution. Reject the long-running division
19907 and square-root instructions. */
19908
19909 static bool
19910 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19911 {
19912 switch (get_attr_type (insn))
19913 {
19914 case TYPE_SDIV:
19915 case TYPE_UDIV:
19916 case TYPE_FDIVS:
19917 case TYPE_FDIVD:
19918 case TYPE_FSQRTS:
19919 case TYPE_FSQRTD:
19920 case TYPE_NEON_FP_SQRT_S:
19921 case TYPE_NEON_FP_SQRT_D:
19922 case TYPE_NEON_FP_SQRT_S_Q:
19923 case TYPE_NEON_FP_SQRT_D_Q:
19924 case TYPE_NEON_FP_DIV_S:
19925 case TYPE_NEON_FP_DIV_D:
19926 case TYPE_NEON_FP_DIV_S_Q:
19927 case TYPE_NEON_FP_DIV_D_Q:
19928 return false;
19929 default:
19930 return true;
19931 }
19932 }
19933
19934 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19935
19936 static int
19937 aarch64_compute_pressure_classes (reg_class *classes)
19938 {
19939 int i = 0;
19940 classes[i++] = GENERAL_REGS;
19941 classes[i++] = FP_REGS;
19942 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19943 registers need to go in PR_LO_REGS at some point during their
19944 lifetime. Splitting it into two halves has the effect of making
19945 all predicates count against PR_LO_REGS, so that we try whenever
19946 possible to restrict the number of live predicates to 8. This
19947 greatly reduces the amount of spilling in certain loops. */
19948 classes[i++] = PR_LO_REGS;
19949 classes[i++] = PR_HI_REGS;
19950 return i;
19951 }
19952
19953 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19954
19955 static bool
19956 aarch64_can_change_mode_class (machine_mode from,
19957 machine_mode to, reg_class_t)
19958 {
19959 if (BYTES_BIG_ENDIAN)
19960 {
19961 bool from_sve_p = aarch64_sve_data_mode_p (from);
19962 bool to_sve_p = aarch64_sve_data_mode_p (to);
19963
19964 /* Don't allow changes between SVE data modes and non-SVE modes.
19965 See the comment at the head of aarch64-sve.md for details. */
19966 if (from_sve_p != to_sve_p)
19967 return false;
19968
19969 /* Don't allow changes in element size: lane 0 of the new vector
19970 would not then be lane 0 of the old vector. See the comment
19971 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19972 description.
19973
19974 In the worst case, this forces a register to be spilled in
19975 one mode and reloaded in the other, which handles the
19976 endianness correctly. */
19977 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19978 return false;
19979 }
19980 return true;
19981 }
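
/* For example (big-endian only, illustrative): a mode change from VNx8HI to
   VNx4SI is rejected because the element sizes differ (2 vs. 4 bytes), and a
   change from VNx4SI to V4SI is rejected because it mixes SVE and non-SVE
   modes; on little-endian targets both changes are allowed.  */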
19982
19983 /* Implement TARGET_EARLY_REMAT_MODES. */
19984
19985 static void
19986 aarch64_select_early_remat_modes (sbitmap modes)
19987 {
19988 /* SVE values are not normally live across a call, so it should be
19989 worth doing early rematerialization even in VL-specific mode. */
19990 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19991 if (aarch64_sve_mode_p ((machine_mode) i))
19992 bitmap_set_bit (modes, i);
19993 }
19994
19995 /* Override the default target speculation_safe_value. */
19996 static rtx
19997 aarch64_speculation_safe_value (machine_mode mode,
19998 rtx result, rtx val, rtx failval)
19999 {
20000 /* Maybe we should warn if falling back to hard barriers. They are
20001 likely to be noticeably more expensive than the alternative below. */
20002 if (!aarch64_track_speculation)
20003 return default_speculation_safe_value (mode, result, val, failval);
20004
20005 if (!REG_P (val))
20006 val = copy_to_mode_reg (mode, val);
20007
20008 if (!aarch64_reg_or_zero (failval, mode))
20009 failval = copy_to_mode_reg (mode, failval);
20010
20011 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20012 return result;
20013 }
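
/* A minimal usage sketch (illustrative): user code such as

     v = __builtin_speculation_safe_value (v, 0);

   reaches this hook; with -mtrack-speculation it expands to the
   despeculate_copy sequence above, otherwise it falls back to the generic
   hard speculation barrier.  */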
20014
20015 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20016 Look into the tuning structure for an estimate.
20017 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20018 Advanced SIMD 128 bits. */
20019
20020 static HOST_WIDE_INT
20021 aarch64_estimated_poly_value (poly_int64 val)
20022 {
20023 enum aarch64_sve_vector_bits_enum width_source
20024 = aarch64_tune_params.sve_width;
20025
20026 /* If we still don't have an estimate, use the default. */
20027 if (width_source == SVE_SCALABLE)
20028 return default_estimated_poly_value (val);
20029
20030 HOST_WIDE_INT over_128 = width_source - 128;
20031 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20032 }
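
/* Worked example (assuming a tuning with sve_width == SVE_256): over_128 is
   128, so a poly_int64 of (16, 16) -- e.g. the byte size of an SVE vector --
   is estimated as 16 + 16 * 128 / 128 == 32.  */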
20033
20034
20035 /* Return true for types that could be supported as SIMD return or
20036 argument types. */
20037
20038 static bool
20039 supported_simd_type (tree t)
20040 {
20041 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20042 {
20043 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20044 return s == 1 || s == 2 || s == 4 || s == 8;
20045 }
20046 return false;
20047 }
20048
20049 /* Return true for types that currently are supported as SIMD return
20050 or argument types. */
20051
20052 static bool
20053 currently_supported_simd_type (tree t, tree b)
20054 {
20055 if (COMPLEX_FLOAT_TYPE_P (t))
20056 return false;
20057
20058 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20059 return false;
20060
20061 return supported_simd_type (t);
20062 }
20063
20064 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20065
20066 static int
20067 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20068 struct cgraph_simd_clone *clonei,
20069 tree base_type, int num)
20070 {
20071 tree t, ret_type, arg_type;
20072 unsigned int elt_bits, vec_bits, count;
20073
20074 if (!TARGET_SIMD)
20075 return 0;
20076
20077 if (clonei->simdlen
20078 && (clonei->simdlen < 2
20079 || clonei->simdlen > 1024
20080 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20081 {
20082 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20083 "unsupported simdlen %d", clonei->simdlen);
20084 return 0;
20085 }
20086
20087 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20088 if (TREE_CODE (ret_type) != VOID_TYPE
20089 && !currently_supported_simd_type (ret_type, base_type))
20090 {
20091 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20092 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20093 "GCC does not currently support mixed size types "
20094 "for %<simd%> functions");
20095 else if (supported_simd_type (ret_type))
20096 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20097 "GCC does not currently support return type %qT "
20098 "for %<simd%> functions", ret_type);
20099 else
20100 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20101 "unsupported return type %qT for %<simd%> functions",
20102 ret_type);
20103 return 0;
20104 }
20105
20106 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20107 {
20108 arg_type = TREE_TYPE (t);
20109
20110 if (!currently_supported_simd_type (arg_type, base_type))
20111 {
20112 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20113 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20114 "GCC does not currently support mixed size types "
20115 "for %<simd%> functions");
20116 else
20117 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20118 "GCC does not currently support argument type %qT "
20119 "for %<simd%> functions", arg_type);
20120 return 0;
20121 }
20122 }
20123
20124 clonei->vecsize_mangle = 'n';
20125 clonei->mask_mode = VOIDmode;
20126 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20127 if (clonei->simdlen == 0)
20128 {
20129 count = 2;
20130 vec_bits = (num == 0 ? 64 : 128);
20131 clonei->simdlen = vec_bits / elt_bits;
20132 }
20133 else
20134 {
20135 count = 1;
20136 vec_bits = clonei->simdlen * elt_bits;
20137 if (vec_bits != 64 && vec_bits != 128)
20138 {
20139 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20140 "GCC does not currently support simdlen %d for type %qT",
20141 clonei->simdlen, base_type);
20142 return 0;
20143 }
20144 }
20145 clonei->vecsize_int = vec_bits;
20146 clonei->vecsize_float = vec_bits;
20147 return count;
20148 }
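
/* For instance (illustrative), a float function marked with
   "#pragma omp declare simd" and no explicit simdlen gets two 'n'-mangled
   clones from this hook: one using 64-bit vectors (simdlen 2) and one using
   128-bit vectors (simdlen 4), since elt_bits == 32.  */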
20149
20150 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20151
20152 static void
20153 aarch64_simd_clone_adjust (struct cgraph_node *node)
20154 {
20155 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20156 use the correct ABI. */
20157
20158 tree t = TREE_TYPE (node->decl);
20159 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20160 TYPE_ATTRIBUTES (t));
20161 }
20162
20163 /* Implement TARGET_SIMD_CLONE_USABLE. */
20164
20165 static int
20166 aarch64_simd_clone_usable (struct cgraph_node *node)
20167 {
20168 switch (node->simdclone->vecsize_mangle)
20169 {
20170 case 'n':
20171 if (!TARGET_SIMD)
20172 return -1;
20173 return 0;
20174 default:
20175 gcc_unreachable ();
20176 }
20177 }
20178
20179 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20180
20181 static int
20182 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20183 {
20184 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20185 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20186 return 0;
20187 return 1;
20188 }
20189
20190 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20191
20192 static const char *
20193 aarch64_get_multilib_abi_name (void)
20194 {
20195 if (TARGET_BIG_END)
20196 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20197 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20198 }
20199
20200 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20201 global-variable-based guard, use the default; otherwise
20202 return a null tree. */
20203 static tree
20204 aarch64_stack_protect_guard (void)
20205 {
20206 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20207 return default_stack_protect_guard ();
20208
20209 return NULL_TREE;
20210 }
20211
20212 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20213 section at the end if needed. */
20214 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20215 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20216 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20217 void
20218 aarch64_file_end_indicate_exec_stack ()
20219 {
20220 file_end_indicate_exec_stack ();
20221
20222 unsigned feature_1_and = 0;
20223 if (aarch64_bti_enabled ())
20224 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20225
20226 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20227 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20228
20229 if (feature_1_and)
20230 {
20231 /* Generate .note.gnu.property section. */
20232 switch_to_section (get_section (".note.gnu.property",
20233 SECTION_NOTYPE, NULL));
20234
20235 /* PT_NOTE header: namesz, descsz, type.
20236 namesz = 4 ("GNU\0")
20237 descsz = 16 (Size of the program property array)
20238 [(12 + padding) * Number of array elements]
20239 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20240 assemble_align (POINTER_SIZE);
20241 assemble_integer (GEN_INT (4), 4, 32, 1);
20242 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20243 assemble_integer (GEN_INT (5), 4, 32, 1);
20244
20245 /* PT_NOTE name. */
20246 assemble_string ("GNU", 4);
20247
20248 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20249 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20250 datasz = 4
20251 data = feature_1_and. */
20252 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20253 assemble_integer (GEN_INT (4), 4, 32, 1);
20254 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20255
20256 /* Pad the size of the note to the required alignment. */
20257 assemble_align (POINTER_SIZE);
20258 }
20259 }
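
/* For reference (illustrative), with only BTI enabled the note emitted above
   has this layout (32-bit words, padded to pointer alignment):

     namesz = 4, descsz = 16, type = 5 (NT_GNU_PROPERTY_TYPE_0)
     name = "GNU\0"
     pr_type = GNU_PROPERTY_AARCH64_FEATURE_1_AND (0xc0000000)
     pr_datasz = 4
     pr_data = GNU_PROPERTY_AARCH64_FEATURE_1_BTI (0x1)  */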
20260 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20261 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20262 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20263
20264 /* Target-specific selftests. */
20265
20266 #if CHECKING_P
20267
20268 namespace selftest {
20269
20270 /* Selftest for the RTL loader.
20271 Verify that the RTL loader copes with a dump from
20272 print_rtx_function. This is essentially just a test that class
20273 function_reader can handle a real dump, but it also verifies
20274 that lookup_reg_by_dump_name correctly handles hard regs.
20275 The presence of hard reg names in the dump means that the test is
20276 target-specific, hence it is in this file. */
20277
20278 static void
20279 aarch64_test_loading_full_dump ()
20280 {
20281 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20282
20283 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20284
20285 rtx_insn *insn_1 = get_insn_by_uid (1);
20286 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20287
20288 rtx_insn *insn_15 = get_insn_by_uid (15);
20289 ASSERT_EQ (INSN, GET_CODE (insn_15));
20290 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20291
20292 /* Verify crtl->return_rtx. */
20293 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20294 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20295 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20296 }
20297
20298 /* Run all target-specific selftests. */
20299
20300 static void
20301 aarch64_run_selftests (void)
20302 {
20303 aarch64_test_loading_full_dump ();
20304 }
20305
20306 } // namespace selftest
20307
20308 #endif /* #if CHECKING_P */
20309
20310 #undef TARGET_STACK_PROTECT_GUARD
20311 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20312
20313 #undef TARGET_ADDRESS_COST
20314 #define TARGET_ADDRESS_COST aarch64_address_cost
20315
20316 /* This hook determines whether unnamed bitfields affect the alignment
20317 of the containing structure. The hook returns true if the structure
20318 should inherit the alignment requirements of an unnamed bitfield's
20319 type. */
20320 #undef TARGET_ALIGN_ANON_BITFIELD
20321 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20322
20323 #undef TARGET_ASM_ALIGNED_DI_OP
20324 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20325
20326 #undef TARGET_ASM_ALIGNED_HI_OP
20327 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20328
20329 #undef TARGET_ASM_ALIGNED_SI_OP
20330 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20331
20332 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20333 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20334 hook_bool_const_tree_hwi_hwi_const_tree_true
20335
20336 #undef TARGET_ASM_FILE_START
20337 #define TARGET_ASM_FILE_START aarch64_start_file
20338
20339 #undef TARGET_ASM_OUTPUT_MI_THUNK
20340 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20341
20342 #undef TARGET_ASM_SELECT_RTX_SECTION
20343 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20344
20345 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20346 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20347
20348 #undef TARGET_BUILD_BUILTIN_VA_LIST
20349 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20350
20351 #undef TARGET_CALLEE_COPIES
20352 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20353
20354 #undef TARGET_CAN_ELIMINATE
20355 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20356
20357 #undef TARGET_CAN_INLINE_P
20358 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20359
20360 #undef TARGET_CANNOT_FORCE_CONST_MEM
20361 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20362
20363 #undef TARGET_CASE_VALUES_THRESHOLD
20364 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20365
20366 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20367 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20368
20369 /* Only the least significant bit is used for initialization guard
20370 variables. */
20371 #undef TARGET_CXX_GUARD_MASK_BIT
20372 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20373
20374 #undef TARGET_C_MODE_FOR_SUFFIX
20375 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20376
20377 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20378 #undef TARGET_DEFAULT_TARGET_FLAGS
20379 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20380 #endif
20381
20382 #undef TARGET_CLASS_MAX_NREGS
20383 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20384
20385 #undef TARGET_BUILTIN_DECL
20386 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20387
20388 #undef TARGET_BUILTIN_RECIPROCAL
20389 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20390
20391 #undef TARGET_C_EXCESS_PRECISION
20392 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20393
20394 #undef TARGET_EXPAND_BUILTIN
20395 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20396
20397 #undef TARGET_EXPAND_BUILTIN_VA_START
20398 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20399
20400 #undef TARGET_FOLD_BUILTIN
20401 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20402
20403 #undef TARGET_FUNCTION_ARG
20404 #define TARGET_FUNCTION_ARG aarch64_function_arg
20405
20406 #undef TARGET_FUNCTION_ARG_ADVANCE
20407 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20408
20409 #undef TARGET_FUNCTION_ARG_BOUNDARY
20410 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20411
20412 #undef TARGET_FUNCTION_ARG_PADDING
20413 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20414
20415 #undef TARGET_GET_RAW_RESULT_MODE
20416 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20417 #undef TARGET_GET_RAW_ARG_MODE
20418 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20419
20420 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20421 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20422
20423 #undef TARGET_FUNCTION_VALUE
20424 #define TARGET_FUNCTION_VALUE aarch64_function_value
20425
20426 #undef TARGET_FUNCTION_VALUE_REGNO_P
20427 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20428
20429 #undef TARGET_GIMPLE_FOLD_BUILTIN
20430 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20431
20432 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20433 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20434
20435 #undef TARGET_INIT_BUILTINS
20436 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20437
20438 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20439 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20440 aarch64_ira_change_pseudo_allocno_class
20441
20442 #undef TARGET_LEGITIMATE_ADDRESS_P
20443 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20444
20445 #undef TARGET_LEGITIMATE_CONSTANT_P
20446 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20447
20448 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20449 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20450 aarch64_legitimize_address_displacement
20451
20452 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20453 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20454
20455 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20456 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20457 aarch64_libgcc_floating_mode_supported_p
20458
20459 #undef TARGET_MANGLE_TYPE
20460 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20461
20462 #undef TARGET_MEMORY_MOVE_COST
20463 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20464
20465 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20466 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20467
20468 #undef TARGET_MUST_PASS_IN_STACK
20469 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20470
20471 /* This target hook should return true if accesses to volatile bitfields
20472 should use the narrowest mode possible. It should return false if these
20473 accesses should use the bitfield container type. */
20474 #undef TARGET_NARROW_VOLATILE_BITFIELD
20475 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20476
20477 #undef TARGET_OPTION_OVERRIDE
20478 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20479
20480 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20481 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20482 aarch64_override_options_after_change
20483
20484 #undef TARGET_OPTION_SAVE
20485 #define TARGET_OPTION_SAVE aarch64_option_save
20486
20487 #undef TARGET_OPTION_RESTORE
20488 #define TARGET_OPTION_RESTORE aarch64_option_restore
20489
20490 #undef TARGET_OPTION_PRINT
20491 #define TARGET_OPTION_PRINT aarch64_option_print
20492
20493 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20494 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20495
20496 #undef TARGET_SET_CURRENT_FUNCTION
20497 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20498
20499 #undef TARGET_PASS_BY_REFERENCE
20500 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20501
20502 #undef TARGET_PREFERRED_RELOAD_CLASS
20503 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20504
20505 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20506 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20507
20508 #undef TARGET_PROMOTED_TYPE
20509 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20510
20511 #undef TARGET_SECONDARY_RELOAD
20512 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20513
20514 #undef TARGET_SHIFT_TRUNCATION_MASK
20515 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20516
20517 #undef TARGET_SETUP_INCOMING_VARARGS
20518 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20519
20520 #undef TARGET_STRUCT_VALUE_RTX
20521 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20522
20523 #undef TARGET_REGISTER_MOVE_COST
20524 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20525
20526 #undef TARGET_RETURN_IN_MEMORY
20527 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20528
20529 #undef TARGET_RETURN_IN_MSB
20530 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20531
20532 #undef TARGET_RTX_COSTS
20533 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20534
20535 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20536 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20537
20538 #undef TARGET_SCHED_ISSUE_RATE
20539 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20540
20541 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20542 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20543 aarch64_sched_first_cycle_multipass_dfa_lookahead
20544
20545 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20546 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20547 aarch64_first_cycle_multipass_dfa_lookahead_guard
20548
20549 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20550 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20551 aarch64_get_separate_components
20552
20553 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20554 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20555 aarch64_components_for_bb
20556
20557 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20558 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20559 aarch64_disqualify_components
20560
20561 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20562 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20563 aarch64_emit_prologue_components
20564
20565 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20566 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20567 aarch64_emit_epilogue_components
20568
20569 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20570 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20571 aarch64_set_handled_components
20572
20573 #undef TARGET_TRAMPOLINE_INIT
20574 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20575
20576 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20577 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20578
20579 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20580 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20581
20582 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20583 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20584 aarch64_builtin_support_vector_misalignment
20585
20586 #undef TARGET_ARRAY_MODE
20587 #define TARGET_ARRAY_MODE aarch64_array_mode
20588
20589 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20590 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20591
20592 #undef TARGET_VECTORIZE_ADD_STMT_COST
20593 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20594
20595 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20596 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20597 aarch64_builtin_vectorization_cost
20598
20599 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20600 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20601
20602 #undef TARGET_VECTORIZE_BUILTINS
20603 #define TARGET_VECTORIZE_BUILTINS
20604
20605 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20606 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20607 aarch64_builtin_vectorized_function
20608
20609 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20610 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20611 aarch64_autovectorize_vector_sizes
20612
20613 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20614 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20615 aarch64_atomic_assign_expand_fenv
20616
20617 /* Section anchor support. */
20618
20619 #undef TARGET_MIN_ANCHOR_OFFSET
20620 #define TARGET_MIN_ANCHOR_OFFSET -256
20621
20622 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20623 byte offset; we can do much more for larger data types, but have no way
20624 to determine the size of the access. We assume accesses are aligned. */
20625 #undef TARGET_MAX_ANCHOR_OFFSET
20626 #define TARGET_MAX_ANCHOR_OFFSET 4095
20627
20628 #undef TARGET_VECTOR_ALIGNMENT
20629 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20630
20631 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20632 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20633 aarch64_vectorize_preferred_vector_alignment
20634 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20635 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20636 aarch64_simd_vector_alignment_reachable
20637
20638 /* vec_perm support. */
20639
20640 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20641 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20642 aarch64_vectorize_vec_perm_const
20643
20644 #undef TARGET_VECTORIZE_GET_MASK_MODE
20645 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20646 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20647 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20648 aarch64_empty_mask_is_expensive
20649 #undef TARGET_PREFERRED_ELSE_VALUE
20650 #define TARGET_PREFERRED_ELSE_VALUE \
20651 aarch64_preferred_else_value
20652
20653 #undef TARGET_INIT_LIBFUNCS
20654 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20655
20656 #undef TARGET_FIXED_CONDITION_CODE_REGS
20657 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20658
20659 #undef TARGET_FLAGS_REGNUM
20660 #define TARGET_FLAGS_REGNUM CC_REGNUM
20661
20662 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20663 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20664
20665 #undef TARGET_ASAN_SHADOW_OFFSET
20666 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20667
20668 #undef TARGET_LEGITIMIZE_ADDRESS
20669 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20670
20671 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20672 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20673
20674 #undef TARGET_CAN_USE_DOLOOP_P
20675 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20676
20677 #undef TARGET_SCHED_ADJUST_PRIORITY
20678 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20679
20680 #undef TARGET_SCHED_MACRO_FUSION_P
20681 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20682
20683 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20684 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20685
20686 #undef TARGET_SCHED_FUSION_PRIORITY
20687 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20688
20689 #undef TARGET_UNSPEC_MAY_TRAP_P
20690 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20691
20692 #undef TARGET_USE_PSEUDO_PIC_REG
20693 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20694
20695 #undef TARGET_PRINT_OPERAND
20696 #define TARGET_PRINT_OPERAND aarch64_print_operand
20697
20698 #undef TARGET_PRINT_OPERAND_ADDRESS
20699 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20700
20701 #undef TARGET_OPTAB_SUPPORTED_P
20702 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20703
20704 #undef TARGET_OMIT_STRUCT_RETURN_REG
20705 #define TARGET_OMIT_STRUCT_RETURN_REG true
20706
20707 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20708 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20709 aarch64_dwarf_poly_indeterminate_value
20710
20711 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20712 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20713 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20714
20715 #undef TARGET_HARD_REGNO_NREGS
20716 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20717 #undef TARGET_HARD_REGNO_MODE_OK
20718 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20719
20720 #undef TARGET_MODES_TIEABLE_P
20721 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20722
20723 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20724 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20725 aarch64_hard_regno_call_part_clobbered
20726
20727 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20728 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20729 aarch64_remove_extra_call_preserved_regs
20730
20731 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20732 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20733 aarch64_return_call_with_max_clobbers
20734
20735 #undef TARGET_CONSTANT_ALIGNMENT
20736 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20737
20738 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20739 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20740 aarch64_stack_clash_protection_alloca_probe_range
20741
20742 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20743 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20744
20745 #undef TARGET_CAN_CHANGE_MODE_CLASS
20746 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20747
20748 #undef TARGET_SELECT_EARLY_REMAT_MODES
20749 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20750
20751 #undef TARGET_SPECULATION_SAFE_VALUE
20752 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20753
20754 #undef TARGET_ESTIMATED_POLY_VALUE
20755 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20756
20757 #undef TARGET_ATTRIBUTE_TABLE
20758 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20759
20760 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20761 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20762 aarch64_simd_clone_compute_vecsize_and_simdlen
20763
20764 #undef TARGET_SIMD_CLONE_ADJUST
20765 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20766
20767 #undef TARGET_SIMD_CLONE_USABLE
20768 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20769
20770 #undef TARGET_COMP_TYPE_ATTRIBUTES
20771 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20772
20773 #undef TARGET_GET_MULTILIB_ABI_NAME
20774 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20775
20776 #if CHECKING_P
20777 #undef TARGET_RUN_TARGET_SELFTESTS
20778 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20779 #endif /* #if CHECKING_P */
20780
20781 #undef TARGET_ASM_POST_CFI_STARTPROC
20782 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20783
20784 struct gcc_target targetm = TARGET_INITIALIZER;
20785
20786 #include "gt-aarch64.h"