1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
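/* Illustrative sketch (not part of the original source; the concrete values
   are hypothetical examples): how the constructors above might be filled in.

     simd_immediate_info splat (QImode, 0x2a);                // MOV-style splat
     simd_immediate_info series (SImode, base_rtx, step_rtx); // INDEX base, step
     simd_immediate_info all_true (SImode, AARCH64_SV_ALL);   // PTRUE ..., all

   where base_rtx and step_rtx stand for previously created CONST_INT rtxes.  */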
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC-relative literal loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual costs: 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Costs for vector insn classes for Cortex-A57. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Costs for vector insn classes for X-Gene 1. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
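/* Illustrative sketch, assuming the comma-separated name=value syntax that
   -moverride uses for these overrides (the concrete values below are
   hypothetical examples):

     -moverride=tune=no_ldp_stp_qregs,sve_width=256

   is split into "tune=no_ldp_stp_qregs" and "sve_width=256", and the table
   above dispatches each token to aarch64_parse_tune_string and
   aarch64_parse_sve_width_string respectively.  */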
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1220 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1227 {
1228 const char *const name;
1229 const unsigned long flags_on;
1230 const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
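/* Worked example: AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT, because the
   enumeration above places each condition next to its inverse (0/1, 2/3, ...),
   so flipping the low bit inverts the condition.  */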
1242
1243 struct aarch64_branch_protect_type
1244 {
1245 /* The type's name that the user passes to the branch-protection option
1246 string. */
1247 const char* name;
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding to this type and the
1250 second argument is the next token in the option string.
1251 Return values:
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1254 should print an error.
1255 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1256 own error. */
1257 enum aarch64_parse_opt_result (*handler)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type* subtypes;
1260 unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267 aarch64_enable_bti = 0;
1268 if (rest)
1269 {
1270 error ("unexpected %<%s%> after %<%s%>", rest, str);
1271 return AARCH64_PARSE_INVALID_FEATURE;
1272 }
1273 return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 aarch64_enable_bti = 1;
1282 if (rest)
1283 {
1284 error ("unexpected %<%s%> after %<%s%>", rest, str);
1285 return AARCH64_PARSE_INVALID_FEATURE;
1286 }
1287 return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292 char* rest ATTRIBUTE_UNUSED)
1293 {
1294 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295 aarch64_ra_sign_key = AARCH64_KEY_A;
1296 return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301 char* rest ATTRIBUTE_UNUSED)
1302 {
1303 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304 return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309 char* rest ATTRIBUTE_UNUSED)
1310 {
1311 aarch64_ra_sign_key = AARCH64_KEY_B;
1312 return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317 char* rest ATTRIBUTE_UNUSED)
1318 {
1319 aarch64_enable_bti = 1;
1320 return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326 { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335 { NULL, NULL, NULL, 0 }
1336 };
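/* Illustrative example of how the tables above are used: for
   -mbranch-protection=pac-ret+leaf, the "pac-ret" handler enables return
   address signing for non-leaf functions with key A, and the "leaf" subtype
   handler then widens the scope to all functions.  "standard" behaves like
   enabling both pac-ret (key A, non-leaf) and bti.  */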
1337
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
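/* For example, condition AARCH64_HI (index 8) is printed as "hi" for scalar
   flags and as its SVE alias "pmore" for predicate results; both tables are
   indexed by the aarch64_cc values defined above.  */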
1351
1352 /* Return the assembly token for svpattern value PATTERN. */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357 switch (pattern)
1358 {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362 case AARCH64_NUM_SVPATTERNS:
1363 break;
1364 }
1365 gcc_unreachable ();
1366 }
1367
1368 /* Generate code for conditional branches in functions over 1 MiB, where the target can be beyond the +/-1 MiB range of a single conditional branch. */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371 const char * branch_format)
1372 {
1373 rtx_code_label * tmp_label = gen_label_rtx ();
1374 char label_buf[256];
1375 char buffer[128];
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377 CODE_LABEL_NUMBER (tmp_label));
1378 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379 rtx dest_label = operands[pos_label];
1380 operands[pos_label] = tmp_label;
1381
1382 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383 output_asm_insn (buffer, operands);
1384
1385 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386 operands[pos_label] = dest_label;
1387 output_asm_insn (buffer, operands);
1388 return "";
1389 }
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394 if (TARGET_GENERAL_REGS_ONLY)
1395 if (FLOAT_MODE_P (mode))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1398 else
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1401 else
1402 if (FLOAT_MODE_P (mode))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1405 else
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428 reg_class_t best_class)
1429 {
1430 machine_mode mode;
1431
1432 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433 || !reg_class_subset_p (FP_REGS, allocno_class))
1434 return allocno_class;
1435
1436 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437 || !reg_class_subset_p (FP_REGS, best_class))
1438 return best_class;
1439
1440 mode = PSEUDO_REGNO_MODE (regno);
1441 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
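/* Minimal worked example of the hook above: for a DFmode pseudo whose allocno
   class and best class are both POINTER_AND_FP_REGS (so neither early return
   triggers), the mode decides and FP_REGS is chosen; an SImode pseudo in the
   same situation gets GENERAL_REGS.  */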
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447 if (GET_MODE_UNIT_SIZE (mode) == 4)
1448 return aarch64_tune_params.min_div_recip_mul_sf;
1449 return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456 if (VECTOR_MODE_P (mode))
1457 return aarch64_tune_params.vec_reassoc_width;
1458 if (INTEGRAL_MODE_P (mode))
1459 return aarch64_tune_params.int_reassoc_width;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462 return aarch64_tune_params.fp_reassoc_width;
1463 return 1;
1464 }
1465
1466 /* Provide a mapping from GCC register numbers to DWARF register numbers. */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470 if (GP_REGNUM_P (regno))
1471 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472 else if (regno == SP_REGNUM)
1473 return AARCH64_DWARF_SP;
1474 else if (FP_REGNUM_P (regno))
1475 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476 else if (PR_REGNUM_P (regno))
1477 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478 else if (regno == VG_REGNUM)
1479 return AARCH64_DWARF_VG;
1480
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS;
1484 }
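/* For instance, assuming the AAPCS64 DWARF numbering in which AARCH64_DWARF_R0
   is 0 and AARCH64_DWARF_V0 is 64, x5 maps to DWARF register 5 and v3 maps to
   DWARF register 67; registers with no DWARF equivalent fall through to the
   DWARF_FRAME_REGISTERS sentinel value.  */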
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491 if (CONST_DOUBLE_P (x))
1492 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493 return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500 return (TARGET_SIMD
1501 && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode. */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508 return (TARGET_SVE
1509 && (mode == VNx16BImode
1510 || mode == VNx8BImode
1511 || mode == VNx4BImode
1512 || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT = 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531 if (aarch64_advsimd_struct_mode_p (mode))
1532 return VEC_ADVSIMD | VEC_STRUCT;
1533
1534 if (aarch64_sve_pred_mode_p (mode))
1535 return VEC_SVE_PRED;
1536
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1540 switch (mode)
1541 {
1542 /* Single SVE vectors. */
1543 case E_VNx16QImode:
1544 case E_VNx8HImode:
1545 case E_VNx4SImode:
1546 case E_VNx2DImode:
1547 case E_VNx8HFmode:
1548 case E_VNx4SFmode:
1549 case E_VNx2DFmode:
1550 return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552 /* x2 SVE vectors. */
1553 case E_VNx32QImode:
1554 case E_VNx16HImode:
1555 case E_VNx8SImode:
1556 case E_VNx4DImode:
1557 case E_VNx16HFmode:
1558 case E_VNx8SFmode:
1559 case E_VNx4DFmode:
1560 /* x3 SVE vectors. */
1561 case E_VNx48QImode:
1562 case E_VNx24HImode:
1563 case E_VNx12SImode:
1564 case E_VNx6DImode:
1565 case E_VNx24HFmode:
1566 case E_VNx12SFmode:
1567 case E_VNx6DFmode:
1568 /* x4 SVE vectors. */
1569 case E_VNx64QImode:
1570 case E_VNx32HImode:
1571 case E_VNx16SImode:
1572 case E_VNx8DImode:
1573 case E_VNx32HFmode:
1574 case E_VNx16SFmode:
1575 case E_VNx8DFmode:
1576 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578 /* 64-bit Advanced SIMD vectors. */
1579 case E_V8QImode:
1580 case E_V4HImode:
1581 case E_V2SImode:
1582 /* ...E_V1DImode doesn't exist. */
1583 case E_V4HFmode:
1584 case E_V2SFmode:
1585 case E_V1DFmode:
1586 /* 128-bit Advanced SIMD vectors. */
1587 case E_V16QImode:
1588 case E_V8HImode:
1589 case E_V4SImode:
1590 case E_V2DImode:
1591 case E_V8HFmode:
1592 case E_V4SFmode:
1593 case E_V2DFmode:
1594 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596 default:
1597 return 0;
1598 }
1599 }
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602 structure modes. */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is an SVE data vector mode; either a single vector
1610 or a structure of vectors. */
1611 static bool
1612 aarch64_sve_data_mode_p (machine_mode mode)
1613 {
1614 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1615 }
1616
1617 /* Implement target hook TARGET_ARRAY_MODE. */
1618 static opt_machine_mode
1619 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1620 {
1621 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1622 && IN_RANGE (nelems, 2, 4))
1623 return mode_for_vector (GET_MODE_INNER (mode),
1624 GET_MODE_NUNITS (mode) * nelems);
1625
1626 return opt_machine_mode ();
1627 }
1628
1629 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1630 static bool
1631 aarch64_array_mode_supported_p (machine_mode mode,
1632 unsigned HOST_WIDE_INT nelems)
1633 {
1634 if (TARGET_SIMD
1635 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1636 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1637 && (nelems >= 2 && nelems <= 4))
1638 return true;
1639
1640 return false;
1641 }
1642
1643 /* Return the SVE predicate mode to use for elements that have
1644 ELEM_NBYTES bytes, if such a mode exists. */
1645
1646 opt_machine_mode
1647 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1648 {
1649 if (TARGET_SVE)
1650 {
1651 if (elem_nbytes == 1)
1652 return VNx16BImode;
1653 if (elem_nbytes == 2)
1654 return VNx8BImode;
1655 if (elem_nbytes == 4)
1656 return VNx4BImode;
1657 if (elem_nbytes == 8)
1658 return VNx2BImode;
1659 }
1660 return opt_machine_mode ();
1661 }
1662
1663 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1664
1665 static opt_machine_mode
1666 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1667 {
1668 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1669 {
1670 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1671 machine_mode pred_mode;
1672 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1673 return pred_mode;
1674 }
1675
1676 return default_get_mask_mode (nunits, nbytes);
1677 }
1678
1679 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1680
1681 static opt_machine_mode
1682 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1683 {
1684 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1685 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1686 machine_mode mode;
1687 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1688 if (inner_mode == GET_MODE_INNER (mode)
1689 && known_eq (nunits, GET_MODE_NUNITS (mode))
1690 && aarch64_sve_data_mode_p (mode))
1691 return mode;
1692 return opt_machine_mode ();
1693 }
1694
1695 /* Return the integer element mode associated with SVE mode MODE. */
1696
1697 static scalar_int_mode
1698 aarch64_sve_element_int_mode (machine_mode mode)
1699 {
1700 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1701 GET_MODE_NUNITS (mode));
1702 return int_mode_for_size (elt_bits, 0).require ();
1703 }
1704
1705 /* Return the integer vector mode associated with SVE mode MODE.
1706 Unlike mode_for_int_vector, this can handle the case in which
1707 MODE is a predicate (and thus has a different total size). */
1708
1709 static machine_mode
1710 aarch64_sve_int_mode (machine_mode mode)
1711 {
1712 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1713 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1714 }
1715
1716 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1717 prefer to use the first arithmetic operand as the else value if
1718 the else value doesn't matter, since that exactly matches the SVE
1719 destructive merging form. For ternary operations we could either
1720 pick the first operand and use FMAD-like instructions or the last
1721 operand and use FMLA-like instructions; the latter seems more
1722 natural. */
1723
1724 static tree
1725 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1726 {
1727 return nops == 3 ? ops[2] : ops[0];
1728 }
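/* As a concrete illustration of the comment above: for a conditional FMA with
   operands {a, b, c} (computing a * b + c), returning ops[2] makes the addend
   the else value, which matches the accumulator operand that an FMLA-style
   destructive instruction leaves untouched in inactive lanes, so no separate
   select is needed.  */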
1729
1730 /* Implement TARGET_HARD_REGNO_NREGS. */
1731
1732 static unsigned int
1733 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1734 {
1735 /* ??? Logically we should only need to provide a value when
1736 HARD_REGNO_MODE_OK says that the combination is valid,
1737 but at the moment we need to handle all modes. Just ignore
1738 any runtime parts for registers that can't store them. */
1739 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1740 switch (aarch64_regno_regclass (regno))
1741 {
1742 case FP_REGS:
1743 case FP_LO_REGS:
1744 case FP_LO8_REGS:
1745 if (aarch64_sve_data_mode_p (mode))
1746 return exact_div (GET_MODE_SIZE (mode),
1747 BYTES_PER_SVE_VECTOR).to_constant ();
1748 return CEIL (lowest_size, UNITS_PER_VREG);
1749 case PR_REGS:
1750 case PR_LO_REGS:
1751 case PR_HI_REGS:
1752 return 1;
1753 default:
1754 return CEIL (lowest_size, UNITS_PER_WORD);
1755 }
1756 gcc_unreachable ();
1757 }
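/* Small worked example (UNITS_PER_WORD is 8 and UNITS_PER_VREG is 16 on this
   target): a 16-byte V4SImode value needs one FP/SIMD register but two GP
   registers, and a single SVE data vector always counts as exactly one FP
   register regardless of -msve-vector-bits.  */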
1758
1759 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1760
1761 static bool
1762 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1763 {
1764 if (GET_MODE_CLASS (mode) == MODE_CC)
1765 return regno == CC_REGNUM;
1766
1767 if (regno == VG_REGNUM)
1768 /* This must have the same size as _Unwind_Word. */
1769 return mode == DImode;
1770
1771 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1772 if (vec_flags & VEC_SVE_PRED)
1773 return PR_REGNUM_P (regno);
1774
1775 if (PR_REGNUM_P (regno))
1776 return false;
1777
1778 if (regno == SP_REGNUM)
1779 /* The comparison with ptr_mode supports global register variables
1780 that are bound to the stack pointer register with the syntax
1781 register asm ("wsp") in ILP32. */
1782 return mode == Pmode || mode == ptr_mode;
1783
1784 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1785 return mode == Pmode;
1786
1787 if (GP_REGNUM_P (regno))
1788 {
1789 if (known_le (GET_MODE_SIZE (mode), 8))
1790 return true;
1791 else if (known_le (GET_MODE_SIZE (mode), 16))
1792 return (regno & 1) == 0;
1793 }
1794 else if (FP_REGNUM_P (regno))
1795 {
1796 if (vec_flags & VEC_STRUCT)
1797 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1798 else
1799 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1800 }
1801
1802 return false;
1803 }
1804
1805 /* Return true if this is a definition of a vectorized simd function. */
1806
1807 static bool
1808 aarch64_simd_decl_p (tree fndecl)
1809 {
1810 tree fntype;
1811
1812 if (fndecl == NULL)
1813 return false;
1814 fntype = TREE_TYPE (fndecl);
1815 if (fntype == NULL)
1816 return false;
1817
1818 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1819 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1820 return true;
1821
1822 return false;
1823 }
1824
1825 /* Return the mode a register save/restore should use. DImode for integer
1826 registers, DFmode for FP registers in non-SIMD functions (they only save
1827 the bottom half of a 128-bit register), or TFmode for FP registers in
1828 SIMD functions. */
1829
1830 static machine_mode
1831 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1832 {
1833 return GP_REGNUM_P (regno)
1834 ? E_DImode
1835 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1836 }
1837
1838 /* Return true if the instruction is a call to a SIMD function, false
1839 if it is not a SIMD function or if we do not know anything about
1840 the function. */
1841
1842 static bool
1843 aarch64_simd_call_p (rtx_insn *insn)
1844 {
1845 rtx symbol;
1846 rtx call;
1847 tree fndecl;
1848
1849 gcc_assert (CALL_P (insn));
1850 call = get_call_rtx_from (insn);
1851 symbol = XEXP (XEXP (call, 0), 0);
1852 if (GET_CODE (symbol) != SYMBOL_REF)
1853 return false;
1854 fndecl = SYMBOL_REF_DECL (symbol);
1855 if (!fndecl)
1856 return false;
1857
1858 return aarch64_simd_decl_p (fndecl);
1859 }
1860
1861 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1862 a function that uses the SIMD ABI, take advantage of the extra
1863 call-preserved registers that the ABI provides. */
1864
1865 void
1866 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1867 HARD_REG_SET *return_set)
1868 {
1869 if (aarch64_simd_call_p (insn))
1870 {
1871 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1872 if (FP_SIMD_SAVED_REGNUM_P (regno))
1873 CLEAR_HARD_REG_BIT (*return_set, regno);
1874 }
1875 }
1876
1877 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1878 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1879 clobbers the top 64 bits when restoring the bottom 64 bits. */
1880
1881 static bool
1882 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1883 machine_mode mode)
1884 {
1885 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1886 return FP_REGNUM_P (regno)
1887 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1888 }
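
/* As a rough illustration: a V2DImode value (16 bytes) in an FP register
   is partially clobbered by a normal call (16 > 8) but not by a call to a
   SIMD function (16 > 16 is false).  An SVE data mode such as VNx4SImode
   may exceed 16 bytes at runtime, so it is treated as partially clobbered
   even across SIMD calls.  */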
1889
1890 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1891
1892 rtx_insn *
1893 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1894 {
1895 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1896
1897 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1898 return call_1;
1899 else
1900 return call_2;
1901 }
1902
1903 /* Implement REGMODE_NATURAL_SIZE. */
1904 poly_uint64
1905 aarch64_regmode_natural_size (machine_mode mode)
1906 {
1907 /* The natural size for SVE data modes is one SVE data vector,
1908 and similarly for predicates. We can't independently modify
1909 anything smaller than that. */
1910 /* ??? For now, only do this for variable-width SVE registers.
1911 Doing it for constant-sized registers breaks lower-subreg.c. */
1912 /* ??? And once that's fixed, we should probably have similar
1913 code for Advanced SIMD. */
1914 if (!aarch64_sve_vg.is_constant ())
1915 {
1916 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1917 if (vec_flags & VEC_SVE_PRED)
1918 return BYTES_PER_SVE_PRED;
1919 if (vec_flags & VEC_SVE_DATA)
1920 return BYTES_PER_SVE_VECTOR;
1921 }
1922 return UNITS_PER_WORD;
1923 }
1924
1925 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1926 machine_mode
1927 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1928 machine_mode mode)
1929 {
1930 /* The predicate mode determines which bits are significant and
1931 which are "don't care". Decreasing the number of lanes would
1932 lose data while increasing the number of lanes would make bits
1933 unnecessarily significant. */
1934 if (PR_REGNUM_P (regno))
1935 return mode;
1936 if (known_ge (GET_MODE_SIZE (mode), 4))
1937 return mode;
1938 else
1939 return SImode;
1940 }
1941
1942 /* Return true if I's bits are consecutive ones from the MSB. */
1943 bool
1944 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1945 {
1946 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1947 }
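
/* For example: I == -4096 (0xff...f000) gives -I == 4096 == 1 << 12, so
   the function returns true; I == 6 gives -I == -6, for which exact_log2
   returns -1, so the function returns false.  I == 0 is also rejected
   because exact_log2 (0) == -1.  */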
1948
1949 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1950 that strcpy from constants will be faster. */
1951
1952 static HOST_WIDE_INT
1953 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1954 {
1955 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1956 return MAX (align, BITS_PER_WORD);
1957 return align;
1958 }
1959
1960 /* Return true if calls to DECL should be treated as
1961 long-calls (i.e. called via a register). */
1962 static bool
1963 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1964 {
1965 return false;
1966 }
1967
1968 /* Return true if calls to symbol-ref SYM should be treated as
1969 long-calls (i.e. called via a register). */
1970 bool
1971 aarch64_is_long_call_p (rtx sym)
1972 {
1973 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should not go through
1977 plt stubs. */
1978
1979 bool
1980 aarch64_is_noplt_call_p (rtx sym)
1981 {
1982 const_tree decl = SYMBOL_REF_DECL (sym);
1983
1984 if (flag_pic
1985 && decl
1986 && (!flag_plt
1987 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1988 && !targetm.binds_local_p (decl))
1989 return true;
1990
1991 return false;
1992 }
1993
1994 /* Return true if the offsets to a zero/sign-extract operation
1995 represent an expression that matches an extend operation. The
1996 operands represent the parameters from
1997
1998 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1999 bool
2000 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2001 rtx extract_imm)
2002 {
2003 HOST_WIDE_INT mult_val, extract_val;
2004
2005 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2006 return false;
2007
2008 mult_val = INTVAL (mult_imm);
2009 extract_val = INTVAL (extract_imm);
2010
2011 if (extract_val > 8
2012 && extract_val < GET_MODE_BITSIZE (mode)
2013 && exact_log2 (extract_val & ~7) > 0
2014 && (extract_val & 7) <= 4
2015 && mult_val == (1 << (extract_val & 7)))
2016 return true;
2017
2018 return false;
2019 }
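
/* A worked example (illustrative only): for DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 pass all of the checks above, since 34 & ~7 == 32 is
   a power of two, 34 & 7 == 2, and 4 == 1 << 2.  This corresponds to
   extending the low 32 bits of the register and shifting the result left
   by 2.  MULT_IMM == 8 with the same EXTRACT_IMM is rejected by the
   final check.  */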
2020
2021 /* Emit an insn that's a simple single-set. Both the operands must be
2022 known to be valid. */
2023 inline static rtx_insn *
2024 emit_set_insn (rtx x, rtx y)
2025 {
2026 return emit_insn (gen_rtx_SET (x, y));
2027 }
2028
2029 /* X and Y are two things to compare using CODE. Emit the compare insn and
2030 return the rtx for register 0 in the proper mode. */
2031 rtx
2032 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2033 {
2034 machine_mode mode = SELECT_CC_MODE (code, x, y);
2035 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2036
2037 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2038 return cc_reg;
2039 }
2040
2041 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2042
2043 static rtx
2044 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2045 machine_mode y_mode)
2046 {
2047 if (y_mode == E_QImode || y_mode == E_HImode)
2048 {
2049 if (CONST_INT_P (y))
2050 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2051 else
2052 {
2053 rtx t, cc_reg;
2054 machine_mode cc_mode;
2055
2056 t = gen_rtx_ZERO_EXTEND (SImode, y);
2057 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2058 cc_mode = CC_SWPmode;
2059 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2060 emit_set_insn (cc_reg, t);
2061 return cc_reg;
2062 }
2063 }
2064
2065 return aarch64_gen_compare_reg (code, x, y);
2066 }
2067
2068 /* Build the SYMBOL_REF for __tls_get_addr. */
2069
2070 static GTY(()) rtx tls_get_addr_libfunc;
2071
2072 rtx
2073 aarch64_tls_get_addr (void)
2074 {
2075 if (!tls_get_addr_libfunc)
2076 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2077 return tls_get_addr_libfunc;
2078 }
2079
2080 /* Return the TLS model to use for ADDR. */
2081
2082 static enum tls_model
2083 tls_symbolic_operand_type (rtx addr)
2084 {
2085 enum tls_model tls_kind = TLS_MODEL_NONE;
2086 if (GET_CODE (addr) == CONST)
2087 {
2088 poly_int64 addend;
2089 rtx sym = strip_offset (addr, &addend);
2090 if (GET_CODE (sym) == SYMBOL_REF)
2091 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2092 }
2093 else if (GET_CODE (addr) == SYMBOL_REF)
2094 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2095
2096 return tls_kind;
2097 }
2098
2099 /* We allow lo_sum's in our legitimate addresses so that combine
2100 can take care of combining addresses where necessary, but for
2101 generation purposes we generate the address
2102 as:
2103 RTL Absolute
2104 tmp = hi (symbol_ref); adrp x1, foo
2105 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2106 nop
2107
2108 PIC TLS
2109 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2110 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2111 bl __tls_get_addr
2112 nop
2113
2114 Load TLS symbol, depending on TLS mechanism and TLS access model.
2115
2116 Global Dynamic - Traditional TLS:
2117 adrp tmp, :tlsgd:imm
2118 add dest, tmp, #:tlsgd_lo12:imm
2119 bl __tls_get_addr
2120
2121 Global Dynamic - TLS Descriptors:
2122 adrp dest, :tlsdesc:imm
2123 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2124 add dest, dest, #:tlsdesc_lo12:imm
2125 blr tmp
2126 mrs tp, tpidr_el0
2127 add dest, dest, tp
2128
2129 Initial Exec:
2130 mrs tp, tpidr_el0
2131 adrp tmp, :gottprel:imm
2132 ldr dest, [tmp, #:gottprel_lo12:imm]
2133 add dest, dest, tp
2134
2135 Local Exec:
2136 mrs tp, tpidr_el0
2137 add t0, tp, #:tprel_hi12:imm, lsl #12
2138 add t0, t0, #:tprel_lo12_nc:imm
2139 */
2140
2141 static void
2142 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2143 enum aarch64_symbol_type type)
2144 {
2145 switch (type)
2146 {
2147 case SYMBOL_SMALL_ABSOLUTE:
2148 {
2149 /* In ILP32, the mode of dest can be either SImode or DImode. */
2150 rtx tmp_reg = dest;
2151 machine_mode mode = GET_MODE (dest);
2152
2153 gcc_assert (mode == Pmode || mode == ptr_mode);
2154
2155 if (can_create_pseudo_p ())
2156 tmp_reg = gen_reg_rtx (mode);
2157
2158 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2159 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2160 return;
2161 }
2162
2163 case SYMBOL_TINY_ABSOLUTE:
2164 emit_insn (gen_rtx_SET (dest, imm));
2165 return;
2166
2167 case SYMBOL_SMALL_GOT_28K:
2168 {
2169 machine_mode mode = GET_MODE (dest);
2170 rtx gp_rtx = pic_offset_table_rtx;
2171 rtx insn;
2172 rtx mem;
2173
2174 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2175 here before RTL expansion. Tree IVOPTs generates RTL patterns to
2176 estimate rtx costs, in which case pic_offset_table_rtx is not
2177 initialized. In that case there is no need to generate the first
2178 adrp instruction, since the final cost of a global variable access
2179 is one instruction.
2180 if (gp_rtx != NULL)
2181 {
2182 /* -fpic for -mcmodel=small allows a 32K GOT table size (but because
2183 we use the page base as the GOT base, the first page may be wasted;
2184 in the worst case there is only 28K of space for the GOT).
2185
2186 The instruction sequence generated to access a global variable
2187 is:
2188
2189 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2190
2191 Only one instruction is needed, but pic_offset_table_rtx must be
2192 initialized properly. We generate an initialization insn for
2193 every global access and rely on CSE to remove the redundant ones.
2194
2195 The final instruction sequence for multiple global variable
2196 accesses will look like:
2197
2198 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2199
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2201 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2202 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2203 ... */
2204
2205 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2206 crtl->uses_pic_offset_table = 1;
2207 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2208
2209 if (mode != GET_MODE (gp_rtx))
2210 gp_rtx = gen_lowpart (mode, gp_rtx);
2211
2212 }
2213
2214 if (mode == ptr_mode)
2215 {
2216 if (mode == DImode)
2217 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2218 else
2219 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2220
2221 mem = XVECEXP (SET_SRC (insn), 0, 0);
2222 }
2223 else
2224 {
2225 gcc_assert (mode == Pmode);
2226
2227 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2228 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2229 }
2230
2231 /* The operand is expected to be a MEM. Whenever the related insn
2232 pattern changes, the code above that computes MEM should be
2233 updated. */
2234 gcc_assert (GET_CODE (mem) == MEM);
2235 MEM_READONLY_P (mem) = 1;
2236 MEM_NOTRAP_P (mem) = 1;
2237 emit_insn (insn);
2238 return;
2239 }
2240
2241 case SYMBOL_SMALL_GOT_4G:
2242 {
2243 /* In ILP32, the mode of dest can be either SImode or DImode,
2244 while the got entry is always of SImode size. The mode of
2245 dest depends on how dest is used: if dest is assigned to a
2246 pointer (e.g. in the memory), it has SImode; it may have
2247 DImode if dest is dereferenced to access the memory.
2248 This is why we have to handle three different ldr_got_small
2249 patterns here (two patterns for ILP32). */
2250
2251 rtx insn;
2252 rtx mem;
2253 rtx tmp_reg = dest;
2254 machine_mode mode = GET_MODE (dest);
2255
2256 if (can_create_pseudo_p ())
2257 tmp_reg = gen_reg_rtx (mode);
2258
2259 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2260 if (mode == ptr_mode)
2261 {
2262 if (mode == DImode)
2263 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2264 else
2265 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2266
2267 mem = XVECEXP (SET_SRC (insn), 0, 0);
2268 }
2269 else
2270 {
2271 gcc_assert (mode == Pmode);
2272
2273 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2274 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2275 }
2276
2277 gcc_assert (GET_CODE (mem) == MEM);
2278 MEM_READONLY_P (mem) = 1;
2279 MEM_NOTRAP_P (mem) = 1;
2280 emit_insn (insn);
2281 return;
2282 }
2283
2284 case SYMBOL_SMALL_TLSGD:
2285 {
2286 rtx_insn *insns;
2287 machine_mode mode = GET_MODE (dest);
2288 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2289
2290 start_sequence ();
2291 if (TARGET_ILP32)
2292 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2293 else
2294 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2295 insns = get_insns ();
2296 end_sequence ();
2297
2298 RTL_CONST_CALL_P (insns) = 1;
2299 emit_libcall_block (insns, dest, result, imm);
2300 return;
2301 }
2302
2303 case SYMBOL_SMALL_TLSDESC:
2304 {
2305 machine_mode mode = GET_MODE (dest);
2306 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2307 rtx tp;
2308
2309 gcc_assert (mode == Pmode || mode == ptr_mode);
2310
2311 /* In ILP32, the got entry is always of SImode size. Unlike
2312 small GOT, the dest is fixed at reg 0. */
2313 if (TARGET_ILP32)
2314 emit_insn (gen_tlsdesc_small_si (imm));
2315 else
2316 emit_insn (gen_tlsdesc_small_di (imm));
2317 tp = aarch64_load_tp (NULL);
2318
2319 if (mode != Pmode)
2320 tp = gen_lowpart (mode, tp);
2321
2322 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2323 if (REG_P (dest))
2324 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2325 return;
2326 }
2327
2328 case SYMBOL_SMALL_TLSIE:
2329 {
2330 /* In ILP32, the mode of dest can be either SImode or DImode,
2331 while the got entry is always of SImode size. The mode of
2332 dest depends on how dest is used: if dest is assigned to a
2333 pointer (e.g. in the memory), it has SImode; it may have
2334 DImode if dest is dereferenced to access the memory.
2335 This is why we have to handle three different tlsie_small
2336 patterns here (two patterns for ILP32). */
2337 machine_mode mode = GET_MODE (dest);
2338 rtx tmp_reg = gen_reg_rtx (mode);
2339 rtx tp = aarch64_load_tp (NULL);
2340
2341 if (mode == ptr_mode)
2342 {
2343 if (mode == DImode)
2344 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2345 else
2346 {
2347 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2348 tp = gen_lowpart (mode, tp);
2349 }
2350 }
2351 else
2352 {
2353 gcc_assert (mode == Pmode);
2354 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2355 }
2356
2357 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2358 if (REG_P (dest))
2359 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2360 return;
2361 }
2362
2363 case SYMBOL_TLSLE12:
2364 case SYMBOL_TLSLE24:
2365 case SYMBOL_TLSLE32:
2366 case SYMBOL_TLSLE48:
2367 {
2368 machine_mode mode = GET_MODE (dest);
2369 rtx tp = aarch64_load_tp (NULL);
2370
2371 if (mode != Pmode)
2372 tp = gen_lowpart (mode, tp);
2373
2374 switch (type)
2375 {
2376 case SYMBOL_TLSLE12:
2377 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2378 (dest, tp, imm));
2379 break;
2380 case SYMBOL_TLSLE24:
2381 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2382 (dest, tp, imm));
2383 break;
2384 case SYMBOL_TLSLE32:
2385 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2386 (dest, imm));
2387 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2388 (dest, dest, tp));
2389 break;
2390 case SYMBOL_TLSLE48:
2391 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2392 (dest, imm));
2393 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2394 (dest, dest, tp));
2395 break;
2396 default:
2397 gcc_unreachable ();
2398 }
2399
2400 if (REG_P (dest))
2401 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2402 return;
2403 }
2404
2405 case SYMBOL_TINY_GOT:
2406 emit_insn (gen_ldr_got_tiny (dest, imm));
2407 return;
2408
2409 case SYMBOL_TINY_TLSIE:
2410 {
2411 machine_mode mode = GET_MODE (dest);
2412 rtx tp = aarch64_load_tp (NULL);
2413
2414 if (mode == ptr_mode)
2415 {
2416 if (mode == DImode)
2417 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2418 else
2419 {
2420 tp = gen_lowpart (mode, tp);
2421 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2422 }
2423 }
2424 else
2425 {
2426 gcc_assert (mode == Pmode);
2427 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2428 }
2429
2430 if (REG_P (dest))
2431 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2432 return;
2433 }
2434
2435 default:
2436 gcc_unreachable ();
2437 }
2438 }
2439
2440 /* Emit a move from SRC to DEST. Assume that the move expanders can
2441 handle all moves if !can_create_pseudo_p (). The distinction is
2442 important because, unlike emit_move_insn, the move expanders know
2443 how to force Pmode objects into the constant pool even when the
2444 constant pool address is not itself legitimate. */
2445 static rtx
2446 aarch64_emit_move (rtx dest, rtx src)
2447 {
2448 return (can_create_pseudo_p ()
2449 ? emit_move_insn (dest, src)
2450 : emit_move_insn_1 (dest, src));
2451 }
2452
2453 /* Apply UNOPTAB to OP and store the result in DEST. */
2454
2455 static void
2456 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2457 {
2458 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2459 if (dest != tmp)
2460 emit_move_insn (dest, tmp);
2461 }
2462
2463 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2464
2465 static void
2466 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2467 {
2468 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2469 OPTAB_DIRECT);
2470 if (dest != tmp)
2471 emit_move_insn (dest, tmp);
2472 }
2473
2474 /* Split a 128-bit move operation into two 64-bit move operations,
2475 taking care to handle partial overlap of register to register
2476 copies. Special cases are needed when moving between GP regs and
2477 FP regs. SRC can be a register, constant or memory; DST a register
2478 or memory. If either operand is memory it must not have any side
2479 effects. */
2480 void
2481 aarch64_split_128bit_move (rtx dst, rtx src)
2482 {
2483 rtx dst_lo, dst_hi;
2484 rtx src_lo, src_hi;
2485
2486 machine_mode mode = GET_MODE (dst);
2487
2488 gcc_assert (mode == TImode || mode == TFmode);
2489 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2490 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2491
2492 if (REG_P (dst) && REG_P (src))
2493 {
2494 int src_regno = REGNO (src);
2495 int dst_regno = REGNO (dst);
2496
2497 /* Handle FP <-> GP regs. */
2498 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2499 {
2500 src_lo = gen_lowpart (word_mode, src);
2501 src_hi = gen_highpart (word_mode, src);
2502
2503 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2504 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2505 return;
2506 }
2507 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2508 {
2509 dst_lo = gen_lowpart (word_mode, dst);
2510 dst_hi = gen_highpart (word_mode, dst);
2511
2512 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2513 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2514 return;
2515 }
2516 }
2517
2518 dst_lo = gen_lowpart (word_mode, dst);
2519 dst_hi = gen_highpart (word_mode, dst);
2520 src_lo = gen_lowpart (word_mode, src);
2521 src_hi = gen_highpart_mode (word_mode, mode, src);
2522
2523 /* At most one pairing may overlap. */
2524 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2525 {
2526 aarch64_emit_move (dst_hi, src_hi);
2527 aarch64_emit_move (dst_lo, src_lo);
2528 }
2529 else
2530 {
2531 aarch64_emit_move (dst_lo, src_lo);
2532 aarch64_emit_move (dst_hi, src_hi);
2533 }
2534 }
2535
2536 bool
2537 aarch64_split_128bit_move_p (rtx dst, rtx src)
2538 {
2539 return (! REG_P (src)
2540 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2541 }
2542
2543 /* Split a complex SIMD combine. */
2544
2545 void
2546 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2547 {
2548 machine_mode src_mode = GET_MODE (src1);
2549 machine_mode dst_mode = GET_MODE (dst);
2550
2551 gcc_assert (VECTOR_MODE_P (dst_mode));
2552 gcc_assert (register_operand (dst, dst_mode)
2553 && register_operand (src1, src_mode)
2554 && register_operand (src2, src_mode));
2555
2556 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2557 return;
2558 }
2559
2560 /* Split a complex SIMD move. */
2561
2562 void
2563 aarch64_split_simd_move (rtx dst, rtx src)
2564 {
2565 machine_mode src_mode = GET_MODE (src);
2566 machine_mode dst_mode = GET_MODE (dst);
2567
2568 gcc_assert (VECTOR_MODE_P (dst_mode));
2569
2570 if (REG_P (dst) && REG_P (src))
2571 {
2572 gcc_assert (VECTOR_MODE_P (src_mode));
2573 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2574 }
2575 }
2576
2577 bool
2578 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2579 machine_mode ymode, rtx y)
2580 {
2581 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2582 gcc_assert (r != NULL);
2583 return rtx_equal_p (x, r);
2584 }
2585
2586
2587 /* Return TARGET if it is nonnull and a register of mode MODE.
2588 Otherwise, return a fresh register of mode MODE if we can,
2589 or TARGET reinterpreted as MODE if we can't. */
2590
2591 static rtx
2592 aarch64_target_reg (rtx target, machine_mode mode)
2593 {
2594 if (target && REG_P (target) && GET_MODE (target) == mode)
2595 return target;
2596 if (!can_create_pseudo_p ())
2597 {
2598 gcc_assert (target);
2599 return gen_lowpart (mode, target);
2600 }
2601 return gen_reg_rtx (mode);
2602 }
2603
2604 /* Return a register that contains the constant in BUILDER, given that
2605 the constant is a legitimate move operand. Use TARGET as the register
2606 if it is nonnull and convenient. */
2607
2608 static rtx
2609 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2610 {
2611 rtx src = builder.build ();
2612 target = aarch64_target_reg (target, GET_MODE (src));
2613 emit_insn (gen_rtx_SET (target, src));
2614 return target;
2615 }
2616
2617 static rtx
2618 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2619 {
2620 if (can_create_pseudo_p ())
2621 return force_reg (mode, value);
2622 else
2623 {
2624 gcc_assert (x);
2625 aarch64_emit_move (x, value);
2626 return x;
2627 }
2628 }
2629
2630 /* Return true if predicate value X is a constant in which every element
2631 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2632 value, i.e. as a predicate in which all bits are significant. */
2633
2634 static bool
2635 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2636 {
2637 if (GET_CODE (x) != CONST_VECTOR)
2638 return false;
2639
2640 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2641 GET_MODE_NUNITS (GET_MODE (x)));
2642 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2643 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2644 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2645
2646 unsigned int nelts = const_vector_encoded_nelts (x);
2647 for (unsigned int i = 0; i < nelts; ++i)
2648 {
2649 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2650 if (!CONST_INT_P (elt))
2651 return false;
2652
2653 builder.quick_push (elt);
2654 for (unsigned int j = 1; j < factor; ++j)
2655 builder.quick_push (const0_rtx);
2656 }
2657 builder.finalize ();
2658 return true;
2659 }
2660
2661 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2662 widest predicate element size it can have (that is, the largest size
2663 for which each element would still be 0 or 1). */
2664
2665 unsigned int
2666 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2667 {
2668 /* Start with the most optimistic assumption: that we only need
2669 one bit per pattern. This is what we will use if only the first
2670 bit in each pattern is ever set. */
2671 unsigned int mask = GET_MODE_SIZE (DImode);
2672 mask |= builder.npatterns ();
2673
2674 /* Look for set bits. */
2675 unsigned int nelts = builder.encoded_nelts ();
2676 for (unsigned int i = 1; i < nelts; ++i)
2677 if (INTVAL (builder.elt (i)) != 0)
2678 {
2679 if (i & 1)
2680 return 1;
2681 mask |= i;
2682 }
2683 return mask & -mask;
2684 }
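
/* A worked example (illustrative only): for a VNx16BI constant with four
   patterns and one element per pattern whose encoded elements are
   1, 0, 0, 0, the initial mask is 8 | 4 == 12 and no further bits are
   set in the loop, so the result is 12 & -12 == 4: the predicate can be
   treated as having 4-byte elements.  If the element at index 2 were
   also set, the mask would become 14 and the result would be 2.  */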
2685
2686 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2687 that the constant would have with predicate element size ELT_SIZE
2688 (ignoring the upper bits in each element) and return:
2689
2690 * -1 if all bits are set
2691 * N if the predicate has N leading set bits followed by all clear bits
2692 * 0 if the predicate does not have any of these forms. */
2693
2694 int
2695 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2696 unsigned int elt_size)
2697 {
2698 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2699 followed by set bits. */
2700 if (builder.nelts_per_pattern () == 3)
2701 return 0;
2702
2703 /* Skip over leading set bits. */
2704 unsigned int nelts = builder.encoded_nelts ();
2705 unsigned int i = 0;
2706 for (; i < nelts; i += elt_size)
2707 if (INTVAL (builder.elt (i)) == 0)
2708 break;
2709 unsigned int vl = i / elt_size;
2710
2711 /* Check for the all-true case. */
2712 if (i == nelts)
2713 return -1;
2714
2715 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2716 repeating pattern of set bits followed by clear bits. */
2717 if (builder.nelts_per_pattern () != 2)
2718 return 0;
2719
2720 /* We have a "foreground" value and a duplicated "background" value.
2721 If the background might repeat and the last set bit belongs to it,
2722 we might have set bits followed by clear bits followed by set bits. */
2723 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2724 return 0;
2725
2726 /* Make sure that the rest are all clear. */
2727 for (; i < nelts; i += elt_size)
2728 if (INTVAL (builder.elt (i)) != 0)
2729 return 0;
2730
2731 return vl;
2732 }
2733
2734 /* See if there is an svpattern that encodes an SVE predicate of mode
2735 PRED_MODE in which the first VL bits are set and the rest are clear.
2736 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2737 A VL of -1 indicates an all-true vector. */
2738
2739 aarch64_svpattern
2740 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2741 {
2742 if (vl < 0)
2743 return AARCH64_SV_ALL;
2744
2745 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2746 return AARCH64_NUM_SVPATTERNS;
2747
2748 if (vl >= 1 && vl <= 8)
2749 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2750
2751 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2752 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2753
2754 int max_vl;
2755 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2756 {
2757 if (vl == (max_vl / 3) * 3)
2758 return AARCH64_SV_MUL3;
2759 /* These would only trigger for non-power-of-2 lengths. */
2760 if (vl == (max_vl & -4))
2761 return AARCH64_SV_MUL4;
2762 if (vl == (1 << floor_log2 (max_vl)))
2763 return AARCH64_SV_POW2;
2764 if (vl == max_vl)
2765 return AARCH64_SV_ALL;
2766 }
2767 return AARCH64_NUM_SVPATTERNS;
2768 }
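
/* Illustrative examples: VL == -1 maps to AARCH64_SV_ALL, VL == 3 to
   AARCH64_SV_VL3, and VL == 32 to AARCH64_SV_VL32 (a power of two in
   [16, 256]).  For a variable-length PRED_MODE, VL == 9 has no
   single-pattern encoding and the function returns
   AARCH64_NUM_SVPATTERNS.  */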
2769
2770 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2771 bits has the lowest bit set and the upper bits clear. This is the
2772 VNx16BImode equivalent of a PTRUE for controlling elements of
2773 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2774 all bits are significant, even the upper zeros. */
2775
2776 rtx
2777 aarch64_ptrue_all (unsigned int elt_size)
2778 {
2779 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2780 builder.quick_push (const1_rtx);
2781 for (unsigned int i = 1; i < elt_size; ++i)
2782 builder.quick_push (const0_rtx);
2783 return builder.build ();
2784 }
2785
2786 /* Return an all-true predicate register of mode MODE. */
2787
2788 rtx
2789 aarch64_ptrue_reg (machine_mode mode)
2790 {
2791 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2792 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2793 return gen_lowpart (mode, reg);
2794 }
2795
2796 /* Return an all-false predicate register of mode MODE. */
2797
2798 rtx
2799 aarch64_pfalse_reg (machine_mode mode)
2800 {
2801 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2802 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2803 return gen_lowpart (mode, reg);
2804 }
2805
2806 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2807 true, or alternatively if we know that the operation predicated by
2808 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2809 aarch64_sve_gp_strictness operand that describes the operation
2810 predicated by PRED1[0]. */
2811
2812 bool
2813 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2814 {
2815 machine_mode mode = GET_MODE (pred2);
2816 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2817 && mode == GET_MODE (pred1[0])
2818 && aarch64_sve_gp_strictness (pred1[1], SImode));
2819 return (pred1[0] == CONSTM1_RTX (mode)
2820 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2821 || rtx_equal_p (pred1[0], pred2));
2822 }
2823
2824 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2825 for it. PRED2[0] is the predicate for the instruction whose result
2826 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2827 for it. Return true if we can prove that the two predicates are
2828 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2829 with PRED1[0] without changing behavior. */
2830
2831 bool
2832 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2833 {
2834 machine_mode mode = GET_MODE (pred1[0]);
2835 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2836 && mode == GET_MODE (pred2[0])
2837 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2838 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2839
2840 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2841 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2842 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2843 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2844 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2845 }
2846
2847 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2848 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2849 Use TARGET as the target register if nonnull and convenient. */
2850
2851 static rtx
2852 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2853 machine_mode data_mode, rtx op1, rtx op2)
2854 {
2855 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2856 expand_operand ops[5];
2857 create_output_operand (&ops[0], target, pred_mode);
2858 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2859 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2860 create_input_operand (&ops[3], op1, data_mode);
2861 create_input_operand (&ops[4], op2, data_mode);
2862 expand_insn (icode, 5, ops);
2863 return ops[0].value;
2864 }
2865
2866 /* Use a comparison to convert integer vector SRC into MODE, which is
2867 the corresponding SVE predicate mode. Use TARGET for the result
2868 if it's nonnull and convenient. */
2869
2870 static rtx
2871 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2872 {
2873 machine_mode src_mode = GET_MODE (src);
2874 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2875 src, CONST0_RTX (src_mode));
2876 }
2877
2878 /* Return true if we can move VALUE into a register using a single
2879 CNT[BHWD] instruction. */
2880
2881 static bool
2882 aarch64_sve_cnt_immediate_p (poly_int64 value)
2883 {
2884 HOST_WIDE_INT factor = value.coeffs[0];
2885 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2886 return (value.coeffs[1] == factor
2887 && IN_RANGE (factor, 2, 16 * 16)
2888 && (factor & 1) == 0
2889 && factor <= 16 * (factor & -factor));
2890 }
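
/* Worked examples (illustrative only): the poly_int64 (2, 2) is a single
   CNTD; (32, 32) can be loaded with "cntb ..., all, mul #2"; (34, 34)
   fails the final check because 34 > 16 * (34 & -34) == 32; and (3, 3)
   is rejected because the factor is odd.  */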
2891
2892 /* Likewise for rtx X. */
2893
2894 bool
2895 aarch64_sve_cnt_immediate_p (rtx x)
2896 {
2897 poly_int64 value;
2898 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2899 }
2900
2901 /* Return the asm string for an instruction with a CNT-like vector size
2902 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2903 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2904 first part of the operands template (the part that comes before the
2905 vector size itself). FACTOR is the number of quadwords.
2906 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2907 If it is zero, we can use any element size. */
2908
2909 static char *
2910 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2911 unsigned int factor,
2912 unsigned int nelts_per_vq)
2913 {
2914 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2915
2916 if (nelts_per_vq == 0)
2917 /* There is some overlap in the ranges of the four CNT instructions.
2918 Here we always use the smallest possible element size, so that the
2919 multiplier is 1 wherever possible. */
2920 nelts_per_vq = factor & -factor;
2921 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2922 gcc_assert (IN_RANGE (shift, 1, 4));
2923 char suffix = "dwhb"[shift - 1];
2924
2925 factor >>= shift;
2926 unsigned int written;
2927 if (factor == 1)
2928 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2929 prefix, suffix, operands);
2930 else
2931 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2932 prefix, suffix, operands, factor);
2933 gcc_assert (written < sizeof (buffer));
2934 return buffer;
2935 }
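
/* Worked examples (illustrative only): with PREFIX "cnt" and OPERANDS
   "%x0", FACTOR == 2 and NELTS_PER_VQ == 0 select the D suffix and a
   multiplier of 1, giving "cntd\t%x0"; FACTOR == 32 selects the B suffix
   and shifts the factor down to 2, giving "cntb\t%x0, all, mul #2".  */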
2936
2937 /* Return the asm string for an instruction with a CNT-like vector size
2938 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2939 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2940 first part of the operands template (the part that comes before the
2941 vector size itself). X is the value of the vector size operand,
2942 as a polynomial integer rtx. */
2943
2944 char *
2945 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2946 rtx x)
2947 {
2948 poly_int64 value = rtx_to_poly_int64 (x);
2949 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2950 return aarch64_output_sve_cnt_immediate (prefix, operands,
2951 value.coeffs[1], 0);
2952 }
2953
2954 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2955
2956 bool
2957 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2958 {
2959 poly_int64 value;
2960 return (poly_int_rtx_p (x, &value)
2961 && (aarch64_sve_cnt_immediate_p (value)
2962 || aarch64_sve_cnt_immediate_p (-value)));
2963 }
2964
2965 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
2966 operand 0. */
2967
2968 char *
2969 aarch64_output_sve_scalar_inc_dec (rtx offset)
2970 {
2971 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2972 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
2973 if (offset_value.coeffs[1] > 0)
2974 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2975 offset_value.coeffs[1], 0);
2976 else
2977 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2978 -offset_value.coeffs[1], 0);
2979 }
2980
2981 /* Return true if we can add VALUE to a register using a single ADDVL
2982 or ADDPL instruction. */
2983
2984 static bool
2985 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2986 {
2987 HOST_WIDE_INT factor = value.coeffs[0];
2988 if (factor == 0 || value.coeffs[1] != factor)
2989 return false;
2990 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2991 and a value of 16 is one vector width. */
2992 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2993 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2994 }
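
/* In concrete terms (illustrative only): ADDVL covers multiples of 16
   from -32 * 16 to 31 * 16 and ADDPL covers multiples of 2 from -32 * 2
   to 31 * 2.  So (32, 32) is valid (ADDVL #2), (6, 6) is valid
   (ADDPL #3), but (3, 3) is odd and (1600, 1600) is out of range for
   both forms.  */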
2995
2996 /* Likewise for rtx X. */
2997
2998 bool
2999 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3000 {
3001 poly_int64 value;
3002 return (poly_int_rtx_p (x, &value)
3003 && aarch64_sve_addvl_addpl_immediate_p (value));
3004 }
3005
3006 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3007 to operand 1 and storing the result in operand 0. */
3008
3009 char *
3010 aarch64_output_sve_addvl_addpl (rtx offset)
3011 {
3012 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3013 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3014 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3015
3016 int factor = offset_value.coeffs[1];
3017 if ((factor & 15) == 0)
3018 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3019 else
3020 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3021 return buffer;
3022 }
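
/* For example (illustrative only): an OFFSET of (32, 32) produces
   "addvl\t%x0, %x1, #2", while (6, 6) produces "addpl\t%x0, %x1, #3".  */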
3023
3024 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3025 instruction. If it is, store the number of elements in each vector
3026 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3027 factor in *FACTOR_OUT (if nonnull). */
3028
3029 bool
3030 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3031 unsigned int *nelts_per_vq_out)
3032 {
3033 rtx elt;
3034 poly_int64 value;
3035
3036 if (!const_vec_duplicate_p (x, &elt)
3037 || !poly_int_rtx_p (elt, &value))
3038 return false;
3039
3040 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3041 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3042 /* There's no vector INCB. */
3043 return false;
3044
3045 HOST_WIDE_INT factor = value.coeffs[0];
3046 if (value.coeffs[1] != factor)
3047 return false;
3048
3049 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3050 if ((factor % nelts_per_vq) != 0
3051 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3052 return false;
3053
3054 if (factor_out)
3055 *factor_out = factor;
3056 if (nelts_per_vq_out)
3057 *nelts_per_vq_out = nelts_per_vq;
3058 return true;
3059 }
3060
3061 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3062 instruction. */
3063
3064 bool
3065 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3066 {
3067 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3068 }
3069
3070 /* Return the asm template for an SVE vector INC or DEC instruction.
3071 OPERANDS gives the operands before the vector count and X is the
3072 value of the vector count operand itself. */
3073
3074 char *
3075 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3076 {
3077 int factor;
3078 unsigned int nelts_per_vq;
3079 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3080 gcc_unreachable ();
3081 if (factor < 0)
3082 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3083 nelts_per_vq);
3084 else
3085 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3086 nelts_per_vq);
3087 }
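
/* A worked example (illustrative only): for a VNx4SI vector,
   NELTS_PER_VQ is 4, so a duplicated element value of (8, 8) passes the
   immediate check with FACTOR == 8 and the output routine prints
   "incw\t<operands>, all, mul #2".  A duplicated value of (-4, -4)
   would instead print "decw\t<operands>".  */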
3088
3089 static int
3090 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3091 scalar_int_mode mode)
3092 {
3093 int i;
3094 unsigned HOST_WIDE_INT val, val2, mask;
3095 int one_match, zero_match;
3096 int num_insns;
3097
3098 val = INTVAL (imm);
3099
3100 if (aarch64_move_imm (val, mode))
3101 {
3102 if (generate)
3103 emit_insn (gen_rtx_SET (dest, imm));
3104 return 1;
3105 }
3106
3107 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3108 (with XXXX non-zero). In that case check to see if the move can be done in
3109 a smaller mode. */
3110 val2 = val & 0xffffffff;
3111 if (mode == DImode
3112 && aarch64_move_imm (val2, SImode)
3113 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3114 {
3115 if (generate)
3116 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3117
3118 /* Check whether we have to emit a second instruction by testing
3119 whether any of the upper 32 bits of the original DImode value are set. */
3120 if (val == val2)
3121 return 1;
3122
3123 i = (val >> 48) ? 48 : 32;
3124
3125 if (generate)
3126 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3127 GEN_INT ((val >> i) & 0xffff)));
3128
3129 return 2;
3130 }
3131
3132 if ((val >> 32) == 0 || mode == SImode)
3133 {
3134 if (generate)
3135 {
3136 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3137 if (mode == SImode)
3138 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3139 GEN_INT ((val >> 16) & 0xffff)));
3140 else
3141 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3142 GEN_INT ((val >> 16) & 0xffff)));
3143 }
3144 return 2;
3145 }
3146
3147 /* Remaining cases are all for DImode. */
3148
3149 mask = 0xffff;
3150 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3151 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3152 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3153 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3154
3155 if (zero_match != 2 && one_match != 2)
3156 {
3157 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3158 For a 64-bit bitmask try whether changing 16 bits to all ones or
3159 zeroes creates a valid bitmask. To check any repeated bitmask,
3160 try using 16 bits from the other 32-bit half of val. */
3161
3162 for (i = 0; i < 64; i += 16, mask <<= 16)
3163 {
3164 val2 = val & ~mask;
3165 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3166 break;
3167 val2 = val | mask;
3168 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3169 break;
3170 val2 = val2 & ~mask;
3171 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3172 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3173 break;
3174 }
3175 if (i != 64)
3176 {
3177 if (generate)
3178 {
3179 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3180 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3181 GEN_INT ((val >> i) & 0xffff)));
3182 }
3183 return 2;
3184 }
3185 }
3186
3187 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3188 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3189 otherwise skip zero bits. */
3190
3191 num_insns = 1;
3192 mask = 0xffff;
3193 val2 = one_match > zero_match ? ~val : val;
3194 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3195
3196 if (generate)
3197 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3198 ? (val | ~(mask << i))
3199 : (val & (mask << i)))));
3200 for (i += 16; i < 64; i += 16)
3201 {
3202 if ((val2 & (mask << i)) == 0)
3203 continue;
3204 if (generate)
3205 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3206 GEN_INT ((val >> i) & 0xffff)));
3207 num_insns ++;
3208 }
3209
3210 return num_insns;
3211 }
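
/* A worked example (illustrative only): the DImode constant
   0x1234567800000000 is not a single MOV immediate and the 32-bit
   shortcut does not apply, but it has two zero halfwords
   (zero_match == 2), so the final loop above emits roughly:

	movz	x0, #0x5678, lsl #32
	movk	x0, #0x1234, lsl #48

   and the function returns 2.  */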
3212
3213 /* Return whether imm is a 128-bit immediate which is simple enough to
3214 expand inline. */
3215 bool
3216 aarch64_mov128_immediate (rtx imm)
3217 {
3218 if (GET_CODE (imm) == CONST_INT)
3219 return true;
3220
3221 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3222
3223 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3224 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3225
3226 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3227 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3228 }
3229
3230
3231 /* Return the number of temporary registers that aarch64_add_offset_1
3232 would need to add OFFSET to a register. */
3233
3234 static unsigned int
3235 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3236 {
3237 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3238 }
3239
3240 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3241 a non-polynomial OFFSET. MODE is the mode of the addition.
3242 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3243 be set and CFA adjustments added to the generated instructions.
3244
3245 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3246 temporary if register allocation is already complete. This temporary
3247 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3248 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3249 the immediate again.
3250
3251 Since this function may be used to adjust the stack pointer, we must
3252 ensure that it cannot cause transient stack deallocation (for example
3253 by first incrementing SP and then decrementing when adjusting by a
3254 large immediate). */
3255
3256 static void
3257 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3258 rtx src, HOST_WIDE_INT offset, rtx temp1,
3259 bool frame_related_p, bool emit_move_imm)
3260 {
3261 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3262 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3263
3264 HOST_WIDE_INT moffset = abs_hwi (offset);
3265 rtx_insn *insn;
3266
3267 if (!moffset)
3268 {
3269 if (!rtx_equal_p (dest, src))
3270 {
3271 insn = emit_insn (gen_rtx_SET (dest, src));
3272 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3273 }
3274 return;
3275 }
3276
3277 /* Single instruction adjustment. */
3278 if (aarch64_uimm12_shift (moffset))
3279 {
3280 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3281 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3282 return;
3283 }
3284
3285 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3286 and either:
3287
3288 a) the offset cannot be loaded by a 16-bit move or
3289 b) there is no spare register into which we can move it. */
3290 if (moffset < 0x1000000
3291 && ((!temp1 && !can_create_pseudo_p ())
3292 || !aarch64_move_imm (moffset, mode)))
3293 {
3294 HOST_WIDE_INT low_off = moffset & 0xfff;
3295
3296 low_off = offset < 0 ? -low_off : low_off;
3297 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3298 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3299 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3300 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3301 return;
3302 }
3303
3304 /* Emit a move immediate if required and an addition/subtraction. */
3305 if (emit_move_imm)
3306 {
3307 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3308 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3309 }
3310 insn = emit_insn (offset < 0
3311 ? gen_sub3_insn (dest, src, temp1)
3312 : gen_add3_insn (dest, src, temp1));
3313 if (frame_related_p)
3314 {
3315 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3316 rtx adj = plus_constant (mode, src, offset);
3317 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3318 }
3319 }
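
/* For example (a rough illustration): an OFFSET of 0x123456 is neither
   a shifted 12-bit immediate nor a MOV immediate, so the two-addition
   path above splits it into roughly:

	add	dest, src, #0x456
	add	dest, dest, #0x123000

   whereas an OFFSET of 0x2000000 is a MOV immediate and is loaded into
   a temporary before a single register-register addition.  */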
3320
3321 /* Return the number of temporary registers that aarch64_add_offset
3322 would need to move OFFSET into a register or add OFFSET to a register;
3323 ADD_P is true if we want the latter rather than the former. */
3324
3325 static unsigned int
3326 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3327 {
3328 /* This follows the same structure as aarch64_add_offset. */
3329 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3330 return 0;
3331
3332 unsigned int count = 0;
3333 HOST_WIDE_INT factor = offset.coeffs[1];
3334 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3335 poly_int64 poly_offset (factor, factor);
3336 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3337 /* Need one register for the ADDVL/ADDPL result. */
3338 count += 1;
3339 else if (factor != 0)
3340 {
3341 factor = abs (factor);
3342 if (factor > 16 * (factor & -factor))
3343 /* Need one register for the CNT result and one for the multiplication
3344 factor. If necessary, the second temporary can be reused for the
3345 constant part of the offset. */
3346 return 2;
3347 /* Need one register for the CNT result (which might then
3348 be shifted). */
3349 count += 1;
3350 }
3351 return count + aarch64_add_offset_1_temporaries (constant);
3352 }
3353
3354 /* If X can be represented as a poly_int64, return the number
3355 of temporaries that are required to add it to a register.
3356 Return -1 otherwise. */
3357
3358 int
3359 aarch64_add_offset_temporaries (rtx x)
3360 {
3361 poly_int64 offset;
3362 if (!poly_int_rtx_p (x, &offset))
3363 return -1;
3364 return aarch64_offset_temporaries (true, offset);
3365 }
3366
3367 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3368 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3369 be set and CFA adjustments added to the generated instructions.
3370
3371 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3372 temporary if register allocation is already complete. This temporary
3373 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3374 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3375 false to avoid emitting the immediate again.
3376
3377 TEMP2, if nonnull, is a second temporary register that doesn't
3378 overlap either DEST or SRC.
3379
3380 Since this function may be used to adjust the stack pointer, we must
3381 ensure that it cannot cause transient stack deallocation (for example
3382 by first incrementing SP and then decrementing when adjusting by a
3383 large immediate). */
3384
3385 static void
3386 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3387 poly_int64 offset, rtx temp1, rtx temp2,
3388 bool frame_related_p, bool emit_move_imm = true)
3389 {
3390 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3391 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3392 gcc_assert (temp1 == NULL_RTX
3393 || !frame_related_p
3394 || !reg_overlap_mentioned_p (temp1, dest));
3395 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3396
3397 /* Try using ADDVL or ADDPL to add the whole value. */
3398 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3399 {
3400 rtx offset_rtx = gen_int_mode (offset, mode);
3401 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3402 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3403 return;
3404 }
3405
3406 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3407 SVE vector register, over and above the minimum size of 128 bits.
3408 This is equivalent to half the value returned by CNTD with a
3409 vector shape of ALL. */
3410 HOST_WIDE_INT factor = offset.coeffs[1];
3411 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3412
3413 /* Try using ADDVL or ADDPL to add the VG-based part. */
3414 poly_int64 poly_offset (factor, factor);
3415 if (src != const0_rtx
3416 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3417 {
3418 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3419 if (frame_related_p)
3420 {
3421 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3422 RTX_FRAME_RELATED_P (insn) = true;
3423 src = dest;
3424 }
3425 else
3426 {
3427 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3428 src = aarch64_force_temporary (mode, temp1, addr);
3429 temp1 = temp2;
3430 temp2 = NULL_RTX;
3431 }
3432 }
3433 /* Otherwise use a CNT-based sequence. */
3434 else if (factor != 0)
3435 {
3436 /* Use a subtraction if we have a negative factor. */
3437 rtx_code code = PLUS;
3438 if (factor < 0)
3439 {
3440 factor = -factor;
3441 code = MINUS;
3442 }
3443
3444 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3445 into the multiplication. */
3446 rtx val;
3447 int shift = 0;
3448 if (factor & 1)
3449 /* Use a right shift by 1. */
3450 shift = -1;
3451 else
3452 factor /= 2;
3453 HOST_WIDE_INT low_bit = factor & -factor;
3454 if (factor <= 16 * low_bit)
3455 {
3456 if (factor > 16 * 8)
3457 {
3458 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3459 the value with the minimum multiplier and shift it into
3460 position. */
3461 int extra_shift = exact_log2 (low_bit);
3462 shift += extra_shift;
3463 factor >>= extra_shift;
3464 }
3465 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3466 }
3467 else
3468 {
3469 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3470 directly, since that should increase the chances of being
3471 able to use a shift and add sequence. If LOW_BIT itself
3472 is out of range, just use CNTD. */
3473 if (low_bit <= 16 * 8)
3474 factor /= low_bit;
3475 else
3476 low_bit = 1;
3477
3478 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3479 val = aarch64_force_temporary (mode, temp1, val);
3480
3481 if (can_create_pseudo_p ())
3482 {
3483 rtx coeff1 = gen_int_mode (factor, mode);
3484 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3485 }
3486 else
3487 {
3488 /* Go back to using a negative multiplication factor if we have
3489 no register from which to subtract. */
3490 if (code == MINUS && src == const0_rtx)
3491 {
3492 factor = -factor;
3493 code = PLUS;
3494 }
3495 rtx coeff1 = gen_int_mode (factor, mode);
3496 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3497 val = gen_rtx_MULT (mode, val, coeff1);
3498 }
3499 }
3500
3501 if (shift > 0)
3502 {
3503 /* Multiply by 1 << SHIFT. */
3504 val = aarch64_force_temporary (mode, temp1, val);
3505 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3506 }
3507 else if (shift == -1)
3508 {
3509 /* Divide by 2. */
3510 val = aarch64_force_temporary (mode, temp1, val);
3511 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3512 }
3513
3514 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3515 if (src != const0_rtx)
3516 {
3517 val = aarch64_force_temporary (mode, temp1, val);
3518 val = gen_rtx_fmt_ee (code, mode, src, val);
3519 }
3520 else if (code == MINUS)
3521 {
3522 val = aarch64_force_temporary (mode, temp1, val);
3523 val = gen_rtx_NEG (mode, val);
3524 }
3525
3526 if (constant == 0 || frame_related_p)
3527 {
3528 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3529 if (frame_related_p)
3530 {
3531 RTX_FRAME_RELATED_P (insn) = true;
3532 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3533 gen_rtx_SET (dest, plus_constant (Pmode, src,
3534 poly_offset)));
3535 }
3536 src = dest;
3537 if (constant == 0)
3538 return;
3539 }
3540 else
3541 {
3542 src = aarch64_force_temporary (mode, temp1, val);
3543 temp1 = temp2;
3544 temp2 = NULL_RTX;
3545 }
3546
3547 emit_move_imm = true;
3548 }
3549
3550 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3551 frame_related_p, emit_move_imm);
3552 }
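
/* A worked example (a rough sketch): adding two SVE vector lengths plus
   16 bytes, i.e. the poly_int64 (48, 32), to the stack pointer.  The
   whole offset is not an ADDVL/ADDPL immediate, but the VG-based part
   (32, 32) is, so the sequence is roughly:

	addvl	sp, sp, #2
	add	sp, sp, #16

   with aarch64_add_offset_1 handling the remaining constant of 16.  */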
3553
3554 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3555 than a poly_int64. */
3556
3557 void
3558 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3559 rtx offset_rtx, rtx temp1, rtx temp2)
3560 {
3561 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3562 temp1, temp2, false);
3563 }
3564
3565 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3566 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3567 if TEMP1 already contains abs (DELTA). */
3568
3569 static inline void
3570 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3571 {
3572 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3573 temp1, temp2, true, emit_move_imm);
3574 }
3575
3576 /* Subtract DELTA from the stack pointer, marking the instructions
3577 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3578 if nonnull. */
3579
3580 static inline void
3581 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3582 bool emit_move_imm = true)
3583 {
3584 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3585 temp1, temp2, frame_related_p, emit_move_imm);
3586 }
3587
3588 /* Set DEST to (vec_series BASE STEP). */
3589
3590 static void
3591 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3592 {
3593 machine_mode mode = GET_MODE (dest);
3594 scalar_mode inner = GET_MODE_INNER (mode);
3595
3596 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3597 if (!aarch64_sve_index_immediate_p (base))
3598 base = force_reg (inner, base);
3599 if (!aarch64_sve_index_immediate_p (step))
3600 step = force_reg (inner, step);
3601
3602 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3603 }
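
/* For example (illustrative only), BASE = 0 and STEP = 1 for an .S
   destination ultimately become the SVE INDEX instruction:

	index	z0.s, #0, #1

   giving lanes { 0, 1, 2, 3, ... } up to the vector length.  Operands
   outside [-16, 15] are first forced into scalar registers, giving the
   register form "index z0.s, w1, w2" instead.  */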
3604
3605 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3606 register of mode MODE. Use TARGET for the result if it's nonnull
3607 and convenient.
3608
3609 The two vector modes must have the same element mode. The behavior
3610 is to duplicate architectural lane N of SRC into architectural lanes
3611 N + I * STEP of the result. On big-endian targets, architectural
3612 lane 0 of an Advanced SIMD vector is the last element of the vector
3613 in memory layout, so for big-endian targets this operation has the
3614 effect of reversing SRC before duplicating it. Callers need to
3615 account for this. */
3616
3617 rtx
3618 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3619 {
3620 machine_mode src_mode = GET_MODE (src);
3621 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3622 insn_code icode = (BYTES_BIG_ENDIAN
3623 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3624 : code_for_aarch64_vec_duplicate_vq_le (mode));
3625
3626 unsigned int i = 0;
3627 expand_operand ops[3];
3628 create_output_operand (&ops[i++], target, mode);
3629 create_output_operand (&ops[i++], src, src_mode);
3630 if (BYTES_BIG_ENDIAN)
3631 {
3632 /* Create a PARALLEL describing the reversal of SRC. */
3633 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3634 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3635 nelts_per_vq - 1, -1);
3636 create_fixed_operand (&ops[i++], sel);
3637 }
3638 expand_insn (icode, i, ops);
3639 return ops[0].value;
3640 }
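
/* On little-endian targets the instruction behind this is typically the
   SVE indexed DUP (an illustrative sketch; the exact pattern depends on
   the element mode):

	dup	z0.q, z1.q[0]

   which copies the low 128 bits of z1 into every quadword of z0.  The
   big-endian pattern additionally performs the lane reversal described
   above.  */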
3641
3642 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3643 the memory image into DEST. Return true on success. */
3644
3645 static bool
3646 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3647 {
3648 src = force_const_mem (GET_MODE (src), src);
3649 if (!src)
3650 return false;
3651
3652 /* Make sure that the address is legitimate. */
3653 if (!aarch64_sve_ld1rq_operand_p (src))
3654 {
3655 rtx addr = force_reg (Pmode, XEXP (src, 0));
3656 src = replace_equiv_address (src, addr);
3657 }
3658
3659 machine_mode mode = GET_MODE (dest);
3660 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3661 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3662 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3663 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3664 return true;
3665 }
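
/* For example (illustrative), for an .S destination the emitted
   sequence is roughly:

	ptrue	p0.s
	ld1rqw	z0.s, p0/z, [x0]

   which replicates the 16-byte block at [x0] across the whole of z0.  */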
3666
3667 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3668 SVE data mode and isn't a legitimate constant. Use TARGET for the
3669 result if convenient.
3670
3671 The returned register can have whatever mode seems most natural
3672 given the contents of SRC. */
3673
3674 static rtx
3675 aarch64_expand_sve_const_vector (rtx target, rtx src)
3676 {
3677 machine_mode mode = GET_MODE (src);
3678 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3679 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3680 scalar_mode elt_mode = GET_MODE_INNER (mode);
3681 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3682 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3683
3684 if (nelts_per_pattern == 1 && encoded_bits == 128)
3685 {
3686 /* The constant is a duplicated quadword but can't be narrowed
3687 beyond a quadword. Get the memory image of the first quadword
3688 as a 128-bit vector and try using LD1RQ to load it from memory.
3689
3690 The effect for both endiannesses is to load memory lane N into
3691 architectural lanes N + I * STEP of the result. On big-endian
3692 targets, the layout of the 128-bit vector in an Advanced SIMD
3693 register would be different from its layout in an SVE register,
3694 but this 128-bit vector is a memory value only. */
3695 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3696 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3697 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3698 return target;
3699 }
3700
3701 if (nelts_per_pattern == 1 && encoded_bits < 128)
3702 {
3703 /* The vector is a repeating sequence of 64 bits or fewer.
3704 See if we can load them using an Advanced SIMD move and then
3705 duplicate it to fill a vector. This is better than using a GPR
3706 move because it keeps everything in the same register file. */
3707 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3708 rtx_vector_builder builder (vq_mode, npatterns, 1);
3709 for (unsigned int i = 0; i < npatterns; ++i)
3710 {
3711 /* We want memory lane N to go into architectural lane N,
3712 so reverse for big-endian targets. The DUP .Q pattern
3713 has a compensating reverse built-in. */
3714 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3715 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3716 }
3717 rtx vq_src = builder.build ();
3718 if (aarch64_simd_valid_immediate (vq_src, NULL))
3719 {
3720 vq_src = force_reg (vq_mode, vq_src);
3721 return aarch64_expand_sve_dupq (target, mode, vq_src);
3722 }
3723
3724 /* Get an integer representation of the repeating part of Advanced
3725 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3726 which for big-endian targets is lane-swapped wrt a normal
3727 Advanced SIMD vector. This means that for both endiannesses,
3728 memory lane N of SVE vector SRC corresponds to architectural
3729 lane N of a register holding VQ_SRC. This in turn means that
3730 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3731 as a single 128-bit value) and thus that memory lane 0 of SRC is
3732 in the lsb of the integer. Duplicating the integer therefore
3733 ensures that memory lane N of SRC goes into architectural lane
3734 N + I * STEP of the SVE register. */
3735 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3736 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3737 if (elt_value)
3738 {
3739 /* Pretend that we had a vector of INT_MODE to start with. */
3740 elt_mode = int_mode;
3741 mode = aarch64_full_sve_mode (int_mode).require ();
3742
3743 /* If the integer can be moved into a general register by a
3744 single instruction, do that and duplicate the result. */
3745 if (CONST_INT_P (elt_value)
3746 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3747 {
3748 elt_value = force_reg (elt_mode, elt_value);
3749 return expand_vector_broadcast (mode, elt_value);
3750 }
3751 }
3752 else if (npatterns == 1)
3753 /* We're duplicating a single value, but can't do better than
3754 force it to memory and load from there. This handles things
3755 like symbolic constants. */
3756 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3757
3758 if (elt_value)
3759 {
3760 /* Load the element from memory if we can, otherwise move it into
3761 a register and use a DUP. */
3762 rtx op = force_const_mem (elt_mode, elt_value);
3763 if (!op)
3764 op = force_reg (elt_mode, elt_value);
3765 return expand_vector_broadcast (mode, op);
3766 }
3767 }
3768
3769 /* Try using INDEX. */
3770 rtx base, step;
3771 if (const_vec_series_p (src, &base, &step))
3772 {
3773 aarch64_expand_vec_series (target, base, step);
3774 return target;
3775 }
3776
3777 /* From here on, it's better to force the whole constant to memory
3778 if we can. */
3779 if (GET_MODE_NUNITS (mode).is_constant ())
3780 return NULL_RTX;
3781
3782 /* Expand each pattern individually. */
3783 gcc_assert (npatterns > 1);
3784 rtx_vector_builder builder;
3785 auto_vec<rtx, 16> vectors (npatterns);
3786 for (unsigned int i = 0; i < npatterns; ++i)
3787 {
3788 builder.new_vector (mode, 1, nelts_per_pattern);
3789 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3790 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3791 vectors.quick_push (force_reg (mode, builder.build ()));
3792 }
3793
3794 /* Use permutes to interleave the separate vectors. */
3795 while (npatterns > 1)
3796 {
3797 npatterns /= 2;
3798 for (unsigned int i = 0; i < npatterns; ++i)
3799 {
3800 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3801 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3802 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3803 vectors[i] = tmp;
3804 }
3805 }
3806 gcc_assert (vectors[0] == target);
3807 return target;
3808 }
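
/* Illustrative example of the interleaving step above: a constant with
   two patterns { X, Y, X, Y, ... } is first built as the two
   single-pattern vectors { X, X, ... } and { Y, Y, ... }, and a single

	zip1	z0.s, z1.s, z2.s

   interleaves them back into { X, Y, X, Y, ... }.  Four patterns need
   two rounds of ZIP1, and so on, halving NPATTERNS each time.  */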
3809
3810 /* Use WHILE to set a predicate register of mode MODE in which the first
3811 VL bits are set and the rest are clear. Use TARGET for the register
3812 if it's nonnull and convenient. */
3813
3814 static rtx
3815 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3816 unsigned int vl)
3817 {
3818 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3819 target = aarch64_target_reg (target, mode);
3820 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3821 return target;
3822 }
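
/* For example (a sketch; register allocation will differ), setting the
   first three elements of a .S predicate becomes roughly:

	mov	x0, 3
	whilelo	p0.s, xzr, x0

   which sets lanes 0..2 of p0.s and clears the rest.  */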
3823
3824 static rtx
3825 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3826
3827 /* BUILDER is a constant predicate in which the index of every set bit
3828 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3829 by inverting every element at a multiple of ELT_SIZE and EORing the
3830 result with an ELT_SIZE PTRUE.
3831
3832 Return a register that contains the constant on success, otherwise
3833 return null. Use TARGET as the register if it is nonnull and
3834 convenient. */
3835
3836 static rtx
3837 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3838 unsigned int elt_size)
3839 {
3840 /* Invert every element at a multiple of ELT_SIZE, keeping the
3841 other bits zero. */
3842 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3843 builder.nelts_per_pattern ());
3844 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3845 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3846 inv_builder.quick_push (const1_rtx);
3847 else
3848 inv_builder.quick_push (const0_rtx);
3849 inv_builder.finalize ();
3850
3851 /* See if we can load the constant cheaply. */
3852 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3853 if (!inv)
3854 return NULL_RTX;
3855
3856 /* EOR the result with an ELT_SIZE PTRUE. */
3857 rtx mask = aarch64_ptrue_all (elt_size);
3858 mask = force_reg (VNx16BImode, mask);
3859 target = aarch64_target_reg (target, VNx16BImode);
3860 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3861 return target;
3862 }
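
/* Worked example (illustrative): the .B predicate { 0, 1, 1, 1, ... }
   starts with a clear element, so it is not a partial PTRUE.  Its
   inversion { 1, 0, 0, 0, ... } is cheap to load (a VL1 PTRUE), and
   EORing that with an all-true .B predicate recovers the original
   constant.  */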
3863
3864 /* BUILDER is a constant predicate in which the index of every set bit
3865 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3866 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3867 register on success, otherwise return null. Use TARGET as the register
3868 if nonnull and convenient. */
3869
3870 static rtx
3871 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3872 unsigned int elt_size,
3873 unsigned int permute_size)
3874 {
3875 /* We're going to split the constant into two new constants A and B,
3876 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3877 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3878
3879 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3880 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3881
3882 where _ indicates elements that will be discarded by the permute.
3883
3884 First calculate the ELT_SIZEs for A and B. */
3885 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3886 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3887 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3888 if (INTVAL (builder.elt (i)) != 0)
3889 {
3890 if (i & permute_size)
3891 b_elt_size |= i - permute_size;
3892 else
3893 a_elt_size |= i;
3894 }
3895 a_elt_size &= -a_elt_size;
3896 b_elt_size &= -b_elt_size;
3897
3898 /* Now construct the vectors themselves. */
3899 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3900 builder.nelts_per_pattern ());
3901 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3902 builder.nelts_per_pattern ());
3903 unsigned int nelts = builder.encoded_nelts ();
3904 for (unsigned int i = 0; i < nelts; ++i)
3905 if (i & (elt_size - 1))
3906 {
3907 a_builder.quick_push (const0_rtx);
3908 b_builder.quick_push (const0_rtx);
3909 }
3910 else if ((i & permute_size) == 0)
3911 {
3912 /* The A and B elements are significant. */
3913 a_builder.quick_push (builder.elt (i));
3914 b_builder.quick_push (builder.elt (i + permute_size));
3915 }
3916 else
3917 {
3918 /* The A and B elements are going to be discarded, so pick whatever
3919 is likely to give a nice constant. We are targeting element
3920 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3921 with the aim of each being a sequence of ones followed by
3922 a sequence of zeros. So:
3923
3924 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3925 duplicate the last X_ELT_SIZE element, to extend the
3926 current sequence of ones or zeros.
3927
3928 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3929 zero, so that the constant really does have X_ELT_SIZE and
3930 not a smaller size. */
3931 if (a_elt_size > permute_size)
3932 a_builder.quick_push (const0_rtx);
3933 else
3934 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3935 if (b_elt_size > permute_size)
3936 b_builder.quick_push (const0_rtx);
3937 else
3938 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3939 }
3940 a_builder.finalize ();
3941 b_builder.finalize ();
3942
3943 /* Try loading A into a register. */
3944 rtx_insn *last = get_last_insn ();
3945 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3946 if (!a)
3947 return NULL_RTX;
3948
3949 /* Try loading B into a register. */
3950 rtx b = a;
3951 if (a_builder != b_builder)
3952 {
3953 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3954 if (!b)
3955 {
3956 delete_insns_since (last);
3957 return NULL_RTX;
3958 }
3959 }
3960
3961 /* Emit the TRN1 itself. */
3962 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3963 target = aarch64_target_reg (target, mode);
3964 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3965 gen_lowpart (mode, a),
3966 gen_lowpart (mode, b)));
3967 return target;
3968 }
3969
3970 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3971 constant in BUILDER into an SVE predicate register. Return the register
3972 on success, otherwise return null. Use TARGET for the register if
3973 nonnull and convenient.
3974
3975 ALLOW_RECURSE_P is true if we can use methods that would call this
3976 function recursively. */
3977
3978 static rtx
3979 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3980 bool allow_recurse_p)
3981 {
3982 if (builder.encoded_nelts () == 1)
3983 /* A PFALSE or a PTRUE .B ALL. */
3984 return aarch64_emit_set_immediate (target, builder);
3985
3986 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3987 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3988 {
3989 /* If we can load the constant using PTRUE, use it as-is. */
3990 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3991 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3992 return aarch64_emit_set_immediate (target, builder);
3993
3994 /* Otherwise use WHILE to set the first VL bits. */
3995 return aarch64_sve_move_pred_via_while (target, mode, vl);
3996 }
3997
3998 if (!allow_recurse_p)
3999 return NULL_RTX;
4000
4001 /* Try inverting the vector in element size ELT_SIZE and then EORing
4002 the result with an ELT_SIZE PTRUE. */
4003 if (INTVAL (builder.elt (0)) == 0)
4004 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4005 elt_size))
4006 return res;
4007
4008 /* Try using TRN1 to permute two simpler constants. */
4009 for (unsigned int i = elt_size; i <= 8; i *= 2)
4010 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4011 elt_size, i))
4012 return res;
4013
4014 return NULL_RTX;
4015 }
4016
4017 /* Return an SVE predicate register that contains the VNx16BImode
4018 constant in BUILDER, without going through the move expanders.
4019
4020 The returned register can have whatever mode seems most natural
4021 given the contents of BUILDER. Use TARGET for the result if
4022 convenient. */
4023
4024 static rtx
4025 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4026 {
4027 /* Try loading the constant using pure predicate operations. */
4028 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4029 return res;
4030
4031 /* Try forcing the constant to memory. */
4032 if (builder.full_nelts ().is_constant ())
4033 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4034 {
4035 target = aarch64_target_reg (target, VNx16BImode);
4036 emit_move_insn (target, mem);
4037 return target;
4038 }
4039
4040 /* The last resort is to load the constant as an integer and then
4041 compare it against zero. Use -1 for set bits in order to increase
4042 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4043 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4044 builder.nelts_per_pattern ());
4045 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4046 int_builder.quick_push (INTVAL (builder.elt (i))
4047 ? constm1_rtx : const0_rtx);
4048 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4049 int_builder.build ());
4050 }
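
/* A sketch of the last-resort path (illustrative; the exact sequence
   depends on how the data vector itself is loaded):

	...load the 0/-1 byte vector into z0 (DUPM or literal pool)...
	ptrue	p1.b
	cmpne	p0.b, p1/z, z0.b, #0

   i.e. materialise the 0/-1 byte vector and turn it into a predicate
   by comparing against zero.  */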
4051
4052 /* Set DEST to immediate IMM. */
4053
4054 void
4055 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4056 {
4057 machine_mode mode = GET_MODE (dest);
4058
4059 /* Check on what type of symbol it is. */
4060 scalar_int_mode int_mode;
4061 if ((GET_CODE (imm) == SYMBOL_REF
4062 || GET_CODE (imm) == LABEL_REF
4063 || GET_CODE (imm) == CONST
4064 || GET_CODE (imm) == CONST_POLY_INT)
4065 && is_a <scalar_int_mode> (mode, &int_mode))
4066 {
4067 rtx mem;
4068 poly_int64 offset;
4069 HOST_WIDE_INT const_offset;
4070 enum aarch64_symbol_type sty;
4071
4072 /* If we have (const (plus symbol offset)), separate out the offset
4073 before we start classifying the symbol. */
4074 rtx base = strip_offset (imm, &offset);
4075
4076 /* We must always add an offset involving VL separately, rather than
4077 folding it into the relocation. */
4078 if (!offset.is_constant (&const_offset))
4079 {
4080 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4081 emit_insn (gen_rtx_SET (dest, imm));
4082 else
4083 {
4084 /* Do arithmetic on 32-bit values if the result is smaller
4085 than that. */
4086 if (partial_subreg_p (int_mode, SImode))
4087 {
4088 /* It is invalid to do symbol calculations in modes
4089 narrower than SImode. */
4090 gcc_assert (base == const0_rtx);
4091 dest = gen_lowpart (SImode, dest);
4092 int_mode = SImode;
4093 }
4094 if (base != const0_rtx)
4095 {
4096 base = aarch64_force_temporary (int_mode, dest, base);
4097 aarch64_add_offset (int_mode, dest, base, offset,
4098 NULL_RTX, NULL_RTX, false);
4099 }
4100 else
4101 aarch64_add_offset (int_mode, dest, base, offset,
4102 dest, NULL_RTX, false);
4103 }
4104 return;
4105 }
4106
4107 sty = aarch64_classify_symbol (base, const_offset);
4108 switch (sty)
4109 {
4110 case SYMBOL_FORCE_TO_MEM:
4111 if (const_offset != 0
4112 && targetm.cannot_force_const_mem (int_mode, imm))
4113 {
4114 gcc_assert (can_create_pseudo_p ());
4115 base = aarch64_force_temporary (int_mode, dest, base);
4116 aarch64_add_offset (int_mode, dest, base, const_offset,
4117 NULL_RTX, NULL_RTX, false);
4118 return;
4119 }
4120
4121 mem = force_const_mem (ptr_mode, imm);
4122 gcc_assert (mem);
4123
4124 /* If we aren't generating PC relative literals, then
4125 we need to expand the literal pool access carefully.
4126 This is something that needs to be done in a number
4127 of places, so could well live as a separate function. */
4128 if (!aarch64_pcrelative_literal_loads)
4129 {
4130 gcc_assert (can_create_pseudo_p ());
4131 base = gen_reg_rtx (ptr_mode);
4132 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4133 if (ptr_mode != Pmode)
4134 base = convert_memory_address (Pmode, base);
4135 mem = gen_rtx_MEM (ptr_mode, base);
4136 }
4137
4138 if (int_mode != ptr_mode)
4139 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4140
4141 emit_insn (gen_rtx_SET (dest, mem));
4142
4143 return;
4144
4145 case SYMBOL_SMALL_TLSGD:
4146 case SYMBOL_SMALL_TLSDESC:
4147 case SYMBOL_SMALL_TLSIE:
4148 case SYMBOL_SMALL_GOT_28K:
4149 case SYMBOL_SMALL_GOT_4G:
4150 case SYMBOL_TINY_GOT:
4151 case SYMBOL_TINY_TLSIE:
4152 if (const_offset != 0)
4153 {
4154 gcc_assert (can_create_pseudo_p ());
4155 base = aarch64_force_temporary (int_mode, dest, base);
4156 aarch64_add_offset (int_mode, dest, base, const_offset,
4157 NULL_RTX, NULL_RTX, false);
4158 return;
4159 }
4160 /* FALLTHRU */
4161
4162 case SYMBOL_SMALL_ABSOLUTE:
4163 case SYMBOL_TINY_ABSOLUTE:
4164 case SYMBOL_TLSLE12:
4165 case SYMBOL_TLSLE24:
4166 case SYMBOL_TLSLE32:
4167 case SYMBOL_TLSLE48:
4168 aarch64_load_symref_appropriately (dest, imm, sty);
4169 return;
4170
4171 default:
4172 gcc_unreachable ();
4173 }
4174 }
4175
4176 if (!CONST_INT_P (imm))
4177 {
4178 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4179 {
4180 /* Only the low bit of each .H, .S and .D element is defined,
4181 so we can set the upper bits to whatever we like. If the
4182 predicate is all-true in MODE, prefer to set all the undefined
4183 bits as well, so that we can share a single .B predicate for
4184 all modes. */
4185 if (imm == CONSTM1_RTX (mode))
4186 imm = CONSTM1_RTX (VNx16BImode);
4187
4188 /* All methods for constructing predicate modes wider than VNx16BI
4189 will set the upper bits of each element to zero. Expose this
4190 by moving such constants as a VNx16BI, so that all bits are
4191 significant and so that constants for different modes can be
4192 shared. The wider constant will still be available as a
4193 REG_EQUAL note. */
4194 rtx_vector_builder builder;
4195 if (aarch64_get_sve_pred_bits (builder, imm))
4196 {
4197 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4198 if (dest != res)
4199 emit_move_insn (dest, gen_lowpart (mode, res));
4200 return;
4201 }
4202 }
4203
4204 if (GET_CODE (imm) == HIGH
4205 || aarch64_simd_valid_immediate (imm, NULL))
4206 {
4207 emit_insn (gen_rtx_SET (dest, imm));
4208 return;
4209 }
4210
4211 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4212 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4213 {
4214 if (dest != res)
4215 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4216 return;
4217 }
4218
4219 rtx mem = force_const_mem (mode, imm);
4220 gcc_assert (mem);
4221 emit_move_insn (dest, mem);
4222 return;
4223 }
4224
4225 aarch64_internal_mov_immediate (dest, imm, true,
4226 as_a <scalar_int_mode> (mode));
4227 }
4228
4229 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4230 that is known to contain PTRUE. */
4231
4232 void
4233 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4234 {
4235 expand_operand ops[3];
4236 machine_mode mode = GET_MODE (dest);
4237 create_output_operand (&ops[0], dest, mode);
4238 create_input_operand (&ops[1], pred, GET_MODE (pred));
4239 create_input_operand (&ops[2], src, mode);
4240 temporary_volatile_ok v (true);
4241 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4242 }
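
/* A minimal usage sketch (hypothetical operands, for illustration):

     rtx ptrue = aarch64_ptrue_reg (VNx4BImode);
     aarch64_emit_sve_pred_move (dest, ptrue, src);

   where DEST is a VNx4SImode register and SRC a VNx4SImode memory
   operand; the expander then emits the predicated LD1W form rather
   than an LDR.  */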
4243
4244 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4245 operand is in memory. In this case we need to use the predicated LD1
4246 and ST1 instead of LDR and STR, both for correctness on big-endian
4247 targets and because LD1 and ST1 support a wider range of addressing modes.
4248 PRED_MODE is the mode of the predicate.
4249
4250 See the comment at the head of aarch64-sve.md for details about the
4251 big-endian handling. */
4252
4253 void
4254 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4255 {
4256 machine_mode mode = GET_MODE (dest);
4257 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4258 if (!register_operand (src, mode)
4259 && !register_operand (dest, mode))
4260 {
4261 rtx tmp = gen_reg_rtx (mode);
4262 if (MEM_P (src))
4263 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4264 else
4265 emit_move_insn (tmp, src);
4266 src = tmp;
4267 }
4268 aarch64_emit_sve_pred_move (dest, ptrue, src);
4269 }
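
/* For example (illustrative), a memory-to-memory VNx4SI move expands
   to roughly:

	ptrue	p0.s
	ld1w	z0.s, p0/z, [x0]
	st1w	z0.s, p0, [x1]

   rather than LDR/STR, which keeps big-endian lane ordering correct
   and allows the full set of SVE addressing modes.  */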
4270
4271 /* Called only on big-endian targets. See whether an SVE vector move
4272 from SRC to DEST is effectively a REV[BHW] instruction, because at
4273 least one operand is a subreg of an SVE vector that has wider or
4274 narrower elements. Return true and emit the instruction if so.
4275
4276 For example:
4277
4278 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4279
4280 represents a VIEW_CONVERT between the following vectors, viewed
4281 in memory order:
4282
4283 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4284 R1: { [0], [1], [2], [3], ... }
4285
4286 The high part of lane X in R2 should therefore correspond to lane X*2
4287 of R1, but the register representations are:
4288
4289 msb lsb
4290 R2: ...... [1].high [1].low [0].high [0].low
4291 R1: ...... [3] [2] [1] [0]
4292
4293 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4294 We therefore need a reverse operation to swap the high and low values
4295 around.
4296
4297 This is purely an optimization. Without it we would spill the
4298 subreg operand to the stack in one mode and reload it in the
4299 other mode, which has the same effect as the REV. */
4300
4301 bool
4302 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4303 {
4304 gcc_assert (BYTES_BIG_ENDIAN);
4305 if (GET_CODE (dest) == SUBREG)
4306 dest = SUBREG_REG (dest);
4307 if (GET_CODE (src) == SUBREG)
4308 src = SUBREG_REG (src);
4309
4310 /* The optimization handles two single SVE REGs with different element
4311 sizes. */
4312 if (!REG_P (dest)
4313 || !REG_P (src)
4314 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4315 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4316 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4317 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4318 return false;
4319
4320 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4321 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4322 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4323 UNSPEC_REV_SUBREG);
4324 emit_insn (gen_rtx_SET (dest, unspec));
4325 return true;
4326 }
4327
4328 /* Return a copy of X with mode MODE, without changing its other
4329 attributes. Unlike gen_lowpart, this doesn't care whether the
4330 mode change is valid. */
4331
4332 static rtx
4333 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4334 {
4335 if (GET_MODE (x) == mode)
4336 return x;
4337
4338 x = shallow_copy_rtx (x);
4339 set_mode_and_regno (x, mode, REGNO (x));
4340 return x;
4341 }
4342
4343 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4344 stored in wider integer containers. */
4345
4346 static unsigned int
4347 aarch64_sve_rev_unspec (machine_mode mode)
4348 {
4349 switch (GET_MODE_UNIT_SIZE (mode))
4350 {
4351 case 1: return UNSPEC_REVB;
4352 case 2: return UNSPEC_REVH;
4353 case 4: return UNSPEC_REVW;
4354 }
4355 gcc_unreachable ();
4356 }
4357
4358 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4359 operands. */
4360
4361 void
4362 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4363 {
4364 /* Decide which REV operation we need. The mode with wider elements
4365 determines the mode of the operands and the mode with the narrower
4366 elements determines the reverse width. */
4367 machine_mode mode_with_wider_elts = GET_MODE (dest);
4368 machine_mode mode_with_narrower_elts = GET_MODE (src);
4369 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4370 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4371 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4372
4373 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4374 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4375 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4376
4377 /* Get the operands in the appropriate modes and emit the instruction. */
4378 ptrue = gen_lowpart (pred_mode, ptrue);
4379 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4380 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4381 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4382 dest, ptrue, src));
4383 }
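
/* For example (illustrative), a big-endian move between a VNx8HI
   register and a VNx16QI subreg splits to roughly:

	ptrue	p0.h
	revb	z0.h, p0/m, z1.h

   i.e. a byte reversal within each halfword, which is exactly the lane
   swap described above.  */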
4384
4385 static bool
4386 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4387 tree exp ATTRIBUTE_UNUSED)
4388 {
4389 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4390 return false;
4391
4392 return true;
4393 }
4394
4395 /* Implement TARGET_PASS_BY_REFERENCE. */
4396
4397 static bool
4398 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4399 machine_mode mode,
4400 const_tree type,
4401 bool named ATTRIBUTE_UNUSED)
4402 {
4403 HOST_WIDE_INT size;
4404 machine_mode dummymode;
4405 int nregs;
4406
4407 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4408 if (mode == BLKmode && type)
4409 size = int_size_in_bytes (type);
4410 else
4411 /* No frontends can create types with variable-sized modes, so we
4412 shouldn't be asked to pass or return them. */
4413 size = GET_MODE_SIZE (mode).to_constant ();
4414
4415 /* Aggregates are passed by reference based on their size. */
4416 if (type && AGGREGATE_TYPE_P (type))
4417 {
4418 size = int_size_in_bytes (type);
4419 }
4420
4421 /* Variable sized arguments are always returned by reference. */
4422 if (size < 0)
4423 return true;
4424
4425 /* Can this be a candidate to be passed in fp/simd register(s)? */
4426 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4427 &dummymode, &nregs,
4428 NULL))
4429 return false;
4430
4431 /* Arguments which are variable sized or larger than 2 registers are
4432 passed by reference unless they are a homogeneous floating-point
4433 aggregate. */
4434 return size > 2 * UNITS_PER_WORD;
4435 }
4436
4437 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4438 static bool
4439 aarch64_return_in_msb (const_tree valtype)
4440 {
4441 machine_mode dummy_mode;
4442 int dummy_int;
4443
4444 /* Never happens in little-endian mode. */
4445 if (!BYTES_BIG_ENDIAN)
4446 return false;
4447
4448 /* Only composite types smaller than or equal to 16 bytes can
4449 be potentially returned in registers. */
4450 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4451 || int_size_in_bytes (valtype) <= 0
4452 || int_size_in_bytes (valtype) > 16)
4453 return false;
4454
4455 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4456 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4457 is always passed/returned in the least significant bits of fp/simd
4458 register(s). */
4459 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4460 &dummy_mode, &dummy_int, NULL))
4461 return false;
4462
4463 return true;
4464 }
4465
4466 /* Implement TARGET_FUNCTION_VALUE.
4467 Define how to find the value returned by a function. */
4468
4469 static rtx
4470 aarch64_function_value (const_tree type, const_tree func,
4471 bool outgoing ATTRIBUTE_UNUSED)
4472 {
4473 machine_mode mode;
4474 int unsignedp;
4475 int count;
4476 machine_mode ag_mode;
4477
4478 mode = TYPE_MODE (type);
4479 if (INTEGRAL_TYPE_P (type))
4480 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4481
4482 if (aarch64_return_in_msb (type))
4483 {
4484 HOST_WIDE_INT size = int_size_in_bytes (type);
4485
4486 if (size % UNITS_PER_WORD != 0)
4487 {
4488 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4489 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4490 }
4491 }
4492
4493 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4494 &ag_mode, &count, NULL))
4495 {
4496 if (!aarch64_composite_type_p (type, mode))
4497 {
4498 gcc_assert (count == 1 && mode == ag_mode);
4499 return gen_rtx_REG (mode, V0_REGNUM);
4500 }
4501 else
4502 {
4503 int i;
4504 rtx par;
4505
4506 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4507 for (i = 0; i < count; i++)
4508 {
4509 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4510 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4511 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4512 XVECEXP (par, 0, i) = tmp;
4513 }
4514 return par;
4515 }
4516 }
4517 else
4518 return gen_rtx_REG (mode, R0_REGNUM);
4519 }
4520
4521 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4522 Return true if REGNO is the number of a hard register in which the values
4523 of called function may come back. */
4524
4525 static bool
4526 aarch64_function_value_regno_p (const unsigned int regno)
4527 {
4528 /* Maximum of 16 bytes can be returned in the general registers. Examples
4529 of 16-byte return values are: 128-bit integers and 16-byte small
4530 structures (excluding homogeneous floating-point aggregates). */
4531 if (regno == R0_REGNUM || regno == R1_REGNUM)
4532 return true;
4533
4534 /* Up to four fp/simd registers can return a function value, e.g. a
4535 homogeneous floating-point aggregate having four members. */
4536 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4537 return TARGET_FLOAT;
4538
4539 return false;
4540 }
4541
4542 /* Implement TARGET_RETURN_IN_MEMORY.
4543
4544 If the type T of the result of a function is such that
4545 void func (T arg)
4546 would require that arg be passed as a value in a register (or set of
4547 registers) according to the parameter passing rules, then the result
4548 is returned in the same registers as would be used for such an
4549 argument. */
4550
4551 static bool
4552 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4553 {
4554 HOST_WIDE_INT size;
4555 machine_mode ag_mode;
4556 int count;
4557
4558 if (!AGGREGATE_TYPE_P (type)
4559 && TREE_CODE (type) != COMPLEX_TYPE
4560 && TREE_CODE (type) != VECTOR_TYPE)
4561 /* Simple scalar types are always returned in registers. */
4562 return false;
4563
4564 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4565 type,
4566 &ag_mode,
4567 &count,
4568 NULL))
4569 return false;
4570
4571 /* Types larger than 2 registers are returned in memory. */
4572 size = int_size_in_bytes (type);
4573 return (size < 0 || size > 2 * UNITS_PER_WORD);
4574 }
4575
4576 static bool
4577 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4578 const_tree type, int *nregs)
4579 {
4580 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4581 return aarch64_vfp_is_call_or_return_candidate (mode,
4582 type,
4583 &pcum->aapcs_vfp_rmode,
4584 nregs,
4585 NULL);
4586 }
4587
4588 /* Given MODE and TYPE of a function argument, return the alignment in
4589 bits. The idea is to suppress any stronger alignment requested by
4590 the user and opt for the natural alignment (specified in AAPCS64 \S
4591 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4592 calculated in versions of GCC prior to GCC-9. This is a helper
4593 function for local use only. */
4594
4595 static unsigned int
4596 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4597 bool *abi_break)
4598 {
4599 *abi_break = false;
4600 if (!type)
4601 return GET_MODE_ALIGNMENT (mode);
4602
4603 if (integer_zerop (TYPE_SIZE (type)))
4604 return 0;
4605
4606 gcc_assert (TYPE_MODE (type) == mode);
4607
4608 if (!AGGREGATE_TYPE_P (type))
4609 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4610
4611 if (TREE_CODE (type) == ARRAY_TYPE)
4612 return TYPE_ALIGN (TREE_TYPE (type));
4613
4614 unsigned int alignment = 0;
4615 unsigned int bitfield_alignment = 0;
4616 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4617 if (TREE_CODE (field) == FIELD_DECL)
4618 {
4619 alignment = std::max (alignment, DECL_ALIGN (field));
4620 if (DECL_BIT_FIELD_TYPE (field))
4621 bitfield_alignment
4622 = std::max (bitfield_alignment,
4623 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4624 }
4625
4626 if (bitfield_alignment > alignment)
4627 {
4628 *abi_break = true;
4629 return bitfield_alignment;
4630 }
4631
4632 return alignment;
4633 }
4634
4635 /* Layout a function argument according to the AAPCS64 rules. The rule
4636 numbers refer to the rule numbers in the AAPCS64. */
4637
4638 static void
4639 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4640 const_tree type,
4641 bool named ATTRIBUTE_UNUSED)
4642 {
4643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4644 int ncrn, nvrn, nregs;
4645 bool allocate_ncrn, allocate_nvrn;
4646 HOST_WIDE_INT size;
4647 bool abi_break;
4648
4649 /* We need to do this once per argument. */
4650 if (pcum->aapcs_arg_processed)
4651 return;
4652
4653 pcum->aapcs_arg_processed = true;
4654
4655 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4656 if (type)
4657 size = int_size_in_bytes (type);
4658 else
4659 /* No frontends can create types with variable-sized modes, so we
4660 shouldn't be asked to pass or return them. */
4661 size = GET_MODE_SIZE (mode).to_constant ();
4662 size = ROUND_UP (size, UNITS_PER_WORD);
4663
4664 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4665 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4666 mode,
4667 type,
4668 &nregs);
4669
4670 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4671 The following code thus handles passing by SIMD/FP registers first. */
4672
4673 nvrn = pcum->aapcs_nvrn;
4674
4675 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4676 and homogeneous short-vector aggregates (HVA). */
4677 if (allocate_nvrn)
4678 {
4679 if (!TARGET_FLOAT)
4680 aarch64_err_no_fpadvsimd (mode);
4681
4682 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4683 {
4684 pcum->aapcs_nextnvrn = nvrn + nregs;
4685 if (!aarch64_composite_type_p (type, mode))
4686 {
4687 gcc_assert (nregs == 1);
4688 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4689 }
4690 else
4691 {
4692 rtx par;
4693 int i;
4694 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4695 for (i = 0; i < nregs; i++)
4696 {
4697 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4698 V0_REGNUM + nvrn + i);
4699 rtx offset = gen_int_mode
4700 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4701 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4702 XVECEXP (par, 0, i) = tmp;
4703 }
4704 pcum->aapcs_reg = par;
4705 }
4706 return;
4707 }
4708 else
4709 {
4710 /* C.3 NSRN is set to 8. */
4711 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4712 goto on_stack;
4713 }
4714 }
4715
4716 ncrn = pcum->aapcs_ncrn;
4717 nregs = size / UNITS_PER_WORD;
4718
4719 /* C6 - C9, though the sign and zero extension semantics are
4720 handled elsewhere. This is the case where the argument fits
4721 entirely in general registers. */
4722 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4723 {
4724 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4725
4726 /* C.8 if the argument has an alignment of 16 then the NGRN is
4727 rounded up to the next even number. */
4728 if (nregs == 2
4729 && ncrn % 2
4730 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4731 comparison is there because for > 16 * BITS_PER_UNIT
4732 alignment nregs should be > 2 and therefore it should be
4733 passed by reference rather than by value. */
4734 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4735 == 16 * BITS_PER_UNIT))
4736 {
4737 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4738 inform (input_location, "parameter passing for argument of type "
4739 "%qT changed in GCC 9.1", type);
4740 ++ncrn;
4741 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4742 }
4743
4744 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4745 A reg is still generated for it, but the caller should be smart
4746 enough not to use it. */
4747 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4748 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4749 else
4750 {
4751 rtx par;
4752 int i;
4753
4754 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4755 for (i = 0; i < nregs; i++)
4756 {
4757 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4758 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4759 GEN_INT (i * UNITS_PER_WORD));
4760 XVECEXP (par, 0, i) = tmp;
4761 }
4762 pcum->aapcs_reg = par;
4763 }
4764
4765 pcum->aapcs_nextncrn = ncrn + nregs;
4766 return;
4767 }
4768
4769 /* C.11 */
4770 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4771
4772 /* The argument is passed on stack; record the needed number of words for
4773 this argument and align the total size if necessary. */
4774 on_stack:
4775 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4776
4777 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4778 == 16 * BITS_PER_UNIT)
4779 {
4780 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4781 if (pcum->aapcs_stack_size != new_size)
4782 {
4783 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4784 inform (input_location, "parameter passing for argument of type "
4785 "%qT changed in GCC 9.1", type);
4786 pcum->aapcs_stack_size = new_size;
4787 }
4788 }
4789 return;
4790 }
4791
4792 /* Implement TARGET_FUNCTION_ARG. */
4793
4794 static rtx
4795 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4796 const_tree type, bool named)
4797 {
4798 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4799 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4800
4801 if (mode == VOIDmode)
4802 return NULL_RTX;
4803
4804 aarch64_layout_arg (pcum_v, mode, type, named);
4805 return pcum->aapcs_reg;
4806 }
4807
4808 void
4809 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4810 const_tree fntype ATTRIBUTE_UNUSED,
4811 rtx libname ATTRIBUTE_UNUSED,
4812 const_tree fndecl ATTRIBUTE_UNUSED,
4813 unsigned n_named ATTRIBUTE_UNUSED)
4814 {
4815 pcum->aapcs_ncrn = 0;
4816 pcum->aapcs_nvrn = 0;
4817 pcum->aapcs_nextncrn = 0;
4818 pcum->aapcs_nextnvrn = 0;
4819 pcum->pcs_variant = ARM_PCS_AAPCS64;
4820 pcum->aapcs_reg = NULL_RTX;
4821 pcum->aapcs_arg_processed = false;
4822 pcum->aapcs_stack_words = 0;
4823 pcum->aapcs_stack_size = 0;
4824
4825 if (!TARGET_FLOAT
4826 && fndecl && TREE_PUBLIC (fndecl)
4827 && fntype && fntype != error_mark_node)
4828 {
4829 const_tree type = TREE_TYPE (fntype);
4830 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4831 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4832 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4833 &mode, &nregs, NULL))
4834 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4835 }
4836 return;
4837 }
4838
4839 static void
4840 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4841 machine_mode mode,
4842 const_tree type,
4843 bool named)
4844 {
4845 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4846 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4847 {
4848 aarch64_layout_arg (pcum_v, mode, type, named);
4849 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4850 != (pcum->aapcs_stack_words != 0));
4851 pcum->aapcs_arg_processed = false;
4852 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4853 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4854 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4855 pcum->aapcs_stack_words = 0;
4856 pcum->aapcs_reg = NULL_RTX;
4857 }
4858 }
4859
4860 bool
4861 aarch64_function_arg_regno_p (unsigned regno)
4862 {
4863 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4864 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4865 }
4866
4867 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4868 PARM_BOUNDARY bits of alignment, but will be given anything up
4869 to STACK_BOUNDARY bits if the type requires it. This makes sure
4870 that both before and after the layout of each argument, the Next
4871 Stacked Argument Address (NSAA) will have a minimum alignment of
4872 8 bytes. */
4873
4874 static unsigned int
4875 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4876 {
4877 bool abi_break;
4878 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4879 &abi_break);
4880 if (abi_break && warn_psabi)
4881 inform (input_location, "parameter passing for argument of type "
4882 "%qT changed in GCC 9.1", type);
4883
4884 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4885 }
4886
4887 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4888
4889 static fixed_size_mode
4890 aarch64_get_reg_raw_mode (int regno)
4891 {
4892 if (TARGET_SVE && FP_REGNUM_P (regno))
4893 /* Don't use the SVE part of the register for __builtin_apply and
4894 __builtin_return. The SVE registers aren't used by the normal PCS,
4895 so using them there would be a waste of time. The PCS extensions
4896 for SVE types are fundamentally incompatible with the
4897 __builtin_return/__builtin_apply interface. */
4898 return as_a <fixed_size_mode> (V16QImode);
4899 return default_get_reg_raw_mode (regno);
4900 }
4901
4902 /* Implement TARGET_FUNCTION_ARG_PADDING.
4903
4904 Small aggregate types are placed in the lowest memory address.
4905
4906 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4907
4908 static pad_direction
4909 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4910 {
4911 /* On little-endian targets, the least significant byte of every stack
4912 argument is passed at the lowest byte address of the stack slot. */
4913 if (!BYTES_BIG_ENDIAN)
4914 return PAD_UPWARD;
4915
4916 /* Otherwise, integral, floating-point and pointer types are padded downward:
4917 the least significant byte of a stack argument is passed at the highest
4918 byte address of the stack slot. */
4919 if (type
4920 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4921 || POINTER_TYPE_P (type))
4922 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4923 return PAD_DOWNWARD;
4924
4925 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4926 return PAD_UPWARD;
4927 }
4928
4929 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4930
4931 It specifies padding for the last (may also be the only)
4932 element of a block move between registers and memory. If
4933 assuming the block is in the memory, padding upward means that
4934 the last element is padded after its highest significant byte,
4935 while in downward padding, the last element is padded at
4936 its least significant byte side.
4937
4938 Small aggregates and small complex types are always padded
4939 upwards.
4940
4941 We don't need to worry about homogeneous floating-point or
4942 short-vector aggregates; their move is not affected by the
4943 padding direction determined here. Regardless of endianness,
4944 each element of such an aggregate is put in the least
4945 significant bits of a fp/simd register.
4946
4947 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4948 register has useful data, and return the opposite if the most
4949 significant byte does. */
4950
4951 bool
4952 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4953 bool first ATTRIBUTE_UNUSED)
4954 {
4955
4956 /* Small composite types are always padded upward. */
4957 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4958 {
4959 HOST_WIDE_INT size;
4960 if (type)
4961 size = int_size_in_bytes (type);
4962 else
4963 /* No frontends can create types with variable-sized modes, so we
4964 shouldn't be asked to pass or return them. */
4965 size = GET_MODE_SIZE (mode).to_constant ();
4966 if (size < 2 * UNITS_PER_WORD)
4967 return true;
4968 }
4969
4970 /* Otherwise, use the default padding. */
4971 return !BYTES_BIG_ENDIAN;
4972 }
4973
4974 static scalar_int_mode
4975 aarch64_libgcc_cmp_return_mode (void)
4976 {
4977 return SImode;
4978 }
4979
4980 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4981
4982 /* We use the 12-bit shifted immediate arithmetic instructions so values
4983 must be multiple of (1 << 12), i.e. 4096. */
4984 #define ARITH_FACTOR 4096
4985
4986 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4987 #error Cannot use simple address calculation for stack probing
4988 #endif
4989
4990 /* The pair of scratch registers used for stack probing. */
4991 #define PROBE_STACK_FIRST_REG R9_REGNUM
4992 #define PROBE_STACK_SECOND_REG R10_REGNUM
4993
4994 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4995 inclusive. These are offsets from the current stack pointer. */
4996
4997 static void
4998 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4999 {
5000 HOST_WIDE_INT size;
5001 if (!poly_size.is_constant (&size))
5002 {
5003 sorry ("stack probes for SVE frames");
5004 return;
5005 }
5006
5007 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5008
5009 /* See the same assertion on PROBE_INTERVAL above. */
5010 gcc_assert ((first % ARITH_FACTOR) == 0);
5011
5012 /* See if we have a constant small number of probes to generate. If so,
5013 that's the easy case. */
5014 if (size <= PROBE_INTERVAL)
5015 {
5016 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5017
5018 emit_set_insn (reg1,
5019 plus_constant (Pmode,
5020 stack_pointer_rtx, -(first + base)));
5021 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5022 }
5023
5024 /* The run-time loop is made up of 8 insns in the generic case while the
5025 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5026 else if (size <= 4 * PROBE_INTERVAL)
5027 {
5028 HOST_WIDE_INT i, rem;
5029
5030 emit_set_insn (reg1,
5031 plus_constant (Pmode,
5032 stack_pointer_rtx,
5033 -(first + PROBE_INTERVAL)));
5034 emit_stack_probe (reg1);
5035
5036 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5037 it exceeds SIZE. If only two probes are needed, this will not
5038 generate any code. Then probe at FIRST + SIZE. */
5039 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5040 {
5041 emit_set_insn (reg1,
5042 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5043 emit_stack_probe (reg1);
5044 }
5045
5046 rem = size - (i - PROBE_INTERVAL);
5047 if (rem > 256)
5048 {
5049 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5050
5051 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5052 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5053 }
5054 else
5055 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5056 }
5057
5058 /* Otherwise, do the same as above, but in a loop. Note that we must be
5059 extra careful with variables wrapping around because we might be at
5060 the very top (or the very bottom) of the address space and we have
5061 to be able to handle this case properly; in particular, we use an
5062 equality test for the loop condition. */
5063 else
5064 {
5065 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5066
5067 /* Step 1: round SIZE to the previous multiple of the interval. */
5068
5069 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5070
5071
5072 /* Step 2: compute initial and final value of the loop counter. */
5073
5074 /* TEST_ADDR = SP + FIRST. */
5075 emit_set_insn (reg1,
5076 plus_constant (Pmode, stack_pointer_rtx, -first));
5077
5078 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5079 HOST_WIDE_INT adjustment = - (first + rounded_size);
5080 if (! aarch64_uimm12_shift (adjustment))
5081 {
5082 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5083 true, Pmode);
5084 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5085 }
5086 else
5087 emit_set_insn (reg2,
5088 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5089
5090 /* Step 3: the loop
5091
5092 do
5093 {
5094 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5095 probe at TEST_ADDR
5096 }
5097 while (TEST_ADDR != LAST_ADDR)
5098
5099 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5100 until it is equal to ROUNDED_SIZE. */
5101
5102 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5103
5104
5105 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5106 that SIZE is equal to ROUNDED_SIZE. */
5107
5108 if (size != rounded_size)
5109 {
5110 HOST_WIDE_INT rem = size - rounded_size;
5111
5112 if (rem > 256)
5113 {
5114 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5115
5116 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5117 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5118 }
5119 else
5120 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5121 }
5122 }
5123
5124 /* Make sure nothing is scheduled before we are done. */
5125 emit_insn (gen_blockage ());
5126 }
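
/* Worked example (assuming the default 4 KiB probe interval): with
   FIRST = 0 and SIZE = 10000 the second case above probes at SP - 4096
   and SP - 8192, leaving REM = 1808; since that exceeds 256 the final
   probe is emitted at SP - 10000 after a further 4096-byte adjustment,
   so every 4 KiB page in the range gets touched.  */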
5127
5128 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5129 absolute addresses. */
5130
5131 const char *
5132 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5133 {
5134 static int labelno = 0;
5135 char loop_lab[32];
5136 rtx xops[2];
5137
5138 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5139
5140 /* Loop. */
5141 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5142
5143 HOST_WIDE_INT stack_clash_probe_interval
5144 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5145
5146 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5147 xops[0] = reg1;
5148 HOST_WIDE_INT interval;
5149 if (flag_stack_clash_protection)
5150 interval = stack_clash_probe_interval;
5151 else
5152 interval = PROBE_INTERVAL;
5153
5154 gcc_assert (aarch64_uimm12_shift (interval));
5155 xops[1] = GEN_INT (interval);
5156
5157 output_asm_insn ("sub\t%0, %0, %1", xops);
5158
5159 /* If doing stack clash protection then we probe up by the ABI-specified
5160 amount. We do this because we're dropping full pages at a time in the
5161 loop. But if we're doing non-stack-clash probing, probe at SP 0. */
5162 if (flag_stack_clash_protection)
5163 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5164 else
5165 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5166
5167 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5168 by this amount for each iteration. */
5169 output_asm_insn ("str\txzr, [%0, %1]", xops);
5170
5171 /* Test if TEST_ADDR == LAST_ADDR. */
5172 xops[1] = reg2;
5173 output_asm_insn ("cmp\t%0, %1", xops);
5174
5175 /* Branch. */
5176 fputs ("\tb.ne\t", asm_out_file);
5177 assemble_name_raw (asm_out_file, loop_lab);
5178 fputc ('\n', asm_out_file);
5179
5180 return "";
5181 }
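
/* The emitted loop looks roughly like this (illustrative; shown for the
   non-stack-clash case with REG1 = x9 and REG2 = x10):

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0
 */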
5182
5183 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5184 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5185 of GUARD_SIZE. When a probe is emitted it is done at most
5186 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5187 at most MIN_PROBE_THRESHOLD. By the end of this function
5188 BASE = BASE - ADJUSTMENT. */
5189
5190 const char *
5191 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5192 rtx min_probe_threshold, rtx guard_size)
5193 {
5194 /* This function is not allowed to use any instruction generation function
5195 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5196 so instead emit the code you want using output_asm_insn. */
5197 gcc_assert (flag_stack_clash_protection);
5198 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5199 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5200
5201 /* The minimum required allocation before the residual requires probing. */
5202 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5203
5204 /* Clamp the value down to the nearest value that can be used with a cmp. */
5205 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5206 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5207
5208 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5209 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5210
5211 static int labelno = 0;
5212 char loop_start_lab[32];
5213 char loop_end_lab[32];
5214 rtx xops[2];
5215
5216 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5217 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5218
5219 /* Emit loop start label. */
5220 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5221
5222 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5223 xops[0] = adjustment;
5224 xops[1] = probe_offset_value_rtx;
5225 output_asm_insn ("cmp\t%0, %1", xops);
5226
5227 /* Branch to end if not enough adjustment to probe. */
5228 fputs ("\tb.lt\t", asm_out_file);
5229 assemble_name_raw (asm_out_file, loop_end_lab);
5230 fputc ('\n', asm_out_file);
5231
5232 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5233 xops[0] = base;
5234 xops[1] = probe_offset_value_rtx;
5235 output_asm_insn ("sub\t%0, %0, %1", xops);
5236
5237 /* Probe at BASE. */
5238 xops[1] = const0_rtx;
5239 output_asm_insn ("str\txzr, [%0, %1]", xops);
5240
5241 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5242 xops[0] = adjustment;
5243 xops[1] = probe_offset_value_rtx;
5244 output_asm_insn ("sub\t%0, %0, %1", xops);
5245
5246 /* Branch to start if still more bytes to allocate. */
5247 fputs ("\tb\t", asm_out_file);
5248 assemble_name_raw (asm_out_file, loop_start_lab);
5249 fputc ('\n', asm_out_file);
5250
5251 /* No probe needed; leave the loop and apply the remaining adjustment. */
5252 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5253
5254 /* BASE = BASE - ADJUSTMENT. */
5255 xops[0] = base;
5256 xops[1] = adjustment;
5257 output_asm_insn ("sub\t%0, %0, %1", xops);
5258 return "";
5259 }
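
/* Illustrative only (assuming the default 64KB guard, for which the
   clamped residual guard works out to 61440 bytes, and assuming the
   caller passed EP0/x12 as the adjustment register and sp as BASE):

	.SVLPSPL0:
	cmp	x12, #61440		// enough left to need a probe?
	b.lt	.SVLPEND0
	sub	sp, sp, #61440		// drop one guard-sized block
	str	xzr, [sp, 0]		// probe it
	sub	x12, x12, #61440
	b	.SVLPSPL0
	.SVLPEND0:
	sub	sp, sp, x12  */		// allocate whatever is left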
5260
5261 /* Determine whether a frame chain needs to be generated. */
5262 static bool
5263 aarch64_needs_frame_chain (void)
5264 {
5265 /* Force a frame chain for EH returns so the return address is at FP+8. */
5266 if (frame_pointer_needed || crtl->calls_eh_return)
5267 return true;
5268
5269 /* A leaf function cannot have calls or write LR. */
5270 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5271
5272 /* Don't use a frame chain in leaf functions if leaf frame pointers
5273 are disabled. */
5274 if (flag_omit_leaf_frame_pointer && is_leaf)
5275 return false;
5276
5277 return aarch64_use_frame_pointer;
5278 }
5279
5280 /* Mark the registers that need to be saved by the callee and calculate
5281 the size of the callee-saved registers area and frame record (both FP
5282 and LR may be omitted). */
5283 static void
5284 aarch64_layout_frame (void)
5285 {
5286 HOST_WIDE_INT offset = 0;
5287 int regno, last_fp_reg = INVALID_REGNUM;
5288 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5289
5290 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5291
5292 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5293 the mid-end is doing. */
5294 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5295
5296 #define SLOT_NOT_REQUIRED (-2)
5297 #define SLOT_REQUIRED (-1)
5298
5299 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5300 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5301
5302 /* If this is a non-leaf simd function with calls we assume that
5303 at least one of those calls is to a non-simd function and thus
5304 we must save V8 to V23 in the prologue. */
5305
5306 if (simd_function && !crtl->is_leaf)
5307 {
5308 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5309 if (FP_SIMD_SAVED_REGNUM_P (regno))
5310 df_set_regs_ever_live (regno, true);
5311 }
5312
5313 /* First mark all the registers that really need to be saved... */
5314 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5315 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5316
5317 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5318 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5319
5320 /* ... that includes the eh data registers (if needed)... */
5321 if (crtl->calls_eh_return)
5322 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5323 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5324 = SLOT_REQUIRED;
5325
5326 /* ... and any callee saved register that dataflow says is live. */
5327 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5328 if (df_regs_ever_live_p (regno)
5329 && (regno == R30_REGNUM
5330 || !call_used_regs[regno]))
5331 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5332
5333 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5334 if (df_regs_ever_live_p (regno)
5335 && (!call_used_regs[regno]
5336 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5337 {
5338 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5339 last_fp_reg = regno;
5340 }
5341
5342 if (cfun->machine->frame.emit_frame_chain)
5343 {
5344 /* FP and LR are placed in the linkage record. */
5345 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5346 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5347 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5348 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5349 offset = 2 * UNITS_PER_WORD;
5350 }
5351
5352 /* With stack-clash, LR must be saved in non-leaf functions. */
5353 gcc_assert (crtl->is_leaf
5354 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5355 != SLOT_NOT_REQUIRED));
5356
5357 /* Now assign stack slots for them. */
5358 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5359 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5360 {
5361 cfun->machine->frame.reg_offset[regno] = offset;
5362 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5363 cfun->machine->frame.wb_candidate1 = regno;
5364 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5365 cfun->machine->frame.wb_candidate2 = regno;
5366 offset += UNITS_PER_WORD;
5367 }
5368
5369 HOST_WIDE_INT max_int_offset = offset;
5370 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5371 bool has_align_gap = offset != max_int_offset;
5372
5373 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5374 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5375 {
5376 /* If there is an alignment gap between integer and fp callee-saves,
5377 allocate the last fp register to it if possible. */
5378 if (regno == last_fp_reg
5379 && has_align_gap
5380 && !simd_function
5381 && (offset & 8) == 0)
5382 {
5383 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5384 break;
5385 }
5386
5387 cfun->machine->frame.reg_offset[regno] = offset;
5388 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5389 cfun->machine->frame.wb_candidate1 = regno;
5390 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5391 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5392 cfun->machine->frame.wb_candidate2 = regno;
5393 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5394 }
5395
5396 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5397
5398 cfun->machine->frame.saved_regs_size = offset;
5399
5400 HOST_WIDE_INT varargs_and_saved_regs_size
5401 = offset + cfun->machine->frame.saved_varargs_size;
5402
5403 cfun->machine->frame.hard_fp_offset
5404 = aligned_upper_bound (varargs_and_saved_regs_size
5405 + get_frame_size (),
5406 STACK_BOUNDARY / BITS_PER_UNIT);
5407
5408 /* Both these values are already aligned. */
5409 gcc_assert (multiple_p (crtl->outgoing_args_size,
5410 STACK_BOUNDARY / BITS_PER_UNIT));
5411 cfun->machine->frame.frame_size
5412 = (cfun->machine->frame.hard_fp_offset
5413 + crtl->outgoing_args_size);
5414
5415 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5416
5417 cfun->machine->frame.initial_adjust = 0;
5418 cfun->machine->frame.final_adjust = 0;
5419 cfun->machine->frame.callee_adjust = 0;
5420 cfun->machine->frame.callee_offset = 0;
5421
5422 HOST_WIDE_INT max_push_offset = 0;
5423 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5424 max_push_offset = 512;
5425 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5426 max_push_offset = 256;
5427
5428 HOST_WIDE_INT const_size, const_fp_offset;
5429 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5430 && const_size < max_push_offset
5431 && known_eq (crtl->outgoing_args_size, 0))
5432 {
5433 /* Simple, small frame with no outgoing arguments:
5434 stp reg1, reg2, [sp, -frame_size]!
5435 stp reg3, reg4, [sp, 16] */
5436 cfun->machine->frame.callee_adjust = const_size;
5437 }
5438 else if (known_lt (crtl->outgoing_args_size
5439 + cfun->machine->frame.saved_regs_size, 512)
5440 && !(cfun->calls_alloca
5441 && known_lt (cfun->machine->frame.hard_fp_offset,
5442 max_push_offset)))
5443 {
5444 /* Frame with small outgoing arguments:
5445 sub sp, sp, frame_size
5446 stp reg1, reg2, [sp, outgoing_args_size]
5447 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5448 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5449 cfun->machine->frame.callee_offset
5450 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5451 }
5452 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5453 && const_fp_offset < max_push_offset)
5454 {
5455 /* Frame with large outgoing arguments but a small local area:
5456 stp reg1, reg2, [sp, -hard_fp_offset]!
5457 stp reg3, reg4, [sp, 16]
5458 sub sp, sp, outgoing_args_size */
5459 cfun->machine->frame.callee_adjust = const_fp_offset;
5460 cfun->machine->frame.final_adjust
5461 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5462 }
5463 else
5464 {
5465 /* Frame with large local area and outgoing arguments using frame pointer:
5466 sub sp, sp, hard_fp_offset
5467 stp x29, x30, [sp, 0]
5468 add x29, sp, 0
5469 stp reg3, reg4, [sp, 16]
5470 sub sp, sp, outgoing_args_size */
5471 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5472 cfun->machine->frame.final_adjust
5473 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5474 }
5475
5476 cfun->machine->frame.laid_out = true;
5477 }
5478
5479 /* Return true if the register REGNO is saved on entry to
5480 the current function. */
5481
5482 static bool
5483 aarch64_register_saved_on_entry (int regno)
5484 {
5485 return cfun->machine->frame.reg_offset[regno] >= 0;
5486 }
5487
5488 /* Return the next register up from REGNO up to LIMIT for the callee
5489 to save. */
5490
5491 static unsigned
5492 aarch64_next_callee_save (unsigned regno, unsigned limit)
5493 {
5494 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5495 regno ++;
5496 return regno;
5497 }
5498
5499 /* Push the register number REGNO of mode MODE to the stack with write-back
5500 adjusting the stack by ADJUSTMENT. */
5501
5502 static void
5503 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5504 HOST_WIDE_INT adjustment)
5505 {
5506 rtx base_rtx = stack_pointer_rtx;
5507 rtx insn, reg, mem;
5508
5509 reg = gen_rtx_REG (mode, regno);
5510 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5511 plus_constant (Pmode, base_rtx, -adjustment));
5512 mem = gen_frame_mem (mode, mem);
5513
5514 insn = emit_move_insn (mem, reg);
5515 RTX_FRAME_RELATED_P (insn) = 1;
5516 }
5517
5518 /* Generate and return an instruction to store the pair of registers
5519 REG and REG2 of mode MODE to location BASE with write-back adjusting
5520 the stack location BASE by ADJUSTMENT. */
5521
5522 static rtx
5523 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5524 HOST_WIDE_INT adjustment)
5525 {
5526 switch (mode)
5527 {
5528 case E_DImode:
5529 return gen_storewb_pairdi_di (base, base, reg, reg2,
5530 GEN_INT (-adjustment),
5531 GEN_INT (UNITS_PER_WORD - adjustment));
5532 case E_DFmode:
5533 return gen_storewb_pairdf_di (base, base, reg, reg2,
5534 GEN_INT (-adjustment),
5535 GEN_INT (UNITS_PER_WORD - adjustment));
5536 case E_TFmode:
5537 return gen_storewb_pairtf_di (base, base, reg, reg2,
5538 GEN_INT (-adjustment),
5539 GEN_INT (UNITS_PER_VREG - adjustment));
5540 default:
5541 gcc_unreachable ();
5542 }
5543 }
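
/* For example (illustrative), with MODE == DImode, REG == x29, REG2 == x30,
   BASE == sp and ADJUSTMENT == 16, the pattern returned above typically
   assembles to the pre-indexed store pair

	stp	x29, x30, [sp, -16]!  */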
5544
5545 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5546 stack pointer by ADJUSTMENT. */
5547
5548 static void
5549 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5550 {
5551 rtx_insn *insn;
5552 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5553
5554 if (regno2 == INVALID_REGNUM)
5555 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5556
5557 rtx reg1 = gen_rtx_REG (mode, regno1);
5558 rtx reg2 = gen_rtx_REG (mode, regno2);
5559
5560 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5561 reg2, adjustment));
5562 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5563 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5564 RTX_FRAME_RELATED_P (insn) = 1;
5565 }
5566
5567 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5568 adjusting it by ADJUSTMENT afterwards. */
5569
5570 static rtx
5571 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5572 HOST_WIDE_INT adjustment)
5573 {
5574 switch (mode)
5575 {
5576 case E_DImode:
5577 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5578 GEN_INT (UNITS_PER_WORD));
5579 case E_DFmode:
5580 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5581 GEN_INT (UNITS_PER_WORD));
5582 case E_TFmode:
5583 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5584 GEN_INT (UNITS_PER_VREG));
5585 default:
5586 gcc_unreachable ();
5587 }
5588 }
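
/* For example (illustrative), with MODE == DImode, BASE == sp and
   ADJUSTMENT == 16 the pattern returned above typically assembles to the
   post-indexed load pair

	ldp	x29, x30, [sp], 16

   i.e. the inverse of the store-with-writeback used in the prologue.  */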
5589
5590 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5591 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5592 into CFI_OPS. */
5593
5594 static void
5595 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5596 rtx *cfi_ops)
5597 {
5598 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5599 rtx reg1 = gen_rtx_REG (mode, regno1);
5600
5601 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5602
5603 if (regno2 == INVALID_REGNUM)
5604 {
5605 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5606 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5607 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5608 }
5609 else
5610 {
5611 rtx reg2 = gen_rtx_REG (mode, regno2);
5612 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5613 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5614 reg2, adjustment));
5615 }
5616 }
5617
5618 /* Generate and return a store pair instruction of mode MODE to store
5619 register REG1 to MEM1 and register REG2 to MEM2. */
5620
5621 static rtx
5622 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5623 rtx reg2)
5624 {
5625 switch (mode)
5626 {
5627 case E_DImode:
5628 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5629
5630 case E_DFmode:
5631 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5632
5633 case E_TFmode:
5634 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5635
5636 default:
5637 gcc_unreachable ();
5638 }
5639 }
5640
5641 /* Generate and return a load pair instruction of mode MODE to load register
5642 REG1 from MEM1 and register REG2 from MEM2. */
5643
5644 static rtx
5645 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5646 rtx mem2)
5647 {
5648 switch (mode)
5649 {
5650 case E_DImode:
5651 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5652
5653 case E_DFmode:
5654 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5655
5656 case E_TFmode:
5657 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5658
5659 default:
5660 gcc_unreachable ();
5661 }
5662 }
5663
5664 /* Return TRUE if return address signing should be enabled for the current
5665 function, otherwise return FALSE. */
5666
5667 bool
5668 aarch64_return_address_signing_enabled (void)
5669 {
5670 /* This function should only be called after the frame has been laid out. */
5671 gcc_assert (cfun->machine->frame.laid_out);
5672
5673 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5674 if its LR is pushed onto stack. */
5675 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5676 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5677 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5678 }
5679
5680 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5681 bool
5682 aarch64_bti_enabled (void)
5683 {
5684 return (aarch64_enable_bti == 1);
5685 }
5686
5687 /* Emit code to save the callee-saved registers from register number START
5688 to LIMIT to the stack at the location starting at offset START_OFFSET,
5689 skipping any write-back candidates if SKIP_WB is true. */
5690
5691 static void
5692 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5693 unsigned start, unsigned limit, bool skip_wb)
5694 {
5695 rtx_insn *insn;
5696 unsigned regno;
5697 unsigned regno2;
5698
5699 for (regno = aarch64_next_callee_save (start, limit);
5700 regno <= limit;
5701 regno = aarch64_next_callee_save (regno + 1, limit))
5702 {
5703 rtx reg, mem;
5704 poly_int64 offset;
5705 int offset_diff;
5706
5707 if (skip_wb
5708 && (regno == cfun->machine->frame.wb_candidate1
5709 || regno == cfun->machine->frame.wb_candidate2))
5710 continue;
5711
5712 if (cfun->machine->reg_is_wrapped_separately[regno])
5713 continue;
5714
5715 reg = gen_rtx_REG (mode, regno);
5716 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5717 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5718 offset));
5719
5720 regno2 = aarch64_next_callee_save (regno + 1, limit);
5721 offset_diff = cfun->machine->frame.reg_offset[regno2]
5722 - cfun->machine->frame.reg_offset[regno];
5723
5724 if (regno2 <= limit
5725 && !cfun->machine->reg_is_wrapped_separately[regno2]
5726 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5727 {
5728 rtx reg2 = gen_rtx_REG (mode, regno2);
5729 rtx mem2;
5730
5731 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5732 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5733 offset));
5734 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5735 reg2));
5736
5737 /* The first part of a frame-related parallel insn is
5738 always assumed to be relevant to the frame
5739 calculations; subsequent parts are only
5740 frame-related if explicitly marked. */
5741 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5742 regno = regno2;
5743 }
5744 else
5745 insn = emit_move_insn (mem, reg);
5746
5747 RTX_FRAME_RELATED_P (insn) = 1;
5748 }
5749 }
5750
5751 /* Emit code to restore the callee registers of mode MODE from register
5752 number START up to and including LIMIT. Restore from the stack offset
5753 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5754 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5755
5756 static void
5757 aarch64_restore_callee_saves (machine_mode mode,
5758 poly_int64 start_offset, unsigned start,
5759 unsigned limit, bool skip_wb, rtx *cfi_ops)
5760 {
5761 rtx base_rtx = stack_pointer_rtx;
5762 unsigned regno;
5763 unsigned regno2;
5764 poly_int64 offset;
5765
5766 for (regno = aarch64_next_callee_save (start, limit);
5767 regno <= limit;
5768 regno = aarch64_next_callee_save (regno + 1, limit))
5769 {
5770 if (cfun->machine->reg_is_wrapped_separately[regno])
5771 continue;
5772
5773 rtx reg, mem;
5774 int offset_diff;
5775
5776 if (skip_wb
5777 && (regno == cfun->machine->frame.wb_candidate1
5778 || regno == cfun->machine->frame.wb_candidate2))
5779 continue;
5780
5781 reg = gen_rtx_REG (mode, regno);
5782 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5783 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5784
5785 regno2 = aarch64_next_callee_save (regno + 1, limit);
5786 offset_diff = cfun->machine->frame.reg_offset[regno2]
5787 - cfun->machine->frame.reg_offset[regno];
5788
5789 if (regno2 <= limit
5790 && !cfun->machine->reg_is_wrapped_separately[regno2]
5791 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5792 {
5793 rtx reg2 = gen_rtx_REG (mode, regno2);
5794 rtx mem2;
5795
5796 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5797 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5798 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5799
5800 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5801 regno = regno2;
5802 }
5803 else
5804 emit_move_insn (reg, mem);
5805 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5806 }
5807 }
5808
5809 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5810 of MODE. */
5811
5812 static inline bool
5813 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5814 {
5815 HOST_WIDE_INT multiple;
5816 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5817 && IN_RANGE (multiple, -8, 7));
5818 }
5819
5820 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5821 of MODE. */
5822
5823 static inline bool
5824 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5825 {
5826 HOST_WIDE_INT multiple;
5827 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5828 && IN_RANGE (multiple, 0, 63));
5829 }
5830
5831 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5832 of MODE. */
5833
5834 bool
5835 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5836 {
5837 HOST_WIDE_INT multiple;
5838 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5839 && IN_RANGE (multiple, -64, 63));
5840 }
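
/* For example, with DImode this accepts byte offsets that are multiples
   of 8 in the range [-512, 504], which corresponds to the scaled
   immediate range of LDP/STP for X registers.  */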
5841
5842 /* Return true if OFFSET is a signed 9-bit value. */
5843
5844 bool
5845 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5846 poly_int64 offset)
5847 {
5848 HOST_WIDE_INT const_offset;
5849 return (offset.is_constant (&const_offset)
5850 && IN_RANGE (const_offset, -256, 255));
5851 }
5852
5853 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5854 of MODE. */
5855
5856 static inline bool
5857 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5858 {
5859 HOST_WIDE_INT multiple;
5860 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5861 && IN_RANGE (multiple, -256, 255));
5862 }
5863
5864 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5865 of MODE. */
5866
5867 static inline bool
5868 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5869 {
5870 HOST_WIDE_INT multiple;
5871 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5872 && IN_RANGE (multiple, 0, 4095));
5873 }
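
/* For example, with DImode this accepts byte offsets that are multiples
   of 8 in the range [0, 32760], i.e. the unsigned scaled immediate range
   of a single LDR/STR of an X register.  */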
5874
5875 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5876
5877 static sbitmap
5878 aarch64_get_separate_components (void)
5879 {
5880 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5881 bitmap_clear (components);
5882
5883 /* The registers we need saved to the frame. */
5884 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5885 if (aarch64_register_saved_on_entry (regno))
5886 {
5887 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5888 if (!frame_pointer_needed)
5889 offset += cfun->machine->frame.frame_size
5890 - cfun->machine->frame.hard_fp_offset;
5891 /* Check that we can access the stack slot of the register with one
5892 direct load with no adjustments needed. */
5893 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5894 bitmap_set_bit (components, regno);
5895 }
5896
5897 /* Don't mess with the hard frame pointer. */
5898 if (frame_pointer_needed)
5899 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5900
5901 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5902 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5903 /* If registers have been chosen to be stored/restored with
5904 writeback don't interfere with them to avoid having to output explicit
5905 stack adjustment instructions. */
5906 if (reg2 != INVALID_REGNUM)
5907 bitmap_clear_bit (components, reg2);
5908 if (reg1 != INVALID_REGNUM)
5909 bitmap_clear_bit (components, reg1);
5910
5911 bitmap_clear_bit (components, LR_REGNUM);
5912 bitmap_clear_bit (components, SP_REGNUM);
5913
5914 return components;
5915 }
5916
5917 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5918
5919 static sbitmap
5920 aarch64_components_for_bb (basic_block bb)
5921 {
5922 bitmap in = DF_LIVE_IN (bb);
5923 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5924 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5925 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5926
5927 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5928 bitmap_clear (components);
5929
5930 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5931 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5932 if ((!call_used_regs[regno]
5933 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5934 && (bitmap_bit_p (in, regno)
5935 || bitmap_bit_p (gen, regno)
5936 || bitmap_bit_p (kill, regno)))
5937 {
5938 unsigned regno2, offset, offset2;
5939 bitmap_set_bit (components, regno);
5940
5941 /* If there is a callee-save at an adjacent offset, add it too
5942 to increase the use of LDP/STP. */
5943 offset = cfun->machine->frame.reg_offset[regno];
5944 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5945
5946 if (regno2 <= LAST_SAVED_REGNUM)
5947 {
5948 offset2 = cfun->machine->frame.reg_offset[regno2];
5949 if ((offset & ~8) == (offset2 & ~8))
5950 bitmap_set_bit (components, regno2);
5951 }
5952 }
5953
5954 return components;
5955 }
5956
5957 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5958 Nothing to do for aarch64. */
5959
5960 static void
5961 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5962 {
5963 }
5964
5965 /* Return the next set bit in BMP from START onwards. Return the total number
5966 of bits in BMP if no set bit is found at or after START. */
5967
5968 static unsigned int
5969 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5970 {
5971 unsigned int nbits = SBITMAP_SIZE (bmp);
5972 if (start == nbits)
5973 return start;
5974
5975 gcc_assert (start < nbits);
5976 for (unsigned int i = start; i < nbits; i++)
5977 if (bitmap_bit_p (bmp, i))
5978 return i;
5979
5980 return nbits;
5981 }
5982
5983 /* Do the work for aarch64_emit_prologue_components and
5984 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5985 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5986 for these components or the epilogue sequence. That is, it determines
5987 whether we should emit stores or loads and what kind of CFA notes to attach
5988 to the insns. Otherwise the logic for the two sequences is very
5989 similar. */
5990
5991 static void
5992 aarch64_process_components (sbitmap components, bool prologue_p)
5993 {
5994 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5995 ? HARD_FRAME_POINTER_REGNUM
5996 : STACK_POINTER_REGNUM);
5997
5998 unsigned last_regno = SBITMAP_SIZE (components);
5999 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6000 rtx_insn *insn = NULL;
6001
6002 while (regno != last_regno)
6003 {
6004 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6005 so DFmode for the vector registers is enough. For simd functions
6006 we want to save the low 128 bits. */
6007 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6008
6009 rtx reg = gen_rtx_REG (mode, regno);
6010 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6011 if (!frame_pointer_needed)
6012 offset += cfun->machine->frame.frame_size
6013 - cfun->machine->frame.hard_fp_offset;
6014 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6015 rtx mem = gen_frame_mem (mode, addr);
6016
6017 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6018 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6019 /* No more registers to handle after REGNO.
6020 Emit a single save/restore and exit. */
6021 if (regno2 == last_regno)
6022 {
6023 insn = emit_insn (set);
6024 RTX_FRAME_RELATED_P (insn) = 1;
6025 if (prologue_p)
6026 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6027 else
6028 add_reg_note (insn, REG_CFA_RESTORE, reg);
6029 break;
6030 }
6031
6032 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6033 /* The next register is not of the same class or its offset is not
6034 mergeable with the current one into a pair. */
6035 if (!satisfies_constraint_Ump (mem)
6036 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6037 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6038 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6039 GET_MODE_SIZE (mode)))
6040 {
6041 insn = emit_insn (set);
6042 RTX_FRAME_RELATED_P (insn) = 1;
6043 if (prologue_p)
6044 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6045 else
6046 add_reg_note (insn, REG_CFA_RESTORE, reg);
6047
6048 regno = regno2;
6049 continue;
6050 }
6051
6052 /* REGNO2 can be saved/restored in a pair with REGNO. */
6053 rtx reg2 = gen_rtx_REG (mode, regno2);
6054 if (!frame_pointer_needed)
6055 offset2 += cfun->machine->frame.frame_size
6056 - cfun->machine->frame.hard_fp_offset;
6057 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6058 rtx mem2 = gen_frame_mem (mode, addr2);
6059 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6060 : gen_rtx_SET (reg2, mem2);
6061
6062 if (prologue_p)
6063 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6064 else
6065 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6066
6067 RTX_FRAME_RELATED_P (insn) = 1;
6068 if (prologue_p)
6069 {
6070 add_reg_note (insn, REG_CFA_OFFSET, set);
6071 add_reg_note (insn, REG_CFA_OFFSET, set2);
6072 }
6073 else
6074 {
6075 add_reg_note (insn, REG_CFA_RESTORE, reg);
6076 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6077 }
6078
6079 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6080 }
6081 }
6082
6083 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6084
6085 static void
6086 aarch64_emit_prologue_components (sbitmap components)
6087 {
6088 aarch64_process_components (components, true);
6089 }
6090
6091 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6092
6093 static void
6094 aarch64_emit_epilogue_components (sbitmap components)
6095 {
6096 aarch64_process_components (components, false);
6097 }
6098
6099 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6100
6101 static void
6102 aarch64_set_handled_components (sbitmap components)
6103 {
6104 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6105 if (bitmap_bit_p (components, regno))
6106 cfun->machine->reg_is_wrapped_separately[regno] = true;
6107 }
6108
6109 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6110 determine the probe offset for alloca. */
6111
6112 static HOST_WIDE_INT
6113 aarch64_stack_clash_protection_alloca_probe_range (void)
6114 {
6115 return STACK_CLASH_CALLER_GUARD;
6116 }
6117
6118
6119 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6120 registers. If POLY_SIZE is not large enough to require a probe this function
6121 will only adjust the stack. When allocating the stack space
6122 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6123 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6124 arguments. If we are then we ensure that any allocation larger than the ABI
6125 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6126 maintained.
6127
6128 We emit barriers after each stack adjustment to prevent optimizations from
6129 breaking the invariant that we never drop the stack more than a page. This
6130 invariant is needed to make it easier to correctly handle asynchronous
6131 events: if we were to allow the stack to be dropped by more than a page
6132 and then emit multiple probes afterwards, a signal taken somewhere in
6133 between would leave the signal handler not knowing the state of the stack
6134 and unable to make any assumptions about which pages have been probed. */
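
/* As a concrete (and purely illustrative) sketch of the scheme above,
   assuming the default 64KB guard and 1KB caller guard, a constant
   initial allocation of 132KB would be emitted roughly as:

	sub	sp, sp, #65536
	str	xzr, [sp, 1024]		// probe the first page
	sub	sp, sp, #65536
	str	xzr, [sp, 1024]		// probe the second page
	sub	sp, sp, #4096		// residual; below the probing
					// threshold, so no probe needed

   with scheduling barriers between the adjustments.  Larger constant
   allocations fall back to the probe loop emitted by
   aarch64_output_probe_stack_range, and SVE frames use
   aarch64_output_probe_sve_stack_clash above.  */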
6135
6136 static void
6137 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6138 poly_int64 poly_size,
6139 bool frame_related_p,
6140 bool final_adjustment_p)
6141 {
6142 HOST_WIDE_INT guard_size
6143 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6144 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6145 /* When doing the final adjustment for the outgoing argument size we can't
6146 assume that LR was saved at position 0. So subtract its offset from the
6147 ABI safe buffer so that we don't accidentally allow an adjustment that
6148 would result in an allocation larger than the ABI buffer without
6149 probing. */
6150 HOST_WIDE_INT min_probe_threshold
6151 = final_adjustment_p
6152 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6153 : guard_size - guard_used_by_caller;
6154
6155 poly_int64 frame_size = cfun->machine->frame.frame_size;
6156
6157 /* We should always have a positive probe threshold. */
6158 gcc_assert (min_probe_threshold > 0);
6159
6160 if (flag_stack_clash_protection && !final_adjustment_p)
6161 {
6162 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6163 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6164
6165 if (known_eq (frame_size, 0))
6166 {
6167 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6168 }
6169 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6170 && known_lt (final_adjust, guard_used_by_caller))
6171 {
6172 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6173 }
6174 }
6175
6176 /* If SIZE is not large enough to require probing, just adjust the stack and
6177 exit. */
6178 if (known_lt (poly_size, min_probe_threshold)
6179 || !flag_stack_clash_protection)
6180 {
6181 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6182 return;
6183 }
6184
6185 HOST_WIDE_INT size;
6186 /* Handle the SVE non-constant case first. */
6187 if (!poly_size.is_constant (&size))
6188 {
6189 if (dump_file)
6190 {
6191 fprintf (dump_file, "Stack clash SVE prologue: ");
6192 print_dec (poly_size, dump_file);
6193 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6194 }
6195
6196 /* First calculate the number of bytes we're actually spilling. */
6197 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6198 poly_size, temp1, temp2, false, true);
6199
6200 rtx_insn *insn = get_last_insn ();
6201
6202 if (frame_related_p)
6203 {
6204 /* This is done to provide unwinding information for the stack
6205 adjustments we're about to do. However, to prevent the optimizers
6206 from removing the R11 move and leaving the CFA note (which would be
6207 very wrong), we tie the old and new stack pointer together.
6208 The tie will expand to nothing but the optimizers will not touch
6209 the instruction. */
6210 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6211 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6212 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6213
6214 /* We want the CFA independent of the stack pointer for the
6215 duration of the loop. */
6216 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6217 RTX_FRAME_RELATED_P (insn) = 1;
6218 }
6219
6220 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6221 rtx guard_const = gen_int_mode (guard_size, Pmode);
6222
6223 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6224 stack_pointer_rtx, temp1,
6225 probe_const, guard_const));
6226
6227 /* Now reset the CFA register if needed. */
6228 if (frame_related_p)
6229 {
6230 add_reg_note (insn, REG_CFA_DEF_CFA,
6231 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6232 gen_int_mode (poly_size, Pmode)));
6233 RTX_FRAME_RELATED_P (insn) = 1;
6234 }
6235
6236 return;
6237 }
6238
6239 if (dump_file)
6240 fprintf (dump_file,
6241 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6242 " bytes, probing will be required.\n", size);
6243
6244 /* Round size to the nearest multiple of guard_size, and calculate the
6245 residual as the difference between the original size and the rounded
6246 size. */
6247 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6248 HOST_WIDE_INT residual = size - rounded_size;
6249
6250 /* We can handle a small number of allocations/probes inline. Otherwise
6251 punt to a loop. */
6252 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6253 {
6254 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6255 {
6256 aarch64_sub_sp (NULL, temp2, guard_size, true);
6257 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6258 guard_used_by_caller));
6259 emit_insn (gen_blockage ());
6260 }
6261 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6262 }
6263 else
6264 {
6265 /* Compute the ending address. */
6266 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6267 temp1, NULL, false, true);
6268 rtx_insn *insn = get_last_insn ();
6269
6270 /* For the initial allocation, we don't have a frame pointer
6271 set up, so we always need CFI notes. If we're doing the
6272 final allocation, then we may have a frame pointer, in which
6273 case it is the CFA, otherwise we need CFI notes.
6274
6275 We can determine which allocation we are doing by looking at
6276 the value of FRAME_RELATED_P since the final allocations are not
6277 frame related. */
6278 if (frame_related_p)
6279 {
6280 /* We want the CFA independent of the stack pointer for the
6281 duration of the loop. */
6282 add_reg_note (insn, REG_CFA_DEF_CFA,
6283 plus_constant (Pmode, temp1, rounded_size));
6284 RTX_FRAME_RELATED_P (insn) = 1;
6285 }
6286
6287 /* This allocates and probes the stack. Note that this re-uses some of
6288 the existing Ada stack protection code. However we are guaranteed not
6289 to enter the non-loop or residual branches of that code.
6290
6291 The non-loop part won't be entered because if our allocation amount
6292 doesn't require a loop, the case above would handle it.
6293
6294 The residual amount won't be entered because TEMP1 is a multiple of
6295 the allocation size. The residual will always be 0. As such, the only
6296 part we are actually using from that code is the loop setup. The
6297 actual probing is done in aarch64_output_probe_stack_range. */
6298 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6299 stack_pointer_rtx, temp1));
6300
6301 /* Now reset the CFA register if needed. */
6302 if (frame_related_p)
6303 {
6304 add_reg_note (insn, REG_CFA_DEF_CFA,
6305 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6306 RTX_FRAME_RELATED_P (insn) = 1;
6307 }
6308
6309 emit_insn (gen_blockage ());
6310 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6311 }
6312
6313 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6314 be probed. This maintains the requirement that each page is probed at
6315 least once. For initial probing we probe only if the allocation is
6316 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6317 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6318 GUARD_SIZE. This ensures that for any allocation that is large enough to
6319 trigger a probe here, we'll have at least one, and if they're not large
6320 enough for this code to emit anything for them, the page would have been
6321 probed by the saving of FP/LR either by this function or any callees. If
6322 we don't have any callees then we won't have more stack adjustments and so
6323 are still safe. */
6324 if (residual)
6325 {
6326 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6327 /* If we're doing final adjustments, and we've done any full page
6328 allocations then any residual needs to be probed. */
6329 if (final_adjustment_p && rounded_size != 0)
6330 min_probe_threshold = 0;
6331 /* If doing a small final adjustment, we always probe at offset 0.
6332 This is done to avoid issues when LR is not at position 0 or when
6333 the final adjustment is smaller than the probing offset. */
6334 else if (final_adjustment_p && rounded_size == 0)
6335 residual_probe_offset = 0;
6336
6337 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6338 if (residual >= min_probe_threshold)
6339 {
6340 if (dump_file)
6341 fprintf (dump_file,
6342 "Stack clash AArch64 prologue residuals: "
6343 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6344 "\n", residual);
6345
6346 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6347 residual_probe_offset));
6348 emit_insn (gen_blockage ());
6349 }
6350 }
6351 }
6352
6353 /* Return 1 if the register is used by the epilogue. We need to say the
6354 return register is used, but only after epilogue generation is complete.
6355 Note that in the case of sibcalls, the values "used by the epilogue" are
6356 considered live at the start of the called function.
6357
6358 For SIMD functions we need to return 1 for FP registers that are saved and
6359 restored by a function but are not zero in call_used_regs. If we do not do
6360 this, optimizations may remove the restore of the register.  */
6361
6362 int
6363 aarch64_epilogue_uses (int regno)
6364 {
6365 if (epilogue_completed)
6366 {
6367 if (regno == LR_REGNUM)
6368 return 1;
6369 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6370 return 1;
6371 }
6372 return 0;
6373 }
6374
6375 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6376 is saved at BASE + OFFSET. */
6377
6378 static void
6379 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6380 rtx base, poly_int64 offset)
6381 {
6382 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6383 add_reg_note (insn, REG_CFA_EXPRESSION,
6384 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6385 }
6386
6387 /* AArch64 stack frames generated by this compiler look like:
6388
6389 +-------------------------------+
6390 | |
6391 | incoming stack arguments |
6392 | |
6393 +-------------------------------+
6394 | | <-- incoming stack pointer (aligned)
6395 | callee-allocated save area |
6396 | for register varargs |
6397 | |
6398 +-------------------------------+
6399 | local variables | <-- frame_pointer_rtx
6400 | |
6401 +-------------------------------+
6402 | padding | \
6403 +-------------------------------+ |
6404 | callee-saved registers | | frame.saved_regs_size
6405 +-------------------------------+ |
6406 | LR' | |
6407 +-------------------------------+ |
6408 | FP' | / <- hard_frame_pointer_rtx (aligned)
6409 +-------------------------------+
6410 | dynamic allocation |
6411 +-------------------------------+
6412 | padding |
6413 +-------------------------------+
6414 | outgoing stack arguments | <-- arg_pointer
6415 | |
6416 +-------------------------------+
6417 | | <-- stack_pointer_rtx (aligned)
6418
6419 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6420 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6421 unchanged.
6422
6423 By default for stack-clash we assume the guard is at least 64KB, but this
6424 value is configurable to either 4KB or 64KB. We also force the guard size to
6425 be the same as the probing interval and both values are kept in sync.
6426
6427 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6428 on the guard size) of stack space without probing.
6429
6430 When probing is needed, we emit a probe at the start of the prologue
6431 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6432
6433 We have to track how much space has been allocated and the only stores
6434 to the stack we track as implicit probes are the FP/LR stores.
6435
6436 For outgoing arguments we probe if the size is larger than 1KB, such that
6437 the ABI specified buffer is maintained for the next callee.
6438
6439 The following registers are reserved during frame layout and should not be
6440 used for any other purpose:
6441
6442 - r11: Used by stack clash protection when SVE is enabled.
6443 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6444 - r14 and r15: Used for speculation tracking.
6445 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6446 - r30(LR), r29(FP): Used by standard frame layout.
6447
6448 These registers must be avoided in frame layout related code unless the
6449 explicit intention is to interact with one of the features listed above. */
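
/* For instance (illustrative, not authoritative), a function that needs a
   frame chain, saves x19 and x20, and has a small constant frame with no
   outgoing arguments typically gets a prologue along the lines of:

	stp	x29, x30, [sp, -48]!	// push the frame record, allocate
	mov	x29, sp			// establish the frame chain
	stp	x19, x20, [sp, 16]	// remaining callee-saves

   with the matching epilogue restoring x19/x20 and popping the frame
   record with a post-indexed ldp before returning.  */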
6450
6451 /* Generate the prologue instructions for entry into a function.
6452 Establish the stack frame by decreasing the stack pointer with a
6453 properly calculated size and, if necessary, create a frame record
6454 filled with the values of LR and previous frame pointer. The
6455 current FP is also set up if it is in use. */
6456
6457 void
6458 aarch64_expand_prologue (void)
6459 {
6460 poly_int64 frame_size = cfun->machine->frame.frame_size;
6461 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6462 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6463 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6464 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6465 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6466 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6467 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6468 rtx_insn *insn;
6469
6470 /* Sign return address for functions. */
6471 if (aarch64_return_address_signing_enabled ())
6472 {
6473 switch (aarch64_ra_sign_key)
6474 {
6475 case AARCH64_KEY_A:
6476 insn = emit_insn (gen_paciasp ());
6477 break;
6478 case AARCH64_KEY_B:
6479 insn = emit_insn (gen_pacibsp ());
6480 break;
6481 default:
6482 gcc_unreachable ();
6483 }
6484 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6485 RTX_FRAME_RELATED_P (insn) = 1;
6486 }
6487
6488 if (flag_stack_usage_info)
6489 current_function_static_stack_size = constant_lower_bound (frame_size);
6490
6491 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6492 {
6493 if (crtl->is_leaf && !cfun->calls_alloca)
6494 {
6495 if (maybe_gt (frame_size, PROBE_INTERVAL)
6496 && maybe_gt (frame_size, get_stack_check_protect ()))
6497 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6498 (frame_size
6499 - get_stack_check_protect ()));
6500 }
6501 else if (maybe_gt (frame_size, 0))
6502 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6503 }
6504
6505 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6506 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6507
6508 /* In theory we should never have both an initial adjustment
6509 and a callee save adjustment. Verify that is the case since the
6510 code below does not handle it for -fstack-clash-protection. */
6511 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6512
6513 /* Will only probe if the initial adjustment is larger than the guard
6514 less the amount of the guard reserved for use by the caller's
6515 outgoing args. */
6516 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6517 true, false);
6518
6519 if (callee_adjust != 0)
6520 aarch64_push_regs (reg1, reg2, callee_adjust);
6521
6522 if (emit_frame_chain)
6523 {
6524 poly_int64 reg_offset = callee_adjust;
6525 if (callee_adjust == 0)
6526 {
6527 reg1 = R29_REGNUM;
6528 reg2 = R30_REGNUM;
6529 reg_offset = callee_offset;
6530 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6531 }
6532 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6533 stack_pointer_rtx, callee_offset,
6534 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6535 if (frame_pointer_needed && !frame_size.is_constant ())
6536 {
6537 /* Variable-sized frames need to describe the save slot
6538 address using DW_CFA_expression rather than DW_CFA_offset.
6539 This means that, without taking further action, the
6540 locations of the registers that we've already saved would
6541 remain based on the stack pointer even after we redefine
6542 the CFA based on the frame pointer. We therefore need new
6543 DW_CFA_expressions to re-express the save slots with addresses
6544 based on the frame pointer. */
6545 rtx_insn *insn = get_last_insn ();
6546 gcc_assert (RTX_FRAME_RELATED_P (insn));
6547
6548 /* Add an explicit CFA definition if this was previously
6549 implicit. */
6550 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6551 {
6552 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6553 callee_offset);
6554 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6555 gen_rtx_SET (hard_frame_pointer_rtx, src));
6556 }
6557
6558 /* Change the save slot expressions for the registers that
6559 we've already saved. */
6560 reg_offset -= callee_offset;
6561 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6562 reg_offset + UNITS_PER_WORD);
6563 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6564 reg_offset);
6565 }
6566 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6567 }
6568
6569 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6570 callee_adjust != 0 || emit_frame_chain);
6571 if (aarch64_simd_decl_p (cfun->decl))
6572 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6573 callee_adjust != 0 || emit_frame_chain);
6574 else
6575 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6576 callee_adjust != 0 || emit_frame_chain);
6577
6578 /* We may need to probe the final adjustment if it is larger than the guard
6579 that is assumed by the callee. */
6580 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6581 !frame_pointer_needed, true);
6582 }
6583
6584 /* Return TRUE if we can use a simple_return insn.
6585
6586 This function checks whether the callee saved stack is empty, which
6587 means no restore actions are needed. The pro_and_epilogue pass will use
6588 this to check whether the shrink-wrapping optimization is feasible. */
6589
6590 bool
6591 aarch64_use_return_insn_p (void)
6592 {
6593 if (!reload_completed)
6594 return false;
6595
6596 if (crtl->profile)
6597 return false;
6598
6599 return known_eq (cfun->machine->frame.frame_size, 0);
6600 }
6601
6602 /* Return false for non-leaf SIMD functions in order to avoid
6603 shrink-wrapping them, since shrink-wrapping would lose the necessary
6604 save/restore of FP registers. */
6605
6606 bool
6607 aarch64_use_simple_return_insn_p (void)
6608 {
6609 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6610 return false;
6611
6612 return true;
6613 }
6614
6615 /* Generate the epilogue instructions for returning from a function.
6616 This is almost exactly the reverse of the prolog sequence, except
6617 that we need to insert barriers to avoid scheduling loads that read
6618 from a deallocated stack, and we optimize the unwind records by
6619 emitting them all together if possible. */
6620 void
6621 aarch64_expand_epilogue (bool for_sibcall)
6622 {
6623 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6624 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6625 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6626 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6627 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6628 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6629 rtx cfi_ops = NULL;
6630 rtx_insn *insn;
6631 /* A stack clash protection prologue may not have left EP0_REGNUM or
6632 EP1_REGNUM in a usable state. The same is true for allocations
6633 with an SVE component, since we then need both temporary registers
6634 for each allocation. For stack clash we are in a usable state if
6635 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6636 HOST_WIDE_INT guard_size
6637 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6638 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6639
6640 /* We can re-use the registers when the allocation amount is smaller than
6641 guard_size - guard_used_by_caller because we won't be doing any probes
6642 then. In such situations the register should remain live with the correct
6643 value. */
6644 bool can_inherit_p = (initial_adjust.is_constant ()
6645 && final_adjust.is_constant ())
6646 && (!flag_stack_clash_protection
6647 || known_lt (initial_adjust,
6648 guard_size - guard_used_by_caller));
6649
6650 /* We need to add memory barrier to prevent read from deallocated stack. */
6651 bool need_barrier_p
6652 = maybe_ne (get_frame_size ()
6653 + cfun->machine->frame.saved_varargs_size, 0);
6654
6655 /* Emit a barrier to prevent loads from a deallocated stack. */
6656 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6657 || cfun->calls_alloca
6658 || crtl->calls_eh_return)
6659 {
6660 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6661 need_barrier_p = false;
6662 }
6663
6664 /* Restore the stack pointer from the frame pointer if it may not
6665 be the same as the stack pointer. */
6666 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6667 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6668 if (frame_pointer_needed
6669 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6670 /* If writeback is used when restoring callee-saves, the CFA
6671 is restored on the instruction doing the writeback. */
6672 aarch64_add_offset (Pmode, stack_pointer_rtx,
6673 hard_frame_pointer_rtx, -callee_offset,
6674 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6675 else
6676 /* The case where we need to re-use the register here is very rare, so
6677 avoid the complicated condition and just always emit a move if the
6678 immediate doesn't fit. */
6679 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6680
6681 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6682 callee_adjust != 0, &cfi_ops);
6683 if (aarch64_simd_decl_p (cfun->decl))
6684 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6685 callee_adjust != 0, &cfi_ops);
6686 else
6687 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6688 callee_adjust != 0, &cfi_ops);
6689
6690 if (need_barrier_p)
6691 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6692
6693 if (callee_adjust != 0)
6694 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6695
6696 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6697 {
6698 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6699 insn = get_last_insn ();
6700 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6701 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6702 RTX_FRAME_RELATED_P (insn) = 1;
6703 cfi_ops = NULL;
6704 }
6705
6706 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6707 restrict the emit_move optimization to leaf functions. */
6708 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6709 (!can_inherit_p || !crtl->is_leaf
6710 || df_regs_ever_live_p (EP0_REGNUM)));
6711
6712 if (cfi_ops)
6713 {
6714 /* Emit delayed restores and reset the CFA to be SP. */
6715 insn = get_last_insn ();
6716 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6717 REG_NOTES (insn) = cfi_ops;
6718 RTX_FRAME_RELATED_P (insn) = 1;
6719 }
6720
6721 /* We prefer to emit the combined return/authenticate instruction RETAA,
6722 however there are three cases in which we must instead emit an explicit
6723 authentication instruction.
6724
6725 1) Sibcalls don't return in a normal way, so if we're about to call one
6726 we must authenticate.
6727
6728 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6729 generating code for !TARGET_ARMV8_3 we can't use it and must
6730 explicitly authenticate.
6731
6732 3) On an eh_return path we make extra stack adjustments to update the
6733 canonical frame address to be the exception handler's CFA. We want
6734 to authenticate using the CFA of the function which calls eh_return.
6735 */
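
/* In other words (illustrative), in those cases the epilogue ends with

	autiasp			// or autibsp when signing with the B key
	ret

   instead of folding both operations into a single retaa/retab.  */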
6736 if (aarch64_return_address_signing_enabled ()
6737 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6738 {
6739 switch (aarch64_ra_sign_key)
6740 {
6741 case AARCH64_KEY_A:
6742 insn = emit_insn (gen_autiasp ());
6743 break;
6744 case AARCH64_KEY_B:
6745 insn = emit_insn (gen_autibsp ());
6746 break;
6747 default:
6748 gcc_unreachable ();
6749 }
6750 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6751 RTX_FRAME_RELATED_P (insn) = 1;
6752 }
6753
6754 /* Stack adjustment for exception handler. */
6755 if (crtl->calls_eh_return && !for_sibcall)
6756 {
6757 /* We need to unwind the stack by the offset computed by
6758 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6759 to be SP; letting the CFA move during this adjustment
6760 is just as correct as retaining the CFA from the body
6761 of the function. Therefore, do nothing special. */
6762 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6763 }
6764
6765 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6766 if (!for_sibcall)
6767 emit_jump_insn (ret_rtx);
6768 }
6769
6770 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6771 normally or return to a previous frame after unwinding.
6772
6773 An EH return uses a single shared return sequence. The epilogue is
6774 exactly like a normal epilogue except that it has an extra input
6775 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6776 that must be applied after the frame has been destroyed. An extra label
6777 is inserted before the epilogue which initializes this register to zero,
6778 and this is the entry point for a normal return.
6779
6780 An actual EH return updates the return address, initializes the stack
6781 adjustment and jumps directly into the epilogue (bypassing the zeroing
6782 of the adjustment). Since the return address is typically saved on the
6783 stack when a function makes a call, the saved LR must be updated outside
6784 the epilogue.
6785
6786 This poses problems as the store is generated well before the epilogue,
6787 so the offset of LR is not known yet. Also, optimizations will remove the
6788 store as it appears dead, even after the epilogue is generated (as the
6789 base or offset for loading LR is different in many cases).
6790
6791 To avoid these problems this implementation forces the frame pointer
6792 in eh_return functions so that the location of LR is fixed and known early.
6793 It also marks the store volatile, so no optimization is permitted to
6794 remove the store. */
6795 rtx
6796 aarch64_eh_return_handler_rtx (void)
6797 {
6798 rtx tmp = gen_frame_mem (Pmode,
6799 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6800
6801 /* Mark the store volatile, so no optimization is permitted to remove it. */
6802 MEM_VOLATILE_P (tmp) = true;
6803 return tmp;
6804 }
6805
6806 /* Output code to add DELTA to the first argument, and then jump
6807 to FUNCTION. Used for C++ multiple inheritance. */
6808 static void
6809 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6810 HOST_WIDE_INT delta,
6811 HOST_WIDE_INT vcall_offset,
6812 tree function)
6813 {
6814 /* The this pointer is always in x0. Note that this differs from
6815 Arm where the this pointer may be bumped to r1 if r0 is required
6816 to return a pointer to an aggregate. On AArch64 a result value
6817 pointer will be in x8. */
6818 int this_regno = R0_REGNUM;
6819 rtx this_rtx, temp0, temp1, addr, funexp;
6820 rtx_insn *insn;
6821 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6822
6823 if (aarch64_bti_enabled ())
6824 emit_insn (gen_bti_c());
6825
6826 reload_completed = 1;
6827 emit_note (NOTE_INSN_PROLOGUE_END);
6828
6829 this_rtx = gen_rtx_REG (Pmode, this_regno);
6830 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6831 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6832
6833 if (vcall_offset == 0)
6834 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6835 else
6836 {
6837 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6838
6839 addr = this_rtx;
6840 if (delta != 0)
6841 {
6842 if (delta >= -256 && delta < 256)
6843 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6844 plus_constant (Pmode, this_rtx, delta));
6845 else
6846 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6847 temp1, temp0, false);
6848 }
6849
6850 if (Pmode == ptr_mode)
6851 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6852 else
6853 aarch64_emit_move (temp0,
6854 gen_rtx_ZERO_EXTEND (Pmode,
6855 gen_rtx_MEM (ptr_mode, addr)));
6856
6857 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6858 addr = plus_constant (Pmode, temp0, vcall_offset);
6859 else
6860 {
6861 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6862 Pmode);
6863 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6864 }
6865
6866 if (Pmode == ptr_mode)
6867 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6868 else
6869 aarch64_emit_move (temp1,
6870 gen_rtx_SIGN_EXTEND (Pmode,
6871 gen_rtx_MEM (ptr_mode, addr)));
6872
6873 emit_insn (gen_add2_insn (this_rtx, temp1));
6874 }
6875
6876 /* Generate a tail call to the target function. */
6877 if (!TREE_USED (function))
6878 {
6879 assemble_external (function);
6880 TREE_USED (function) = 1;
6881 }
6882 funexp = XEXP (DECL_RTL (function), 0);
6883 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6884 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6885 SIBLING_CALL_P (insn) = 1;
6886
6887 insn = get_insns ();
6888 shorten_branches (insn);
6889
6890 assemble_start_function (thunk, fnname);
6891 final_start_function (insn, file, 1);
6892 final (insn, file, 1);
6893 final_end_function ();
6894 assemble_end_function (thunk, fnname);
6895
6896 /* Stop pretending to be a post-reload pass. */
6897 reload_completed = 0;
6898 }
6899
6900 static bool
6901 aarch64_tls_referenced_p (rtx x)
6902 {
6903 if (!TARGET_HAVE_TLS)
6904 return false;
6905 subrtx_iterator::array_type array;
6906 FOR_EACH_SUBRTX (iter, array, x, ALL)
6907 {
6908 const_rtx x = *iter;
6909 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6910 return true;
6911 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6912 TLS offsets, not real symbol references. */
6913 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6914 iter.skip_subrtxes ();
6915 }
6916 return false;
6917 }
6918
6919
6920 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6921 a left shift of 0 or 12 bits. */
6922 bool
6923 aarch64_uimm12_shift (HOST_WIDE_INT val)
6924 {
6925 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6926 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6927 );
6928 }
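
/* Worked example (illustrative values): 0x7a8 is accepted with a shift of 0
   and 0x7a8000 with a shift of 12, whereas 0x12345 is rejected because its
   set bits straddle both 12-bit fields. */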
6929
6930 /* Return the nearest value no greater than VAL that can be encoded as a
6931 12-bit unsigned immediate with a left shift of 0 or 12 bits. */
6932 static HOST_WIDE_INT
6933 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6934 {
6935 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6936 handle correctly. */
6937 gcc_assert ((val & 0xffffff) == val);
6938
6939 if (((val & 0xfff) << 0) == val)
6940 return val;
6941
6942 return val & (0xfff << 12);
6943 }
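
/* Worked example (illustrative value): for VAL = 0x12345 the shift-0 form
   does not match (0x12345 & 0xfff == 0x345), so the function returns
   0x12345 & 0xfff000 == 0x12000, the nearest encodable value not above VAL. */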
6944
6945 /* Return true if val is an immediate that can be loaded into a
6946 register by a MOVZ instruction. */
6947 static bool
6948 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6949 {
6950 if (GET_MODE_SIZE (mode) > 4)
6951 {
6952 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6953 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6954 return 1;
6955 }
6956 else
6957 {
6958 /* Ignore sign extension. */
6959 val &= (HOST_WIDE_INT) 0xffffffff;
6960 }
6961 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6962 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6963 }
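
/* Worked examples (illustrative values): for DImode, 0xabcd00000000 is
   accepted because it is 0xabcd << 32; for SImode, 0x12340000 is accepted
   (0x1234 << 16), while 0x123456 is rejected because its set bits do not
   fit within a single 16-bit field at shift 0, 16, 32 or 48. */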
6964
6965 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6966 64-bit (DImode) integer. */
6967
6968 static unsigned HOST_WIDE_INT
6969 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6970 {
6971 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6972 while (size < 64)
6973 {
6974 val &= (HOST_WIDE_INT_1U << size) - 1;
6975 val |= val << size;
6976 size *= 2;
6977 }
6978 return val;
6979 }
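
/* Worked examples (illustrative values): with 8-bit elements, 0xc3 becomes
   0xc3c3c3c3c3c3c3c3; with 32-bit elements, 0x0000ffff becomes
   0x0000ffff0000ffff. */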
6980
6981 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6982
6983 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6984 {
6985 0x0000000100000001ull,
6986 0x0001000100010001ull,
6987 0x0101010101010101ull,
6988 0x1111111111111111ull,
6989 0x5555555555555555ull,
6990 };
6991
6992
6993 /* Return true if val is a valid bitmask immediate. */
6994
6995 bool
6996 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6997 {
6998 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6999 int bits;
7000
7001 /* Check for a single sequence of one bits and return quickly if so.
7002 The special cases of all ones and all zeroes return false. */
7003 val = aarch64_replicate_bitmask_imm (val_in, mode);
7004 tmp = val + (val & -val);
7005
7006 if (tmp == (tmp & -tmp))
7007 return (val + 1) > 1;
7008
7009 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7010 if (mode == SImode)
7011 val = (val << 32) | (val & 0xffffffff);
7012
7013 /* Invert if the immediate doesn't start with a zero bit - this means we
7014 only need to search for sequences of one bits. */
7015 if (val & 1)
7016 val = ~val;
7017
7018 /* Find the first set bit and set tmp to val with the first sequence of one
7019 bits removed. Return success if there is a single sequence of ones. */
7020 first_one = val & -val;
7021 tmp = val & (val + first_one);
7022
7023 if (tmp == 0)
7024 return true;
7025
7026 /* Find the next set bit and compute the difference in bit position. */
7027 next_one = tmp & -tmp;
7028 bits = clz_hwi (first_one) - clz_hwi (next_one);
7029 mask = val ^ tmp;
7030
7031 /* Check the bit position difference is a power of 2, and that the first
7032 sequence of one bits fits within 'bits' bits. */
7033 if ((mask >> bits) != 0 || bits != (bits & -bits))
7034 return false;
7035
7036 /* Check the sequence of one bits is repeated 64/bits times. */
7037 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7038 }
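
/* Worked example (illustrative value): 0x0ff00ff00ff00ff0 in DImode is the
   16-bit pattern 0x0ff0 (a contiguous run of eight ones) repeated four
   times. The function above finds first_one == 1 << 4 and next_one == 1 << 20,
   giving bits == 16 and mask == 0xff0, and the final check confirms
   0xff0 * bitmask_imm_mul[1] == 0x0ff00ff00ff00ff0, so the value is a
   valid bitmask immediate. */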
7039
7040 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
7041 Assumed precondition: VAL_IN is not zero. */
7042
7043 unsigned HOST_WIDE_INT
7044 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7045 {
7046 int lowest_bit_set = ctz_hwi (val_in);
7047 int highest_bit_set = floor_log2 (val_in);
7048 gcc_assert (val_in != 0);
7049
7050 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7051 (HOST_WIDE_INT_1U << lowest_bit_set));
7052 }
7053
7054 /* Create a constant in which all bits outside the range from the lowest
7055 set bit to the highest set bit of VAL_IN are set to 1. */
7056
7057 unsigned HOST_WIDE_INT
7058 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7059 {
7060 return val_in | ~aarch64_and_split_imm1 (val_in);
7061 }
7062
7063 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7064
7065 bool
7066 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7067 {
7068 scalar_int_mode int_mode;
7069 if (!is_a <scalar_int_mode> (mode, &int_mode))
7070 return false;
7071
7072 if (aarch64_bitmask_imm (val_in, int_mode))
7073 return false;
7074
7075 if (aarch64_move_imm (val_in, int_mode))
7076 return false;
7077
7078 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7079
7080 return aarch64_bitmask_imm (imm2, int_mode);
7081 }
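
/* Worked example (illustrative value): 0x0ff000f0 in SImode is neither a
   valid bitmask immediate nor a MOV immediate, but it can be split into the
   two SImode bitmask immediates 0x0ffffff0 (from aarch64_and_split_imm1)
   and 0xfff000ff (the low 32 bits of aarch64_and_split_imm2), whose
   intersection is the original value, so the AND can be done with two
   AND-immediate instructions. */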
7082
7083 /* Return true if val is an immediate that can be loaded into a
7084 register in a single instruction. */
7085 bool
7086 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7087 {
7088 scalar_int_mode int_mode;
7089 if (!is_a <scalar_int_mode> (mode, &int_mode))
7090 return false;
7091
7092 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7093 return 1;
7094 return aarch64_bitmask_imm (val, int_mode);
7095 }
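
/* Worked example (illustrative value): 0xffff0000ffffffff is accepted for
   DImode because its bitwise NOT, 0x0000ffff00000000, equals 0xffff << 32
   and is therefore MOVN-encodable. */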
7096
7097 static bool
7098 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7099 {
7100 rtx base, offset;
7101
7102 if (GET_CODE (x) == HIGH)
7103 return true;
7104
7105 /* There's no way to calculate VL-based values using relocations. */
7106 subrtx_iterator::array_type array;
7107 FOR_EACH_SUBRTX (iter, array, x, ALL)
7108 if (GET_CODE (*iter) == CONST_POLY_INT)
7109 return true;
7110
7111 split_const (x, &base, &offset);
7112 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7113 {
7114 if (aarch64_classify_symbol (base, INTVAL (offset))
7115 != SYMBOL_FORCE_TO_MEM)
7116 return true;
7117 else
7118 /* Avoid generating a 64-bit relocation in ILP32; leave
7119 to aarch64_expand_mov_immediate to handle it properly. */
7120 return mode != ptr_mode;
7121 }
7122
7123 return aarch64_tls_referenced_p (x);
7124 }
7125
7126 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7127 The expansion of a table switch is quite expensive due to the number
7128 of instructions, the table lookup and the hard-to-predict indirect jump.
7129 When optimizing for speed at -O3 and above, use the per-core tuning if
7130 it is set; otherwise use tables for more than 16 cases as a tradeoff
7131 between size and performance. When optimizing for size, use the default setting. */
7132
7133 static unsigned int
7134 aarch64_case_values_threshold (void)
7135 {
7136 /* Use the specified limit for the number of cases before using jump
7137 tables at higher optimization levels. */
7138 if (optimize > 2
7139 && selected_cpu->tune->max_case_values != 0)
7140 return selected_cpu->tune->max_case_values;
7141 else
7142 return optimize_size ? default_case_values_threshold () : 17;
7143 }
7144
7145 /* Return true if register REGNO is a valid index register.
7146 STRICT_P is true if REG_OK_STRICT is in effect. */
7147
7148 bool
7149 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7150 {
7151 if (!HARD_REGISTER_NUM_P (regno))
7152 {
7153 if (!strict_p)
7154 return true;
7155
7156 if (!reg_renumber)
7157 return false;
7158
7159 regno = reg_renumber[regno];
7160 }
7161 return GP_REGNUM_P (regno);
7162 }
7163
7164 /* Return true if register REGNO is a valid base register.
7165 STRICT_P is true if REG_OK_STRICT is in effect. */
7166
7167 bool
7168 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7169 {
7170 if (!HARD_REGISTER_NUM_P (regno))
7171 {
7172 if (!strict_p)
7173 return true;
7174
7175 if (!reg_renumber)
7176 return false;
7177
7178 regno = reg_renumber[regno];
7179 }
7180
7181 /* The fake registers will be eliminated to either the stack or
7182 hard frame pointer, both of which are usually valid base registers.
7183 Reload deals with the cases where the eliminated form isn't valid. */
7184 return (GP_REGNUM_P (regno)
7185 || regno == SP_REGNUM
7186 || regno == FRAME_POINTER_REGNUM
7187 || regno == ARG_POINTER_REGNUM);
7188 }
7189
7190 /* Return true if X is a valid base register.
7191 STRICT_P is true if REG_OK_STRICT is in effect. */
7192
7193 static bool
7194 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7195 {
7196 if (!strict_p
7197 && GET_CODE (x) == SUBREG
7198 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7199 x = SUBREG_REG (x);
7200
7201 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7202 }
7203
7204 /* Return true if address offset is a valid index. If it is, fill in INFO
7205 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7206
7207 static bool
7208 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7209 machine_mode mode, bool strict_p)
7210 {
7211 enum aarch64_address_type type;
7212 rtx index;
7213 int shift;
7214
7215 /* (reg:P) */
7216 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7217 && GET_MODE (x) == Pmode)
7218 {
7219 type = ADDRESS_REG_REG;
7220 index = x;
7221 shift = 0;
7222 }
7223 /* (sign_extend:DI (reg:SI)) */
7224 else if ((GET_CODE (x) == SIGN_EXTEND
7225 || GET_CODE (x) == ZERO_EXTEND)
7226 && GET_MODE (x) == DImode
7227 && GET_MODE (XEXP (x, 0)) == SImode)
7228 {
7229 type = (GET_CODE (x) == SIGN_EXTEND)
7230 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7231 index = XEXP (x, 0);
7232 shift = 0;
7233 }
7234 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7235 else if (GET_CODE (x) == MULT
7236 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7237 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7238 && GET_MODE (XEXP (x, 0)) == DImode
7239 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7240 && CONST_INT_P (XEXP (x, 1)))
7241 {
7242 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7243 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7244 index = XEXP (XEXP (x, 0), 0);
7245 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7246 }
7247 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7248 else if (GET_CODE (x) == ASHIFT
7249 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7250 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7251 && GET_MODE (XEXP (x, 0)) == DImode
7252 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7253 && CONST_INT_P (XEXP (x, 1)))
7254 {
7255 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7256 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7257 index = XEXP (XEXP (x, 0), 0);
7258 shift = INTVAL (XEXP (x, 1));
7259 }
7260 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7261 else if ((GET_CODE (x) == SIGN_EXTRACT
7262 || GET_CODE (x) == ZERO_EXTRACT)
7263 && GET_MODE (x) == DImode
7264 && GET_CODE (XEXP (x, 0)) == MULT
7265 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7266 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7267 {
7268 type = (GET_CODE (x) == SIGN_EXTRACT)
7269 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7270 index = XEXP (XEXP (x, 0), 0);
7271 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7272 if (INTVAL (XEXP (x, 1)) != 32 + shift
7273 || INTVAL (XEXP (x, 2)) != 0)
7274 shift = -1;
7275 }
7276 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7277 (const_int 0xffffffff<<shift)) */
7278 else if (GET_CODE (x) == AND
7279 && GET_MODE (x) == DImode
7280 && GET_CODE (XEXP (x, 0)) == MULT
7281 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7282 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7283 && CONST_INT_P (XEXP (x, 1)))
7284 {
7285 type = ADDRESS_REG_UXTW;
7286 index = XEXP (XEXP (x, 0), 0);
7287 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7288 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7289 shift = -1;
7290 }
7291 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7292 else if ((GET_CODE (x) == SIGN_EXTRACT
7293 || GET_CODE (x) == ZERO_EXTRACT)
7294 && GET_MODE (x) == DImode
7295 && GET_CODE (XEXP (x, 0)) == ASHIFT
7296 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7297 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7298 {
7299 type = (GET_CODE (x) == SIGN_EXTRACT)
7300 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7301 index = XEXP (XEXP (x, 0), 0);
7302 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7303 if (INTVAL (XEXP (x, 1)) != 32 + shift
7304 || INTVAL (XEXP (x, 2)) != 0)
7305 shift = -1;
7306 }
7307 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7308 (const_int 0xffffffff<<shift)) */
7309 else if (GET_CODE (x) == AND
7310 && GET_MODE (x) == DImode
7311 && GET_CODE (XEXP (x, 0)) == ASHIFT
7312 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7313 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7314 && CONST_INT_P (XEXP (x, 1)))
7315 {
7316 type = ADDRESS_REG_UXTW;
7317 index = XEXP (XEXP (x, 0), 0);
7318 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7319 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7320 shift = -1;
7321 }
7322 /* (mult:P (reg:P) (const_int scale)) */
7323 else if (GET_CODE (x) == MULT
7324 && GET_MODE (x) == Pmode
7325 && GET_MODE (XEXP (x, 0)) == Pmode
7326 && CONST_INT_P (XEXP (x, 1)))
7327 {
7328 type = ADDRESS_REG_REG;
7329 index = XEXP (x, 0);
7330 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7331 }
7332 /* (ashift:P (reg:P) (const_int shift)) */
7333 else if (GET_CODE (x) == ASHIFT
7334 && GET_MODE (x) == Pmode
7335 && GET_MODE (XEXP (x, 0)) == Pmode
7336 && CONST_INT_P (XEXP (x, 1)))
7337 {
7338 type = ADDRESS_REG_REG;
7339 index = XEXP (x, 0);
7340 shift = INTVAL (XEXP (x, 1));
7341 }
7342 else
7343 return false;
7344
7345 if (!strict_p
7346 && GET_CODE (index) == SUBREG
7347 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7348 index = SUBREG_REG (index);
7349
7350 if (aarch64_sve_data_mode_p (mode))
7351 {
7352 if (type != ADDRESS_REG_REG
7353 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7354 return false;
7355 }
7356 else
7357 {
7358 if (shift != 0
7359 && !(IN_RANGE (shift, 1, 3)
7360 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7361 return false;
7362 }
7363
7364 if (REG_P (index)
7365 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7366 {
7367 info->type = type;
7368 info->offset = index;
7369 info->shift = shift;
7370 return true;
7371 }
7372
7373 return false;
7374 }
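
/* Worked example (illustrative RTL): for a DImode access, the index
   (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8)) is classified as
   ADDRESS_REG_SXTW with shift 3, corresponding to an address operand of
   the form [xN, w1, sxtw #3] once a base register is added. */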
7375
7376 /* Return true if MODE is one of the modes for which we
7377 support LDP/STP operations. */
7378
7379 static bool
7380 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7381 {
7382 return mode == SImode || mode == DImode
7383 || mode == SFmode || mode == DFmode
7384 || (aarch64_vector_mode_supported_p (mode)
7385 && (known_eq (GET_MODE_SIZE (mode), 8)
7386 || (known_eq (GET_MODE_SIZE (mode), 16)
7387 && (aarch64_tune_params.extra_tuning_flags
7388 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7389 }
7390
7391 /* Return true if REGNO is a virtual pointer register, or an eliminable
7392 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7393 include stack_pointer or hard_frame_pointer. */
7394 static bool
7395 virt_or_elim_regno_p (unsigned regno)
7396 {
7397 return ((regno >= FIRST_VIRTUAL_REGISTER
7398 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7399 || regno == FRAME_POINTER_REGNUM
7400 || regno == ARG_POINTER_REGNUM);
7401 }
7402
7403 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7404 If it is, fill in INFO appropriately. STRICT_P is true if
7405 REG_OK_STRICT is in effect. */
7406
7407 bool
7408 aarch64_classify_address (struct aarch64_address_info *info,
7409 rtx x, machine_mode mode, bool strict_p,
7410 aarch64_addr_query_type type)
7411 {
7412 enum rtx_code code = GET_CODE (x);
7413 rtx op0, op1;
7414 poly_int64 offset;
7415
7416 HOST_WIDE_INT const_size;
7417
7418 /* On BE, we use load/store pair for all large int mode load/stores.
7419 TI/TFmode may also use a load/store pair. */
7420 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7421 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7422 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7423 || type == ADDR_QUERY_LDP_STP_N
7424 || mode == TImode
7425 || mode == TFmode
7426 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7427
7428 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7429 to the actual size of the memory being loaded/stored, while the mode used
7430 for the address calculation is half of that size. */
7431 if (type == ADDR_QUERY_LDP_STP_N
7432 && known_eq (GET_MODE_SIZE (mode), 16))
7433 mode = DFmode;
7434
7435 bool allow_reg_index_p = (!load_store_pair_p
7436 && (known_lt (GET_MODE_SIZE (mode), 16)
7437 || vec_flags == VEC_ADVSIMD
7438 || vec_flags & VEC_SVE_DATA));
7439
7440 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7441 [Rn, #offset, MUL VL]. */
7442 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7443 && (code != REG && code != PLUS))
7444 return false;
7445
7446 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7447 REG addressing. */
7448 if (advsimd_struct_p
7449 && !BYTES_BIG_ENDIAN
7450 && (code != POST_INC && code != REG))
7451 return false;
7452
7453 gcc_checking_assert (GET_MODE (x) == VOIDmode
7454 || SCALAR_INT_MODE_P (GET_MODE (x)));
7455
7456 switch (code)
7457 {
7458 case REG:
7459 case SUBREG:
7460 info->type = ADDRESS_REG_IMM;
7461 info->base = x;
7462 info->offset = const0_rtx;
7463 info->const_offset = 0;
7464 return aarch64_base_register_rtx_p (x, strict_p);
7465
7466 case PLUS:
7467 op0 = XEXP (x, 0);
7468 op1 = XEXP (x, 1);
7469
7470 if (! strict_p
7471 && REG_P (op0)
7472 && virt_or_elim_regno_p (REGNO (op0))
7473 && poly_int_rtx_p (op1, &offset))
7474 {
7475 info->type = ADDRESS_REG_IMM;
7476 info->base = op0;
7477 info->offset = op1;
7478 info->const_offset = offset;
7479
7480 return true;
7481 }
7482
7483 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7484 && aarch64_base_register_rtx_p (op0, strict_p)
7485 && poly_int_rtx_p (op1, &offset))
7486 {
7487 info->type = ADDRESS_REG_IMM;
7488 info->base = op0;
7489 info->offset = op1;
7490 info->const_offset = offset;
7491
7492 /* TImode and TFmode values are allowed in both pairs of X
7493 registers and individual Q registers. The available
7494 address modes are:
7495 X,X: 7-bit signed scaled offset
7496 Q: 9-bit signed offset
7497 We conservatively require an offset representable in either mode.
7498 When performing the check for pairs of X registers i.e. LDP/STP
7499 pass down DImode since that is the natural size of the LDP/STP
7500 instruction memory accesses. */
7501 if (mode == TImode || mode == TFmode)
7502 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7503 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7504 || offset_12bit_unsigned_scaled_p (mode, offset)));
7505
7506 /* A 7-bit offset check because OImode will emit an ldp/stp
7507 instruction (only big endian will get here).
7508 For ldp/stp instructions, the offset is scaled for the size of a
7509 single element of the pair. */
7510 if (mode == OImode)
7511 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7512
7513 /* Three 9/12-bit offset checks because CImode will emit three
7514 ldr/str instructions (only big endian will get here). */
7515 if (mode == CImode)
7516 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7517 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7518 offset + 32)
7519 || offset_12bit_unsigned_scaled_p (V16QImode,
7520 offset + 32)));
7521
7522 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7523 instructions (only big endian will get here). */
7524 if (mode == XImode)
7525 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7526 && aarch64_offset_7bit_signed_scaled_p (TImode,
7527 offset + 32));
7528
7529 /* Make "m" use the LD1 offset range for SVE data modes, so
7530 that pre-RTL optimizers like ivopts will work to that
7531 instead of the wider LDR/STR range. */
7532 if (vec_flags == VEC_SVE_DATA)
7533 return (type == ADDR_QUERY_M
7534 ? offset_4bit_signed_scaled_p (mode, offset)
7535 : offset_9bit_signed_scaled_p (mode, offset));
7536
7537 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7538 {
7539 poly_int64 end_offset = (offset
7540 + GET_MODE_SIZE (mode)
7541 - BYTES_PER_SVE_VECTOR);
7542 return (type == ADDR_QUERY_M
7543 ? offset_4bit_signed_scaled_p (mode, offset)
7544 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7545 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7546 end_offset)));
7547 }
7548
7549 if (vec_flags == VEC_SVE_PRED)
7550 return offset_9bit_signed_scaled_p (mode, offset);
7551
7552 if (load_store_pair_p)
7553 return ((known_eq (GET_MODE_SIZE (mode), 4)
7554 || known_eq (GET_MODE_SIZE (mode), 8)
7555 || known_eq (GET_MODE_SIZE (mode), 16))
7556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7557 else
7558 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7559 || offset_12bit_unsigned_scaled_p (mode, offset));
7560 }
7561
7562 if (allow_reg_index_p)
7563 {
7564 /* Look for base + (scaled/extended) index register. */
7565 if (aarch64_base_register_rtx_p (op0, strict_p)
7566 && aarch64_classify_index (info, op1, mode, strict_p))
7567 {
7568 info->base = op0;
7569 return true;
7570 }
7571 if (aarch64_base_register_rtx_p (op1, strict_p)
7572 && aarch64_classify_index (info, op0, mode, strict_p))
7573 {
7574 info->base = op1;
7575 return true;
7576 }
7577 }
7578
7579 return false;
7580
7581 case POST_INC:
7582 case POST_DEC:
7583 case PRE_INC:
7584 case PRE_DEC:
7585 info->type = ADDRESS_REG_WB;
7586 info->base = XEXP (x, 0);
7587 info->offset = NULL_RTX;
7588 return aarch64_base_register_rtx_p (info->base, strict_p);
7589
7590 case POST_MODIFY:
7591 case PRE_MODIFY:
7592 info->type = ADDRESS_REG_WB;
7593 info->base = XEXP (x, 0);
7594 if (GET_CODE (XEXP (x, 1)) == PLUS
7595 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7596 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7597 && aarch64_base_register_rtx_p (info->base, strict_p))
7598 {
7599 info->offset = XEXP (XEXP (x, 1), 1);
7600 info->const_offset = offset;
7601
7602 /* TImode and TFmode values are allowed in both pairs of X
7603 registers and individual Q registers. The available
7604 address modes are:
7605 X,X: 7-bit signed scaled offset
7606 Q: 9-bit signed offset
7607 We conservatively require an offset representable in either mode.
7608 */
7609 if (mode == TImode || mode == TFmode)
7610 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7611 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7612
7613 if (load_store_pair_p)
7614 return ((known_eq (GET_MODE_SIZE (mode), 4)
7615 || known_eq (GET_MODE_SIZE (mode), 8)
7616 || known_eq (GET_MODE_SIZE (mode), 16))
7617 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7618 else
7619 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7620 }
7621 return false;
7622
7623 case CONST:
7624 case SYMBOL_REF:
7625 case LABEL_REF:
7626 /* load literal: pc-relative constant pool entry. Only supported
7627 for SI mode or larger. */
7628 info->type = ADDRESS_SYMBOLIC;
7629
7630 if (!load_store_pair_p
7631 && GET_MODE_SIZE (mode).is_constant (&const_size)
7632 && const_size >= 4)
7633 {
7634 rtx sym, addend;
7635
7636 split_const (x, &sym, &addend);
7637 return ((GET_CODE (sym) == LABEL_REF
7638 || (GET_CODE (sym) == SYMBOL_REF
7639 && CONSTANT_POOL_ADDRESS_P (sym)
7640 && aarch64_pcrelative_literal_loads)));
7641 }
7642 return false;
7643
7644 case LO_SUM:
7645 info->type = ADDRESS_LO_SUM;
7646 info->base = XEXP (x, 0);
7647 info->offset = XEXP (x, 1);
7648 if (allow_reg_index_p
7649 && aarch64_base_register_rtx_p (info->base, strict_p))
7650 {
7651 rtx sym, offs;
7652 split_const (info->offset, &sym, &offs);
7653 if (GET_CODE (sym) == SYMBOL_REF
7654 && (aarch64_classify_symbol (sym, INTVAL (offs))
7655 == SYMBOL_SMALL_ABSOLUTE))
7656 {
7657 /* The symbol and offset must be aligned to the access size. */
7658 unsigned int align;
7659
7660 if (CONSTANT_POOL_ADDRESS_P (sym))
7661 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7662 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7663 {
7664 tree exp = SYMBOL_REF_DECL (sym);
7665 align = TYPE_ALIGN (TREE_TYPE (exp));
7666 align = aarch64_constant_alignment (exp, align);
7667 }
7668 else if (SYMBOL_REF_DECL (sym))
7669 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7670 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7671 && SYMBOL_REF_BLOCK (sym) != NULL)
7672 align = SYMBOL_REF_BLOCK (sym)->alignment;
7673 else
7674 align = BITS_PER_UNIT;
7675
7676 poly_int64 ref_size = GET_MODE_SIZE (mode);
7677 if (known_eq (ref_size, 0))
7678 ref_size = GET_MODE_SIZE (DImode);
7679
7680 return (multiple_p (INTVAL (offs), ref_size)
7681 && multiple_p (align / BITS_PER_UNIT, ref_size));
7682 }
7683 }
7684 return false;
7685
7686 default:
7687 return false;
7688 }
7689 }
7690
7691 /* Return true if the address X is valid for a PRFM instruction.
7692 STRICT_P is true if we should do strict checking with
7693 aarch64_classify_address. */
7694
7695 bool
7696 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7697 {
7698 struct aarch64_address_info addr;
7699
7700 /* PRFM accepts the same addresses as DImode... */
7701 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7702 if (!res)
7703 return false;
7704
7705 /* ... except writeback forms. */
7706 return addr.type != ADDRESS_REG_WB;
7707 }
7708
7709 bool
7710 aarch64_symbolic_address_p (rtx x)
7711 {
7712 rtx offset;
7713
7714 split_const (x, &x, &offset);
7715 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7716 }
7717
7718 /* Classify the base of symbolic expression X. */
7719
7720 enum aarch64_symbol_type
7721 aarch64_classify_symbolic_expression (rtx x)
7722 {
7723 rtx offset;
7724
7725 split_const (x, &x, &offset);
7726 return aarch64_classify_symbol (x, INTVAL (offset));
7727 }
7728
7729
7730 /* Return TRUE if X is a legitimate address for accessing memory in
7731 mode MODE. */
7732 static bool
7733 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7734 {
7735 struct aarch64_address_info addr;
7736
7737 return aarch64_classify_address (&addr, x, mode, strict_p);
7738 }
7739
7740 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7741 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7742 bool
7743 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7744 aarch64_addr_query_type type)
7745 {
7746 struct aarch64_address_info addr;
7747
7748 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7749 }
7750
7751 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7752
7753 static bool
7754 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7755 poly_int64 orig_offset,
7756 machine_mode mode)
7757 {
7758 HOST_WIDE_INT size;
7759 if (GET_MODE_SIZE (mode).is_constant (&size))
7760 {
7761 HOST_WIDE_INT const_offset, second_offset;
7762
7763 /* A general SVE offset is A * VQ + B. Remove the A component from
7764 coefficient 0 in order to get the constant B. */
7765 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7766
7767 /* Split an out-of-range address displacement into a base and
7768 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7769 range otherwise to increase opportunities for sharing the base
7770 address of different sizes. Unaligned accesses use the signed
7771 9-bit range, TImode/TFmode use the intersection of signed
7772 scaled 7-bit and signed 9-bit offset. */
7773 if (mode == TImode || mode == TFmode)
7774 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7775 else if ((const_offset & (size - 1)) != 0)
7776 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7777 else
7778 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7779
7780 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7781 return false;
7782
7783 /* Split the offset into second_offset and the rest. */
7784 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7785 *offset2 = gen_int_mode (second_offset, Pmode);
7786 return true;
7787 }
7788 else
7789 {
7790 /* Get the mode we should use as the basis of the range. For structure
7791 modes this is the mode of one vector. */
7792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7793 machine_mode step_mode
7794 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7795
7796 /* Get the "mul vl" multiplier we'd like to use. */
7797 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7798 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7799 if (vec_flags & VEC_SVE_DATA)
7800 /* LDR supports a 9-bit range, but the move patterns for
7801 structure modes require all vectors to be in range of the
7802 same base. The simplest way of accommodating that while still
7803 promoting reuse of anchor points between different modes is
7804 to use an 8-bit range unconditionally. */
7805 vnum = ((vnum + 128) & 255) - 128;
7806 else
7807 /* Predicates are only handled singly, so we might as well use
7808 the full range. */
7809 vnum = ((vnum + 256) & 511) - 256;
7810 if (vnum == 0)
7811 return false;
7812
7813 /* Convert the "mul vl" multiplier into a byte offset. */
7814 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7815 if (known_eq (second_offset, orig_offset))
7816 return false;
7817
7818 /* Split the offset into second_offset and the rest. */
7819 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7820 *offset2 = gen_int_mode (second_offset, Pmode);
7821 return true;
7822 }
7823 }
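
/* Worked example (illustrative values): for a DImode access at constant
   offset 0x10008, the offset is aligned and the mode size is at least 4,
   so second_offset == 0x10008 & 0x3ffc == 0x8. The displacement is split
   into an anchor of 0x10000 plus an in-range offset of 0x8. */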
7824
7825 /* Return the binary representation of floating-point constant VALUE in INTVAL.
7826 If the value cannot be converted, return false without setting INTVAL.
7827 The conversion is done in the mode of VALUE. */
7828 bool
7829 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7830 {
7831
7832 /* We make a general exception for 0. */
7833 if (aarch64_float_const_zero_rtx_p (value))
7834 {
7835 *intval = 0;
7836 return true;
7837 }
7838
7839 scalar_float_mode mode;
7840 if (GET_CODE (value) != CONST_DOUBLE
7841 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7842 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7843 /* Only support up to DF mode. */
7844 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7845 return false;
7846
7847 unsigned HOST_WIDE_INT ival = 0;
7848
7849 long res[2];
7850 real_to_target (res,
7851 CONST_DOUBLE_REAL_VALUE (value),
7852 REAL_MODE_FORMAT (mode));
7853
7854 if (mode == DFmode)
7855 {
7856 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7857 ival = zext_hwi (res[order], 32);
7858 ival |= (zext_hwi (res[1 - order], 32) << 32);
7859 }
7860 else
7861 ival = zext_hwi (res[0], 32);
7862
7863 *intval = ival;
7864 return true;
7865 }
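
/* Worked examples (illustrative values): DFmode 1.0 is returned as
   0x3ff0000000000000 and SFmode -2.0 as 0xc0000000. */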
7866
7867 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7868 single MOV(+MOVK) followed by an FMOV. */
7869 bool
7870 aarch64_float_const_rtx_p (rtx x)
7871 {
7872 machine_mode mode = GET_MODE (x);
7873 if (mode == VOIDmode)
7874 return false;
7875
7876 /* Determine whether it's cheaper to write float constants as
7877 mov/movk pairs over ldr/adrp pairs. */
7878 unsigned HOST_WIDE_INT ival;
7879
7880 if (GET_CODE (x) == CONST_DOUBLE
7881 && SCALAR_FLOAT_MODE_P (mode)
7882 && aarch64_reinterpret_float_as_int (x, &ival))
7883 {
7884 scalar_int_mode imode = (mode == HFmode
7885 ? SImode
7886 : int_mode_for_mode (mode).require ());
7887 int num_instr = aarch64_internal_mov_immediate
7888 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7889 return num_instr < 3;
7890 }
7891
7892 return false;
7893 }
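
/* Worked example (illustrative value): DFmode 1.0 has the bit pattern
   0x3ff0000000000000 == 0x3ff0 << 48, which a single MOVZ can build, so
   mov+fmov is preferred over an adrp+ldr literal load and the function
   returns true. */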
7894
7895 /* Return TRUE if rtx X is the immediate constant 0.0. */
7896 bool
7897 aarch64_float_const_zero_rtx_p (rtx x)
7898 {
7899 if (GET_MODE (x) == VOIDmode)
7900 return false;
7901
7902 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7903 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7904 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7905 }
7906
7907 /* Return TRUE if rtx X is an immediate constant that fits in a single
7908 MOVI operation. */
7909 bool
7910 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7911 {
7912 if (!TARGET_SIMD)
7913 return false;
7914
7915 machine_mode vmode;
7916 scalar_int_mode imode;
7917 unsigned HOST_WIDE_INT ival;
7918
7919 if (GET_CODE (x) == CONST_DOUBLE
7920 && SCALAR_FLOAT_MODE_P (mode))
7921 {
7922 if (!aarch64_reinterpret_float_as_int (x, &ival))
7923 return false;
7924
7925 /* We make a general exception for 0. */
7926 if (aarch64_float_const_zero_rtx_p (x))
7927 return true;
7928
7929 imode = int_mode_for_mode (mode).require ();
7930 }
7931 else if (GET_CODE (x) == CONST_INT
7932 && is_a <scalar_int_mode> (mode, &imode))
7933 ival = INTVAL (x);
7934 else
7935 return false;
7936
7937 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
7938 use a 128-bit vector mode. */
7939 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7940
7941 vmode = aarch64_simd_container_mode (imode, width);
7942 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7943
7944 return aarch64_simd_valid_immediate (v_op, NULL);
7945 }
7946
7947
7948 /* Return the fixed registers used for condition codes. */
7949
7950 static bool
7951 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7952 {
7953 *p1 = CC_REGNUM;
7954 *p2 = INVALID_REGNUM;
7955 return true;
7956 }
7957
7958 /* This function is used by the call expanders of the machine description.
7959 RESULT is the register in which the result is returned. It's NULL for
7960 "call" and "sibcall".
7961 MEM is the location of the function call.
7962 SIBCALL indicates whether this function call is a normal call or a sibling
7963 call; a different pattern is generated accordingly. */
7964
7965 void
7966 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7967 {
7968 rtx call, callee, tmp;
7969 rtvec vec;
7970 machine_mode mode;
7971
7972 gcc_assert (MEM_P (mem));
7973 callee = XEXP (mem, 0);
7974 mode = GET_MODE (callee);
7975 gcc_assert (mode == Pmode);
7976
7977 /* Decide if we should generate indirect calls by loading the
7978 address of the callee into a register before performing
7979 the branch-and-link. */
7980 if (SYMBOL_REF_P (callee)
7981 ? (aarch64_is_long_call_p (callee)
7982 || aarch64_is_noplt_call_p (callee))
7983 : !REG_P (callee))
7984 XEXP (mem, 0) = force_reg (mode, callee);
7985
7986 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7987
7988 if (result != NULL_RTX)
7989 call = gen_rtx_SET (result, call);
7990
7991 if (sibcall)
7992 tmp = ret_rtx;
7993 else
7994 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7995
7996 vec = gen_rtvec (2, call, tmp);
7997 call = gen_rtx_PARALLEL (VOIDmode, vec);
7998
7999 aarch64_emit_call_insn (call);
8000 }
8001
8002 /* Emit call insn with PAT and do aarch64-specific handling. */
8003
8004 void
8005 aarch64_emit_call_insn (rtx pat)
8006 {
8007 rtx insn = emit_call_insn (pat);
8008
8009 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8010 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8011 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8012 }
8013
8014 machine_mode
8015 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8016 {
8017 machine_mode mode_x = GET_MODE (x);
8018 rtx_code code_x = GET_CODE (x);
8019
8020 /* All floating point compares return CCFP if it is an equality
8021 comparison, and CCFPE otherwise. */
8022 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8023 {
8024 switch (code)
8025 {
8026 case EQ:
8027 case NE:
8028 case UNORDERED:
8029 case ORDERED:
8030 case UNLT:
8031 case UNLE:
8032 case UNGT:
8033 case UNGE:
8034 case UNEQ:
8035 return CCFPmode;
8036
8037 case LT:
8038 case LE:
8039 case GT:
8040 case GE:
8041 case LTGT:
8042 return CCFPEmode;
8043
8044 default:
8045 gcc_unreachable ();
8046 }
8047 }
8048
8049 /* Equality comparisons of short modes against zero can be performed
8050 using the TST instruction with the appropriate bitmask. */
8051 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8052 && (code == EQ || code == NE)
8053 && (mode_x == HImode || mode_x == QImode))
8054 return CC_NZmode;
8055
8056 /* Similarly, comparisons of zero_extends from shorter modes can
8057 be performed using an ANDS with an immediate mask. */
8058 if (y == const0_rtx && code_x == ZERO_EXTEND
8059 && (mode_x == SImode || mode_x == DImode)
8060 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8061 && (code == EQ || code == NE))
8062 return CC_NZmode;
8063
8064 if ((mode_x == SImode || mode_x == DImode)
8065 && y == const0_rtx
8066 && (code == EQ || code == NE || code == LT || code == GE)
8067 && (code_x == PLUS || code_x == MINUS || code_x == AND
8068 || code_x == NEG
8069 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8070 && CONST_INT_P (XEXP (x, 2)))))
8071 return CC_NZmode;
8072
8073 /* A compare with a shifted operand. Because of canonicalization,
8074 the comparison will have to be swapped when we emit the assembly
8075 code. */
8076 if ((mode_x == SImode || mode_x == DImode)
8077 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8078 && (code_x == ASHIFT || code_x == ASHIFTRT
8079 || code_x == LSHIFTRT
8080 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8081 return CC_SWPmode;
8082
8083 /* Similarly for a negated operand, but we can only do this for
8084 equalities. */
8085 if ((mode_x == SImode || mode_x == DImode)
8086 && (REG_P (y) || GET_CODE (y) == SUBREG)
8087 && (code == EQ || code == NE)
8088 && code_x == NEG)
8089 return CC_Zmode;
8090
8091 /* A test for unsigned overflow from an addition. */
8092 if ((mode_x == DImode || mode_x == TImode)
8093 && (code == LTU || code == GEU)
8094 && code_x == PLUS
8095 && rtx_equal_p (XEXP (x, 0), y))
8096 return CC_Cmode;
8097
8098 /* A test for unsigned overflow from an add with carry. */
8099 if ((mode_x == DImode || mode_x == TImode)
8100 && (code == LTU || code == GEU)
8101 && code_x == PLUS
8102 && CONST_SCALAR_INT_P (y)
8103 && (rtx_mode_t (y, mode_x)
8104 == (wi::shwi (1, mode_x)
8105 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8106 return CC_ADCmode;
8107
8108 /* A test for signed overflow. */
8109 if ((mode_x == DImode || mode_x == TImode)
8110 && code == NE
8111 && code_x == PLUS
8112 && GET_CODE (y) == SIGN_EXTEND)
8113 return CC_Vmode;
8114
8115 /* For everything else, return CCmode. */
8116 return CCmode;
8117 }
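
/* Worked example (illustrative RTL): a comparison of (ashift:DI (reg x1)
   (const_int 3)) against (reg x2) selects CC_SWPmode; the instruction is
   emitted with the operands swapped, as cmp x2, x1, lsl #3, and the
   condition code mapping in aarch64_get_condition_code_1 is swapped to
   match. */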
8118
8119 static int
8120 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8121
8122 int
8123 aarch64_get_condition_code (rtx x)
8124 {
8125 machine_mode mode = GET_MODE (XEXP (x, 0));
8126 enum rtx_code comp_code = GET_CODE (x);
8127
8128 if (GET_MODE_CLASS (mode) != MODE_CC)
8129 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8130 return aarch64_get_condition_code_1 (mode, comp_code);
8131 }
8132
8133 static int
8134 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8135 {
8136 switch (mode)
8137 {
8138 case E_CCFPmode:
8139 case E_CCFPEmode:
8140 switch (comp_code)
8141 {
8142 case GE: return AARCH64_GE;
8143 case GT: return AARCH64_GT;
8144 case LE: return AARCH64_LS;
8145 case LT: return AARCH64_MI;
8146 case NE: return AARCH64_NE;
8147 case EQ: return AARCH64_EQ;
8148 case ORDERED: return AARCH64_VC;
8149 case UNORDERED: return AARCH64_VS;
8150 case UNLT: return AARCH64_LT;
8151 case UNLE: return AARCH64_LE;
8152 case UNGT: return AARCH64_HI;
8153 case UNGE: return AARCH64_PL;
8154 default: return -1;
8155 }
8156 break;
8157
8158 case E_CCmode:
8159 switch (comp_code)
8160 {
8161 case NE: return AARCH64_NE;
8162 case EQ: return AARCH64_EQ;
8163 case GE: return AARCH64_GE;
8164 case GT: return AARCH64_GT;
8165 case LE: return AARCH64_LE;
8166 case LT: return AARCH64_LT;
8167 case GEU: return AARCH64_CS;
8168 case GTU: return AARCH64_HI;
8169 case LEU: return AARCH64_LS;
8170 case LTU: return AARCH64_CC;
8171 default: return -1;
8172 }
8173 break;
8174
8175 case E_CC_SWPmode:
8176 switch (comp_code)
8177 {
8178 case NE: return AARCH64_NE;
8179 case EQ: return AARCH64_EQ;
8180 case GE: return AARCH64_LE;
8181 case GT: return AARCH64_LT;
8182 case LE: return AARCH64_GE;
8183 case LT: return AARCH64_GT;
8184 case GEU: return AARCH64_LS;
8185 case GTU: return AARCH64_CC;
8186 case LEU: return AARCH64_CS;
8187 case LTU: return AARCH64_HI;
8188 default: return -1;
8189 }
8190 break;
8191
8192 case E_CC_NZCmode:
8193 switch (comp_code)
8194 {
8195 case NE: return AARCH64_NE; /* = any */
8196 case EQ: return AARCH64_EQ; /* = none */
8197 case GE: return AARCH64_PL; /* = nfrst */
8198 case LT: return AARCH64_MI; /* = first */
8199 case GEU: return AARCH64_CS; /* = nlast */
8200 case GTU: return AARCH64_HI; /* = pmore */
8201 case LEU: return AARCH64_LS; /* = plast */
8202 case LTU: return AARCH64_CC; /* = last */
8203 default: return -1;
8204 }
8205 break;
8206
8207 case E_CC_NZmode:
8208 switch (comp_code)
8209 {
8210 case NE: return AARCH64_NE;
8211 case EQ: return AARCH64_EQ;
8212 case GE: return AARCH64_PL;
8213 case LT: return AARCH64_MI;
8214 default: return -1;
8215 }
8216 break;
8217
8218 case E_CC_Zmode:
8219 switch (comp_code)
8220 {
8221 case NE: return AARCH64_NE;
8222 case EQ: return AARCH64_EQ;
8223 default: return -1;
8224 }
8225 break;
8226
8227 case E_CC_Cmode:
8228 switch (comp_code)
8229 {
8230 case LTU: return AARCH64_CS;
8231 case GEU: return AARCH64_CC;
8232 default: return -1;
8233 }
8234 break;
8235
8236 case E_CC_ADCmode:
8237 switch (comp_code)
8238 {
8239 case GEU: return AARCH64_CS;
8240 case LTU: return AARCH64_CC;
8241 default: return -1;
8242 }
8243 break;
8244
8245 case E_CC_Vmode:
8246 switch (comp_code)
8247 {
8248 case NE: return AARCH64_VS;
8249 case EQ: return AARCH64_VC;
8250 default: return -1;
8251 }
8252 break;
8253
8254 default:
8255 return -1;
8256 }
8257
8258 return -1;
8259 }
8260
8261 bool
8262 aarch64_const_vec_all_same_in_range_p (rtx x,
8263 HOST_WIDE_INT minval,
8264 HOST_WIDE_INT maxval)
8265 {
8266 rtx elt;
8267 return (const_vec_duplicate_p (x, &elt)
8268 && CONST_INT_P (elt)
8269 && IN_RANGE (INTVAL (elt), minval, maxval));
8270 }
8271
8272 bool
8273 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8274 {
8275 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8276 }
8277
8278 /* Return true if VEC is a constant in which every element is in the range
8279 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8280
8281 static bool
8282 aarch64_const_vec_all_in_range_p (rtx vec,
8283 HOST_WIDE_INT minval,
8284 HOST_WIDE_INT maxval)
8285 {
8286 if (GET_CODE (vec) != CONST_VECTOR
8287 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8288 return false;
8289
8290 int nunits;
8291 if (!CONST_VECTOR_STEPPED_P (vec))
8292 nunits = const_vector_encoded_nelts (vec);
8293 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8294 return false;
8295
8296 for (int i = 0; i < nunits; i++)
8297 {
8298 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8299 if (!CONST_INT_P (vec_elem)
8300 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8301 return false;
8302 }
8303 return true;
8304 }
8305
8306 /* N Z C V. */
8307 #define AARCH64_CC_V 1
8308 #define AARCH64_CC_C (1 << 1)
8309 #define AARCH64_CC_Z (1 << 2)
8310 #define AARCH64_CC_N (1 << 3)
8311
8312 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8313 static const int aarch64_nzcv_codes[] =
8314 {
8315 0, /* EQ, Z == 1. */
8316 AARCH64_CC_Z, /* NE, Z == 0. */
8317 0, /* CS, C == 1. */
8318 AARCH64_CC_C, /* CC, C == 0. */
8319 0, /* MI, N == 1. */
8320 AARCH64_CC_N, /* PL, N == 0. */
8321 0, /* VS, V == 1. */
8322 AARCH64_CC_V, /* VC, V == 0. */
8323 0, /* HI, C == 1 && Z == 0. */
8324 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8325 AARCH64_CC_V, /* GE, N == V. */
8326 0, /* LT, N != V. */
8327 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8328 0, /* LE, !(Z == 0 && N == V). */
8329 0, /* AL, Any. */
8330 0 /* NV, Any. */
8331 };
8332
8333 /* Print floating-point vector immediate operand X to F, negating it
8334 first if NEGATE is true. Return true on success, false if it isn't
8335 a constant we can handle. */
8336
8337 static bool
8338 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8339 {
8340 rtx elt;
8341
8342 if (!const_vec_duplicate_p (x, &elt))
8343 return false;
8344
8345 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8346 if (negate)
8347 r = real_value_negate (&r);
8348
8349 /* Handle the SVE single-bit immediates specially, since they have a
8350 fixed form in the assembly syntax. */
8351 if (real_equal (&r, &dconst0))
8352 asm_fprintf (f, "0.0");
8353 else if (real_equal (&r, &dconst2))
8354 asm_fprintf (f, "2.0");
8355 else if (real_equal (&r, &dconst1))
8356 asm_fprintf (f, "1.0");
8357 else if (real_equal (&r, &dconsthalf))
8358 asm_fprintf (f, "0.5");
8359 else
8360 {
8361 const int buf_size = 20;
8362 char float_buf[buf_size] = {'\0'};
8363 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8364 1, GET_MODE (elt));
8365 asm_fprintf (f, "%s", float_buf);
8366 }
8367
8368 return true;
8369 }
8370
8371 /* Return the equivalent letter for size. */
8372 static char
8373 sizetochar (int size)
8374 {
8375 switch (size)
8376 {
8377 case 64: return 'd';
8378 case 32: return 's';
8379 case 16: return 'h';
8380 case 8 : return 'b';
8381 default: gcc_unreachable ();
8382 }
8383 }
8384
8385 /* Print operand X to file F in a target specific manner according to CODE.
8386 The acceptable formatting commands given by CODE are:
8387 'c': An integer or symbol address without a preceding #
8388 sign.
8389 'C': Take the duplicated element in a vector constant
8390 and print it in hex.
8391 'D': Take the duplicated element in a vector constant
8392 and print it as an unsigned integer, in decimal.
8393 'e': Print the sign/zero-extend size as a character 8->b,
8394 16->h, 32->w. Can also be used for masks:
8395 0xff->b, 0xffff->h, 0xffffffff->w.
8396 'I': If the operand is a duplicated vector constant,
8397 replace it with the duplicated scalar. If the
8398 operand is then a floating-point constant, replace
8399 it with the integer bit representation. Print the
8400 transformed constant as a signed decimal number.
8401 'p': Prints N such that 2^N == X (X must be power of 2 and
8402 const int).
8403 'P': Print the number of non-zero bits in X (a const_int).
8404 'H': Print the higher numbered register of a pair (TImode)
8405 of regs.
8406 'm': Print a condition (eq, ne, etc).
8407 'M': Same as 'm', but invert condition.
8408 'N': Take the duplicated element in a vector constant
8409 and print the negative of it in decimal.
8410 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8411 'S/T/U/V': Print a FP/SIMD register name for a register list.
8412 The register printed is the FP/SIMD register name
8413 of X + 0/1/2/3 for S/T/U/V.
8414 'R': Print a scalar FP/SIMD register name + 1.
8415 'X': Print bottom 16 bits of integer constant in hex.
8416 'w/x': Print a general register name or the zero register
8417 (32-bit or 64-bit).
8418 '0': Print a normal operand; if it's a general register,
8419 then we assume DImode.
8420 'k': Print NZCV for conditional compare instructions.
8421 'A': Output address constant representing the first
8422 argument of X, specifying a relocation offset
8423 if appropriate.
8424 'L': Output constant address specified by X
8425 with a relocation offset if appropriate.
8426 'G': Prints address of X, specifying a PC relative
8427 relocation mode if appropriate.
8428 'y': Output address of LDP or STP - this is used for
8429 some LDP/STPs which don't use a PARALLEL in their
8430 pattern (so the mode needs to be adjusted).
8431 'z': Output address of a typical LDP or STP. */
8432
8433 static void
8434 aarch64_print_operand (FILE *f, rtx x, int code)
8435 {
8436 rtx elt;
8437 switch (code)
8438 {
8439 case 'c':
8440 switch (GET_CODE (x))
8441 {
8442 case CONST_INT:
8443 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8444 break;
8445
8446 case SYMBOL_REF:
8447 output_addr_const (f, x);
8448 break;
8449
8450 case CONST:
8451 if (GET_CODE (XEXP (x, 0)) == PLUS
8452 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8453 {
8454 output_addr_const (f, x);
8455 break;
8456 }
8457 /* Fall through. */
8458
8459 default:
8460 output_operand_lossage ("unsupported operand for code '%c'", code);
8461 }
8462 break;
8463
8464 case 'e':
8465 {
8466 x = unwrap_const_vec_duplicate (x);
8467 if (!CONST_INT_P (x))
8468 {
8469 output_operand_lossage ("invalid operand for '%%%c'", code);
8470 return;
8471 }
8472
8473 HOST_WIDE_INT val = INTVAL (x);
8474 if ((val & ~7) == 8 || val == 0xff)
8475 fputc ('b', f);
8476 else if ((val & ~7) == 16 || val == 0xffff)
8477 fputc ('h', f);
8478 else if ((val & ~7) == 32 || val == 0xffffffff)
8479 fputc ('w', f);
8480 else
8481 {
8482 output_operand_lossage ("invalid operand for '%%%c'", code);
8483 return;
8484 }
8485 }
8486 break;
8487
8488 case 'p':
8489 {
8490 int n;
8491
8492 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8493 {
8494 output_operand_lossage ("invalid operand for '%%%c'", code);
8495 return;
8496 }
8497
8498 asm_fprintf (f, "%d", n);
8499 }
8500 break;
8501
8502 case 'P':
8503 if (!CONST_INT_P (x))
8504 {
8505 output_operand_lossage ("invalid operand for '%%%c'", code);
8506 return;
8507 }
8508
8509 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8510 break;
8511
8512 case 'H':
8513 if (x == const0_rtx)
8514 {
8515 asm_fprintf (f, "xzr");
8516 break;
8517 }
8518
8519 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8520 {
8521 output_operand_lossage ("invalid operand for '%%%c'", code);
8522 return;
8523 }
8524
8525 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8526 break;
8527
8528 case 'I':
8529 {
8530 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8531 if (CONST_INT_P (x))
8532 asm_fprintf (f, "%wd", INTVAL (x));
8533 else
8534 {
8535 output_operand_lossage ("invalid operand for '%%%c'", code);
8536 return;
8537 }
8538 break;
8539 }
8540
8541 case 'M':
8542 case 'm':
8543 {
8544 int cond_code;
8545 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8546 if (x == const_true_rtx)
8547 {
8548 if (code == 'M')
8549 fputs ("nv", f);
8550 return;
8551 }
8552
8553 if (!COMPARISON_P (x))
8554 {
8555 output_operand_lossage ("invalid operand for '%%%c'", code);
8556 return;
8557 }
8558
8559 cond_code = aarch64_get_condition_code (x);
8560 gcc_assert (cond_code >= 0);
8561 if (code == 'M')
8562 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8563 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8564 fputs (aarch64_sve_condition_codes[cond_code], f);
8565 else
8566 fputs (aarch64_condition_codes[cond_code], f);
8567 }
8568 break;
8569
8570 case 'N':
8571 if (!const_vec_duplicate_p (x, &elt))
8572 {
8573 output_operand_lossage ("invalid vector constant");
8574 return;
8575 }
8576
8577 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8578 asm_fprintf (f, "%wd", -INTVAL (elt));
8579 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8580 && aarch64_print_vector_float_operand (f, x, true))
8581 ;
8582 else
8583 {
8584 output_operand_lossage ("invalid vector constant");
8585 return;
8586 }
8587 break;
8588
8589 case 'b':
8590 case 'h':
8591 case 's':
8592 case 'd':
8593 case 'q':
8594 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8595 {
8596 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8597 return;
8598 }
8599 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8600 break;
8601
8602 case 'S':
8603 case 'T':
8604 case 'U':
8605 case 'V':
8606 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8607 {
8608 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8609 return;
8610 }
8611 asm_fprintf (f, "%c%d",
8612 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8613 REGNO (x) - V0_REGNUM + (code - 'S'));
8614 break;
8615
8616 case 'R':
8617 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8618 {
8619 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8620 return;
8621 }
8622 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8623 break;
8624
8625 case 'X':
8626 if (!CONST_INT_P (x))
8627 {
8628 output_operand_lossage ("invalid operand for '%%%c'", code);
8629 return;
8630 }
8631 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8632 break;
8633
8634 case 'C':
8635 {
8636 /* Print a replicated constant in hex. */
8637 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8638 {
8639 output_operand_lossage ("invalid operand for '%%%c'", code);
8640 return;
8641 }
8642 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8643 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8644 }
8645 break;
8646
8647 case 'D':
8648 {
8649 /* Print a replicated constant in decimal, treating it as
8650 unsigned. */
8651 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8652 {
8653 output_operand_lossage ("invalid operand for '%%%c'", code);
8654 return;
8655 }
8656 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8657 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8658 }
8659 break;
8660
8661 case 'w':
8662 case 'x':
8663 if (x == const0_rtx
8664 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8665 {
8666 asm_fprintf (f, "%czr", code);
8667 break;
8668 }
8669
8670 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8671 {
8672 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8673 break;
8674 }
8675
8676 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8677 {
8678 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8679 break;
8680 }
8681
8682 /* Fall through */
8683
8684 case 0:
8685 if (x == NULL)
8686 {
8687 output_operand_lossage ("missing operand");
8688 return;
8689 }
8690
8691 switch (GET_CODE (x))
8692 {
8693 case REG:
8694 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8695 {
8696 if (REG_NREGS (x) == 1)
8697 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8698 else
8699 {
8700 char suffix
8701 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8702 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8703 REGNO (x) - V0_REGNUM, suffix,
8704 END_REGNO (x) - V0_REGNUM - 1, suffix);
8705 }
8706 }
8707 else
8708 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8709 break;
8710
8711 case MEM:
8712 output_address (GET_MODE (x), XEXP (x, 0));
8713 break;
8714
8715 case LABEL_REF:
8716 case SYMBOL_REF:
8717 output_addr_const (asm_out_file, x);
8718 break;
8719
8720 case CONST_INT:
8721 asm_fprintf (f, "%wd", INTVAL (x));
8722 break;
8723
8724 case CONST:
8725 if (!VECTOR_MODE_P (GET_MODE (x)))
8726 {
8727 output_addr_const (asm_out_file, x);
8728 break;
8729 }
8730 /* fall through */
8731
8732 case CONST_VECTOR:
8733 if (!const_vec_duplicate_p (x, &elt))
8734 {
8735 output_operand_lossage ("invalid vector constant");
8736 return;
8737 }
8738
8739 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8740 asm_fprintf (f, "%wd", INTVAL (elt));
8741 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8742 && aarch64_print_vector_float_operand (f, x, false))
8743 ;
8744 else
8745 {
8746 output_operand_lossage ("invalid vector constant");
8747 return;
8748 }
8749 break;
8750
8751 case CONST_DOUBLE:
8752 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8753 be getting CONST_DOUBLEs holding integers. */
8754 gcc_assert (GET_MODE (x) != VOIDmode);
8755 if (aarch64_float_const_zero_rtx_p (x))
8756 {
8757 fputc ('0', f);
8758 break;
8759 }
8760 else if (aarch64_float_const_representable_p (x))
8761 {
8762 #define buf_size 20
8763 char float_buf[buf_size] = {'\0'};
8764 real_to_decimal_for_mode (float_buf,
8765 CONST_DOUBLE_REAL_VALUE (x),
8766 buf_size, buf_size,
8767 1, GET_MODE (x));
8768 asm_fprintf (asm_out_file, "%s", float_buf);
8769 break;
8770 #undef buf_size
8771 }
8772 output_operand_lossage ("invalid constant");
8773 return;
8774 default:
8775 output_operand_lossage ("invalid operand");
8776 return;
8777 }
8778 break;
8779
8780 case 'A':
8781 if (GET_CODE (x) == HIGH)
8782 x = XEXP (x, 0);
8783
8784 switch (aarch64_classify_symbolic_expression (x))
8785 {
8786 case SYMBOL_SMALL_GOT_4G:
8787 asm_fprintf (asm_out_file, ":got:");
8788 break;
8789
8790 case SYMBOL_SMALL_TLSGD:
8791 asm_fprintf (asm_out_file, ":tlsgd:");
8792 break;
8793
8794 case SYMBOL_SMALL_TLSDESC:
8795 asm_fprintf (asm_out_file, ":tlsdesc:");
8796 break;
8797
8798 case SYMBOL_SMALL_TLSIE:
8799 asm_fprintf (asm_out_file, ":gottprel:");
8800 break;
8801
8802 case SYMBOL_TLSLE24:
8803 asm_fprintf (asm_out_file, ":tprel:");
8804 break;
8805
8806 case SYMBOL_TINY_GOT:
8807 gcc_unreachable ();
8808 break;
8809
8810 default:
8811 break;
8812 }
8813 output_addr_const (asm_out_file, x);
8814 break;
8815
8816 case 'L':
8817 switch (aarch64_classify_symbolic_expression (x))
8818 {
8819 case SYMBOL_SMALL_GOT_4G:
8820 asm_fprintf (asm_out_file, ":lo12:");
8821 break;
8822
8823 case SYMBOL_SMALL_TLSGD:
8824 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8825 break;
8826
8827 case SYMBOL_SMALL_TLSDESC:
8828 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8829 break;
8830
8831 case SYMBOL_SMALL_TLSIE:
8832 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8833 break;
8834
8835 case SYMBOL_TLSLE12:
8836 asm_fprintf (asm_out_file, ":tprel_lo12:");
8837 break;
8838
8839 case SYMBOL_TLSLE24:
8840 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8841 break;
8842
8843 case SYMBOL_TINY_GOT:
8844 asm_fprintf (asm_out_file, ":got:");
8845 break;
8846
8847 case SYMBOL_TINY_TLSIE:
8848 asm_fprintf (asm_out_file, ":gottprel:");
8849 break;
8850
8851 default:
8852 break;
8853 }
8854 output_addr_const (asm_out_file, x);
8855 break;
8856
8857 case 'G':
8858 switch (aarch64_classify_symbolic_expression (x))
8859 {
8860 case SYMBOL_TLSLE24:
8861 asm_fprintf (asm_out_file, ":tprel_hi12:");
8862 break;
8863 default:
8864 break;
8865 }
8866 output_addr_const (asm_out_file, x);
8867 break;
8868
8869 case 'k':
8870 {
8871 HOST_WIDE_INT cond_code;
8872
8873 if (!CONST_INT_P (x))
8874 {
8875 output_operand_lossage ("invalid operand for '%%%c'", code);
8876 return;
8877 }
8878
8879 cond_code = INTVAL (x);
8880 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8881 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8882 }
8883 break;
8884
8885 case 'y':
8886 case 'z':
8887 {
8888 machine_mode mode = GET_MODE (x);
8889
8890 if (GET_CODE (x) != MEM
8891 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8892 {
8893 output_operand_lossage ("invalid operand for '%%%c'", code);
8894 return;
8895 }
8896
8897 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8898 code == 'y'
8899 ? ADDR_QUERY_LDP_STP_N
8900 : ADDR_QUERY_LDP_STP))
8901 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8902 }
8903 break;
8904
8905 default:
8906 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8907 return;
8908 }
8909 }
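
/* Editorial sketch (not part of aarch64.c): the asserts below restate, on
   plain C types, the value transformations performed by the '%X' and '%p'
   modifiers handled above; the sketch_* names are made up for illustration.  */
#include <assert.h>

/* '%X' prints only the low 16 bits of the constant operand, in hex.  */
static unsigned long long
sketch_modifier_X (unsigned long long val)
{
  return val & 0xffff;
}

/* '%p' prints log2 of an exact power of two and is rejected otherwise
   (modeled here by returning -1 where the real code calls
   output_operand_lossage).  */
static int
sketch_modifier_p (unsigned long long val)
{
  if (val == 0 || (val & (val - 1)) != 0)
    return -1;
  int n = 0;
  while ((val >>= 1) != 0)
    n++;
  return n;
}

static void
sketch_modifier_selfcheck (void)
{
  assert (sketch_modifier_X (0x12345) == 0x2345);  /* printed as 0x2345  */
  assert (sketch_modifier_p (64) == 6);            /* printed as 6       */
  assert (sketch_modifier_p (0x1234) == -1);       /* not a power of two */
}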
8910
8911 /* Print address 'x' of a memory access with mode 'mode'.
8912    'type' is the aarch64_addr_query_type context required by
8913    aarch64_classify_address (e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand).  */
8914 static bool
8915 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8916 aarch64_addr_query_type type)
8917 {
8918 struct aarch64_address_info addr;
8919 unsigned int size;
8920
8921 /* Check all addresses are Pmode - including ILP32. */
8922 if (GET_MODE (x) != Pmode
8923 && (!CONST_INT_P (x)
8924 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8925 {
8926 output_operand_lossage ("invalid address mode");
8927 return false;
8928 }
8929
8930 if (aarch64_classify_address (&addr, x, mode, true, type))
8931 switch (addr.type)
8932 {
8933 case ADDRESS_REG_IMM:
8934 if (known_eq (addr.const_offset, 0))
8935 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8936 else if (aarch64_sve_data_mode_p (mode))
8937 {
8938 HOST_WIDE_INT vnum
8939 = exact_div (addr.const_offset,
8940 BYTES_PER_SVE_VECTOR).to_constant ();
8941 asm_fprintf (f, "[%s, #%wd, mul vl]",
8942 reg_names[REGNO (addr.base)], vnum);
8943 }
8944 else if (aarch64_sve_pred_mode_p (mode))
8945 {
8946 HOST_WIDE_INT vnum
8947 = exact_div (addr.const_offset,
8948 BYTES_PER_SVE_PRED).to_constant ();
8949 asm_fprintf (f, "[%s, #%wd, mul vl]",
8950 reg_names[REGNO (addr.base)], vnum);
8951 }
8952 else
8953 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8954 INTVAL (addr.offset));
8955 return true;
8956
8957 case ADDRESS_REG_REG:
8958 if (addr.shift == 0)
8959 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8960 reg_names [REGNO (addr.offset)]);
8961 else
8962 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8963 reg_names [REGNO (addr.offset)], addr.shift);
8964 return true;
8965
8966 case ADDRESS_REG_UXTW:
8967 if (addr.shift == 0)
8968 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8969 REGNO (addr.offset) - R0_REGNUM);
8970 else
8971 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8972 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8973 return true;
8974
8975 case ADDRESS_REG_SXTW:
8976 if (addr.shift == 0)
8977 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8978 REGNO (addr.offset) - R0_REGNUM);
8979 else
8980 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8981 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8982 return true;
8983
8984 case ADDRESS_REG_WB:
8985 /* Writeback is only supported for fixed-width modes. */
8986 size = GET_MODE_SIZE (mode).to_constant ();
8987 switch (GET_CODE (x))
8988 {
8989 case PRE_INC:
8990 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8991 return true;
8992 case POST_INC:
8993 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8994 return true;
8995 case PRE_DEC:
8996 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8997 return true;
8998 case POST_DEC:
8999 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9000 return true;
9001 case PRE_MODIFY:
9002 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9003 INTVAL (addr.offset));
9004 return true;
9005 case POST_MODIFY:
9006 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9007 INTVAL (addr.offset));
9008 return true;
9009 default:
9010 break;
9011 }
9012 break;
9013
9014 case ADDRESS_LO_SUM:
9015 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9016 output_addr_const (f, addr.offset);
9017 asm_fprintf (f, "]");
9018 return true;
9019
9020 case ADDRESS_SYMBOLIC:
9021 output_addr_const (f, x);
9022 return true;
9023 }
9024
9025 return false;
9026 }
9027
9028 /* Print address 'x' of a memory access with mode 'mode'. */
9029 static void
9030 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9031 {
9032 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9033 output_addr_const (f, x);
9034 }
9035
9036 bool
9037 aarch64_label_mentioned_p (rtx x)
9038 {
9039 const char *fmt;
9040 int i;
9041
9042 if (GET_CODE (x) == LABEL_REF)
9043 return true;
9044
9045 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9046 referencing instruction, but they are constant offsets, not
9047 symbols. */
9048 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9049 return false;
9050
9051 fmt = GET_RTX_FORMAT (GET_CODE (x));
9052 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9053 {
9054 if (fmt[i] == 'E')
9055 {
9056 int j;
9057
9058 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9059 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9060 return 1;
9061 }
9062 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9063 return 1;
9064 }
9065
9066 return 0;
9067 }
9068
9069 /* Implement REGNO_REG_CLASS. */
9070
9071 enum reg_class
9072 aarch64_regno_regclass (unsigned regno)
9073 {
9074 if (GP_REGNUM_P (regno))
9075 return GENERAL_REGS;
9076
9077 if (regno == SP_REGNUM)
9078 return STACK_REG;
9079
9080 if (regno == FRAME_POINTER_REGNUM
9081 || regno == ARG_POINTER_REGNUM)
9082 return POINTER_REGS;
9083
9084 if (FP_REGNUM_P (regno))
9085 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9086 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9087
9088 if (PR_REGNUM_P (regno))
9089 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9090
9091 return NO_REGS;
9092 }
9093
9094 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9095 If OFFSET is out of range, return an offset of an anchor point
9096 that is in range. Return 0 otherwise. */
9097
9098 static HOST_WIDE_INT
9099 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9100 machine_mode mode)
9101 {
9102 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9103 if (size > 16)
9104 return (offset + 0x400) & ~0x7f0;
9105
9106 /* For offsets that aren't a multiple of the access size, the limit is
9107 -256...255. */
9108 if (offset & (size - 1))
9109 {
9110 /* BLKmode typically uses LDP of X-registers. */
9111 if (mode == BLKmode)
9112 return (offset + 512) & ~0x3ff;
9113 return (offset + 0x100) & ~0x1ff;
9114 }
9115
9116 /* Small negative offsets are supported. */
9117 if (IN_RANGE (offset, -256, 0))
9118 return 0;
9119
9120 if (mode == TImode || mode == TFmode)
9121 return (offset + 0x100) & ~0x1ff;
9122
9123   /* Use a 12-bit offset, scaled by the access size.  */
9124 return offset & (~0xfff * size);
9125 }
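
/* Editorial sketch (not part of aarch64.c): the rounding above, restated on
   plain C types for the common case of a scalar access whose offset is a
   multiple of its size (the BLKmode and TImode/TFmode special cases are left
   out).  The sketch_* names are made up for illustration.  */
#include <assert.h>

static long long
sketch_anchor_offset (long long offset, long long size)
{
  if (size > 16)			/* Likely an LDP/STP of X-registers.  */
    return (offset + 0x400) & ~0x7f0;
  if (offset & (size - 1))		/* Unscaled form, range -256..255.  */
    return (offset + 0x100) & ~0x1ff;
  if (offset >= -256 && offset <= 0)	/* Reachable directly, no anchor.  */
    return 0;
  return offset & (~0xfffLL * size);	/* 12-bit offset scaled by size.  */
}

static void
sketch_anchor_offset_selfcheck (void)
{
  /* A 4-byte access at base + 0x12344 is re-anchored at base + 0x10000,
     leaving a residual of 0x2344, which fits the scaled 12-bit immediate
     of a plain LDR/STR (at most 0xfff * 4 = 0x3ffc).  */
  assert (sketch_anchor_offset (0x12344, 4) == 0x10000);
  assert (sketch_anchor_offset (-32, 8) == 0);
}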
9126
9127 static rtx
9128 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9129 {
9130 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9131 where mask is selected by alignment and size of the offset.
9132 We try to pick as large a range for the offset as possible to
9133 maximize the chance of a CSE. However, for aligned addresses
9134 we limit the range to 4k so that structures with different sized
9135 elements are likely to use the same base. We need to be careful
9136 not to split a CONST for some forms of address expression, otherwise
9137 it will generate sub-optimal code. */
9138
9139 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9140 {
9141 rtx base = XEXP (x, 0);
9142 rtx offset_rtx = XEXP (x, 1);
9143 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9144
9145 if (GET_CODE (base) == PLUS)
9146 {
9147 rtx op0 = XEXP (base, 0);
9148 rtx op1 = XEXP (base, 1);
9149
9150 /* Force any scaling into a temp for CSE. */
9151 op0 = force_reg (Pmode, op0);
9152 op1 = force_reg (Pmode, op1);
9153
9154 /* Let the pointer register be in op0. */
9155 if (REG_POINTER (op1))
9156 std::swap (op0, op1);
9157
9158 /* If the pointer is virtual or frame related, then we know that
9159 virtual register instantiation or register elimination is going
9160 to apply a second constant. We want the two constants folded
9161 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9162 if (virt_or_elim_regno_p (REGNO (op0)))
9163 {
9164 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9165 NULL_RTX, true, OPTAB_DIRECT);
9166 return gen_rtx_PLUS (Pmode, base, op1);
9167 }
9168
9169 /* Otherwise, in order to encourage CSE (and thence loop strength
9170 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9171 base = expand_binop (Pmode, add_optab, op0, op1,
9172 NULL_RTX, true, OPTAB_DIRECT);
9173 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9174 }
9175
9176 HOST_WIDE_INT size;
9177 if (GET_MODE_SIZE (mode).is_constant (&size))
9178 {
9179 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9180 mode);
9181 if (base_offset != 0)
9182 {
9183 base = plus_constant (Pmode, base, base_offset);
9184 base = force_operand (base, NULL_RTX);
9185 return plus_constant (Pmode, base, offset - base_offset);
9186 }
9187 }
9188 }
9189
9190 return x;
9191 }
9192
9193 static reg_class_t
9194 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9195 reg_class_t rclass,
9196 machine_mode mode,
9197 secondary_reload_info *sri)
9198 {
9199 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9200 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9201 comment at the head of aarch64-sve.md for more details about the
9202 big-endian handling. */
9203 if (BYTES_BIG_ENDIAN
9204 && reg_class_subset_p (rclass, FP_REGS)
9205 && !((REG_P (x) && HARD_REGISTER_P (x))
9206 || aarch64_simd_valid_immediate (x, NULL))
9207 && aarch64_sve_data_mode_p (mode))
9208 {
9209 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9210 return NO_REGS;
9211 }
9212
9213 /* If we have to disable direct literal pool loads and stores because the
9214 function is too big, then we need a scratch register. */
9215 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9216 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9217 || targetm.vector_mode_supported_p (GET_MODE (x)))
9218 && !aarch64_pcrelative_literal_loads)
9219 {
9220 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9221 return NO_REGS;
9222 }
9223
9224 /* Without the TARGET_SIMD instructions we cannot move a Q register
9225 to a Q register directly. We need a scratch. */
9226 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9227 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9228 && reg_class_subset_p (rclass, FP_REGS))
9229 {
9230 sri->icode = code_for_aarch64_reload_mov (mode);
9231 return NO_REGS;
9232 }
9233
9234   /* A TFmode or TImode memory access should be handled via FP_REGS
9235      because AArch64 has richer addressing modes for LDR/STR instructions
9236      than for LDP/STP instructions.  */
9237 if (TARGET_FLOAT && rclass == GENERAL_REGS
9238 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9239 return FP_REGS;
9240
9241   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
9242 return GENERAL_REGS;
9243
9244 return NO_REGS;
9245 }
9246
9247 static bool
9248 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9249 {
9250 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9251
9252 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9253 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9254 if (frame_pointer_needed)
9255 return to == HARD_FRAME_POINTER_REGNUM;
9256 return true;
9257 }
9258
9259 poly_int64
9260 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9261 {
9262 if (to == HARD_FRAME_POINTER_REGNUM)
9263 {
9264 if (from == ARG_POINTER_REGNUM)
9265 return cfun->machine->frame.hard_fp_offset;
9266
9267 if (from == FRAME_POINTER_REGNUM)
9268 return cfun->machine->frame.hard_fp_offset
9269 - cfun->machine->frame.locals_offset;
9270 }
9271
9272 if (to == STACK_POINTER_REGNUM)
9273 {
9274 if (from == FRAME_POINTER_REGNUM)
9275 return cfun->machine->frame.frame_size
9276 - cfun->machine->frame.locals_offset;
9277 }
9278
9279 return cfun->machine->frame.frame_size;
9280 }
9281
9282 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9283 previous frame. */
9284
9285 rtx
9286 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9287 {
9288 if (count != 0)
9289 return const0_rtx;
9290 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9291 }
9292
9293
9294 static void
9295 aarch64_asm_trampoline_template (FILE *f)
9296 {
9297 int offset1 = 16;
9298 int offset2 = 20;
9299
9300 if (aarch64_bti_enabled ())
9301 {
9302 asm_fprintf (f, "\thint\t34 // bti c\n");
9303 offset1 -= 4;
9304 offset2 -= 4;
9305 }
9306
9307 if (TARGET_ILP32)
9308 {
9309 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9310 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9311 offset1);
9312 }
9313 else
9314 {
9315 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9316 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9317 offset2);
9318 }
9319 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9320
9321   /* The trampoline needs an extra padding instruction.  If BTI is
9322      enabled, the padding instruction is replaced by the BTI instruction
9323      at the beginning.  */
9324 if (!aarch64_bti_enabled ())
9325 assemble_aligned_integer (4, const0_rtx);
9326
9327 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9328 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9329 }
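
/* Editorial sketch (not part of aarch64.c): the LP64 trampoline produced by
   the template above and filled in by aarch64_trampoline_init below, written
   out as a C layout.  The register names assume IP1 is x17 and the static
   chain register is x18, as in the usual AArch64 configuration; the struct
   and field names are made up for illustration.  */
struct sketch_aarch64_trampoline_lp64
{
  /* 16 bytes of code:
	ldr	x17, .+16	// load FUNC
	ldr	x18, .+20	// load CHAIN
	br	x17
	.word	0		// padding
     With BTI the "bti c" hint comes first, the padding word is dropped and
     the two PC-relative offsets shrink by 4.  */
  unsigned int code[4];
  unsigned long long func;	/* Target address, written by trampoline_init.  */
  unsigned long long chain;	/* Static chain value.  */
};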
9330
9331 static void
9332 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9333 {
9334 rtx fnaddr, mem, a_tramp;
9335 const int tramp_code_sz = 16;
9336
9337   /* Don't need to copy the trailing D-words; we fill those in below.  */
9338 emit_block_move (m_tramp, assemble_trampoline_template (),
9339 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9340 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9341 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9342 if (GET_MODE (fnaddr) != ptr_mode)
9343 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9344 emit_move_insn (mem, fnaddr);
9345
9346 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9347 emit_move_insn (mem, chain_value);
9348
9349 /* XXX We should really define a "clear_cache" pattern and use
9350 gen_clear_cache(). */
9351 a_tramp = XEXP (m_tramp, 0);
9352 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9353 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9354 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9355 ptr_mode);
9356 }
9357
9358 static unsigned char
9359 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9360 {
9361 /* ??? Logically we should only need to provide a value when
9362 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9363 can hold MODE, but at the moment we need to handle all modes.
9364 Just ignore any runtime parts for registers that can't store them. */
9365 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9366 unsigned int nregs;
9367 switch (regclass)
9368 {
9369 case TAILCALL_ADDR_REGS:
9370 case POINTER_REGS:
9371 case GENERAL_REGS:
9372 case ALL_REGS:
9373 case POINTER_AND_FP_REGS:
9374 case FP_REGS:
9375 case FP_LO_REGS:
9376 case FP_LO8_REGS:
9377 if (aarch64_sve_data_mode_p (mode)
9378 && constant_multiple_p (GET_MODE_SIZE (mode),
9379 BYTES_PER_SVE_VECTOR, &nregs))
9380 return nregs;
9381 return (aarch64_vector_data_mode_p (mode)
9382 ? CEIL (lowest_size, UNITS_PER_VREG)
9383 : CEIL (lowest_size, UNITS_PER_WORD));
9384 case STACK_REG:
9385 case PR_REGS:
9386 case PR_LO_REGS:
9387 case PR_HI_REGS:
9388 return 1;
9389
9390 case NO_REGS:
9391 return 0;
9392
9393 default:
9394 break;
9395 }
9396 gcc_unreachable ();
9397 }
9398
9399 static reg_class_t
9400 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9401 {
9402 if (regclass == POINTER_REGS)
9403 return GENERAL_REGS;
9404
9405 if (regclass == STACK_REG)
9406 {
9407       if (REG_P (x)
9408 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9409 return regclass;
9410
9411 return NO_REGS;
9412 }
9413
9414   /* Register elimination can result in a request for
9415      SP+constant->FP_REGS.  We cannot support such operations, which
9416      use SP as source and an FP_REG as destination, so reject them
9417      outright.  */
9418 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9419 {
9420 rtx lhs = XEXP (x, 0);
9421
9422 /* Look through a possible SUBREG introduced by ILP32. */
9423 if (GET_CODE (lhs) == SUBREG)
9424 lhs = SUBREG_REG (lhs);
9425
9426 gcc_assert (REG_P (lhs));
9427 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9428 POINTER_REGS));
9429 return NO_REGS;
9430 }
9431
9432 return regclass;
9433 }
9434
9435 void
9436 aarch64_asm_output_labelref (FILE* f, const char *name)
9437 {
9438 asm_fprintf (f, "%U%s", name);
9439 }
9440
9441 static void
9442 aarch64_elf_asm_constructor (rtx symbol, int priority)
9443 {
9444 if (priority == DEFAULT_INIT_PRIORITY)
9445 default_ctor_section_asm_out_constructor (symbol, priority);
9446 else
9447 {
9448 section *s;
9449       /* Priority is known to be in the range [0, 65535], so 18 bytes
9450          would be enough, but the compiler might not know that.  To avoid
9451          a -Wformat-truncation false positive, use a larger size.  */
9452 char buf[23];
9453 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9454 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9455 switch_to_section (s);
9456 assemble_align (POINTER_SIZE);
9457 assemble_aligned_integer (POINTER_BYTES, symbol);
9458 }
9459 }
9460
9461 static void
9462 aarch64_elf_asm_destructor (rtx symbol, int priority)
9463 {
9464 if (priority == DEFAULT_INIT_PRIORITY)
9465 default_dtor_section_asm_out_destructor (symbol, priority);
9466 else
9467 {
9468 section *s;
9469       /* Priority is known to be in the range [0, 65535], so 18 bytes
9470          would be enough, but the compiler might not know that.  To avoid
9471          a -Wformat-truncation false positive, use a larger size.  */
9472 char buf[23];
9473 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9474 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9475 switch_to_section (s);
9476 assemble_align (POINTER_SIZE);
9477 assemble_aligned_integer (POINTER_BYTES, symbol);
9478 }
9479 }
9480
9481 const char*
9482 aarch64_output_casesi (rtx *operands)
9483 {
9484 char buf[100];
9485 char label[100];
9486 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9487 int index;
9488 static const char *const patterns[4][2] =
9489 {
9490 {
9491 "ldrb\t%w3, [%0,%w1,uxtw]",
9492 "add\t%3, %4, %w3, sxtb #2"
9493 },
9494 {
9495 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9496 "add\t%3, %4, %w3, sxth #2"
9497 },
9498 {
9499 "ldr\t%w3, [%0,%w1,uxtw #2]",
9500 "add\t%3, %4, %w3, sxtw #2"
9501 },
9502 /* We assume that DImode is only generated when not optimizing and
9503 that we don't really need 64-bit address offsets. That would
9504 imply an object file with 8GB of code in a single function! */
9505 {
9506 "ldr\t%w3, [%0,%w1,uxtw #2]",
9507 "add\t%3, %4, %w3, sxtw #2"
9508 }
9509 };
9510
9511 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9512
9513 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9514 index = exact_log2 (GET_MODE_SIZE (mode));
9515
9516 gcc_assert (index >= 0 && index <= 3);
9517
9518   /* Need to implement table size reduction, by changing the code below.  */
9519 output_asm_insn (patterns[index][0], operands);
9520 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9521 snprintf (buf, sizeof (buf),
9522 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9523 output_asm_insn (buf, operands);
9524 output_asm_insn (patterns[index][1], operands);
9525 output_asm_insn ("br\t%3", operands);
9526 assemble_label (asm_out_file, label);
9527 return "";
9528 }
9529
9530
9531 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9532 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9533 operator. */
9534
9535 int
9536 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9537 {
9538 if (shift >= 0 && shift <= 3)
9539 {
9540 int size;
9541 for (size = 8; size <= 32; size *= 2)
9542 {
9543 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9544 if (mask == bits << shift)
9545 return size;
9546 }
9547 }
9548 return 0;
9549 }
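
/* Editorial sketch (not part of aarch64.c): the same test on plain integers,
   with worked examples.  A mask qualifies when it is an 8-, 16- or 32-bit
   all-ones pattern shifted left by the 0..3 scaling amount, i.e. exactly what
   UXTB/UXTH/UXTW combined with an optional LSL can describe.  The sketch_*
   names are made up for illustration.  */
#include <assert.h>

static int
sketch_uxt_size (int shift, long long mask)
{
  if (shift >= 0 && shift <= 3)
    for (int size = 8; size <= 32; size *= 2)
      if (mask == (((1LL << size) - 1) << shift))
	return size;
  return 0;
}

static void
sketch_uxt_size_selfcheck (void)
{
  assert (sketch_uxt_size (1, 0x1fe) == 8);	/* UXTB, scaled by 2.  */
  assert (sketch_uxt_size (2, 0x3fffc) == 16);	/* UXTH, scaled by 4.  */
  assert (sketch_uxt_size (0, 0xff00) == 0);	/* Not a low mask: rejected.  */
}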
9550
9551 /* Constant pools are per-function only when PC-relative literal
9552    loads are enabled or we are using the large memory
9553    model.  */
9554
9555 static inline bool
9556 aarch64_can_use_per_function_literal_pools_p (void)
9557 {
9558 return (aarch64_pcrelative_literal_loads
9559 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9560 }
9561
9562 static bool
9563 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9564 {
9565 /* We can't use blocks for constants when we're using a per-function
9566 constant pool. */
9567 return !aarch64_can_use_per_function_literal_pools_p ();
9568 }
9569
9570 /* Select appropriate section for constants depending
9571 on where we place literal pools. */
9572
9573 static section *
9574 aarch64_select_rtx_section (machine_mode mode,
9575 rtx x,
9576 unsigned HOST_WIDE_INT align)
9577 {
9578 if (aarch64_can_use_per_function_literal_pools_p ())
9579 return function_section (current_function_decl);
9580
9581 return default_elf_select_rtx_section (mode, x, align);
9582 }
9583
9584 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9585 void
9586 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9587 HOST_WIDE_INT offset)
9588 {
9589 /* When using per-function literal pools, we must ensure that any code
9590 section is aligned to the minimal instruction length, lest we get
9591 errors from the assembler re "unaligned instructions". */
9592 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9593 ASM_OUTPUT_ALIGN (f, 2);
9594 }
9595
9596 /* Costs. */
9597
9598 /* Helper function for rtx cost calculation. Strip a shift expression
9599 from X. Returns the inner operand if successful, or the original
9600 expression on failure. */
9601 static rtx
9602 aarch64_strip_shift (rtx x)
9603 {
9604 rtx op = x;
9605
9606 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9607 we can convert both to ROR during final output. */
9608 if ((GET_CODE (op) == ASHIFT
9609 || GET_CODE (op) == ASHIFTRT
9610 || GET_CODE (op) == LSHIFTRT
9611 || GET_CODE (op) == ROTATERT
9612 || GET_CODE (op) == ROTATE)
9613 && CONST_INT_P (XEXP (op, 1)))
9614 return XEXP (op, 0);
9615
9616 if (GET_CODE (op) == MULT
9617 && CONST_INT_P (XEXP (op, 1))
9618 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9619 return XEXP (op, 0);
9620
9621 return x;
9622 }
9623
9624 /* Helper function for rtx cost calculation. Strip an extend
9625 expression from X. Returns the inner operand if successful, or the
9626 original expression on failure. We deal with a number of possible
9627 canonicalization variations here. If STRIP_SHIFT is true, then
9628 we can strip off a shift also. */
9629 static rtx
9630 aarch64_strip_extend (rtx x, bool strip_shift)
9631 {
9632 scalar_int_mode mode;
9633 rtx op = x;
9634
9635 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9636 return op;
9637
9638 /* Zero and sign extraction of a widened value. */
9639 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9640 && XEXP (op, 2) == const0_rtx
9641 && GET_CODE (XEXP (op, 0)) == MULT
9642 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9643 XEXP (op, 1)))
9644 return XEXP (XEXP (op, 0), 0);
9645
9646 /* It can also be represented (for zero-extend) as an AND with an
9647 immediate. */
9648 if (GET_CODE (op) == AND
9649 && GET_CODE (XEXP (op, 0)) == MULT
9650 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9651 && CONST_INT_P (XEXP (op, 1))
9652 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9653 INTVAL (XEXP (op, 1))) != 0)
9654 return XEXP (XEXP (op, 0), 0);
9655
9656 /* Now handle extended register, as this may also have an optional
9657 left shift by 1..4. */
9658 if (strip_shift
9659 && GET_CODE (op) == ASHIFT
9660 && CONST_INT_P (XEXP (op, 1))
9661 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9662 op = XEXP (op, 0);
9663
9664 if (GET_CODE (op) == ZERO_EXTEND
9665 || GET_CODE (op) == SIGN_EXTEND)
9666 op = XEXP (op, 0);
9667
9668 if (op != x)
9669 return op;
9670
9671 return x;
9672 }
9673
9674 /* Return true iff CODE is a shift supported in combination
9675 with arithmetic instructions. */
9676
9677 static bool
9678 aarch64_shift_p (enum rtx_code code)
9679 {
9680 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9681 }
9682
9683
9684 /* Return true iff X is a cheap shift without a sign extend. */
9685
9686 static bool
9687 aarch64_cheap_mult_shift_p (rtx x)
9688 {
9689 rtx op0, op1;
9690
9691 op0 = XEXP (x, 0);
9692 op1 = XEXP (x, 1);
9693
9694 if (!(aarch64_tune_params.extra_tuning_flags
9695 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9696 return false;
9697
9698 if (GET_CODE (op0) == SIGN_EXTEND)
9699 return false;
9700
9701 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9702 && UINTVAL (op1) <= 4)
9703 return true;
9704
9705 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9706 return false;
9707
9708 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9709
9710 if (l2 > 0 && l2 <= 4)
9711 return true;
9712
9713 return false;
9714 }
9715
9716 /* Helper function for rtx cost calculation. Calculate the cost of
9717 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9718 Return the calculated cost of the expression, recursing manually in to
9719 operands where needed. */
9720
9721 static int
9722 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9723 {
9724 rtx op0, op1;
9725 const struct cpu_cost_table *extra_cost
9726 = aarch64_tune_params.insn_extra_cost;
9727 int cost = 0;
9728 bool compound_p = (outer == PLUS || outer == MINUS);
9729 machine_mode mode = GET_MODE (x);
9730
9731 gcc_checking_assert (code == MULT);
9732
9733 op0 = XEXP (x, 0);
9734 op1 = XEXP (x, 1);
9735
9736 if (VECTOR_MODE_P (mode))
9737 mode = GET_MODE_INNER (mode);
9738
9739 /* Integer multiply/fma. */
9740 if (GET_MODE_CLASS (mode) == MODE_INT)
9741 {
9742 /* The multiply will be canonicalized as a shift, cost it as such. */
9743 if (aarch64_shift_p (GET_CODE (x))
9744 || (CONST_INT_P (op1)
9745 && exact_log2 (INTVAL (op1)) > 0))
9746 {
9747 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9748 || GET_CODE (op0) == SIGN_EXTEND;
9749 if (speed)
9750 {
9751 if (compound_p)
9752 {
9753 /* If the shift is considered cheap,
9754 then don't add any cost. */
9755 if (aarch64_cheap_mult_shift_p (x))
9756 ;
9757 else if (REG_P (op1))
9758 /* ARITH + shift-by-register. */
9759 cost += extra_cost->alu.arith_shift_reg;
9760 else if (is_extend)
9761 /* ARITH + extended register. We don't have a cost field
9762 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9763 cost += extra_cost->alu.extend_arith;
9764 else
9765 /* ARITH + shift-by-immediate. */
9766 cost += extra_cost->alu.arith_shift;
9767 }
9768 else
9769 /* LSL (immediate). */
9770 cost += extra_cost->alu.shift;
9771
9772 }
9773 /* Strip extends as we will have costed them in the case above. */
9774 if (is_extend)
9775 op0 = aarch64_strip_extend (op0, true);
9776
9777 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9778
9779 return cost;
9780 }
9781
9782 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9783 compound and let the below cases handle it. After all, MNEG is a
9784 special-case alias of MSUB. */
9785 if (GET_CODE (op0) == NEG)
9786 {
9787 op0 = XEXP (op0, 0);
9788 compound_p = true;
9789 }
9790
9791 /* Integer multiplies or FMAs have zero/sign extending variants. */
9792 if ((GET_CODE (op0) == ZERO_EXTEND
9793 && GET_CODE (op1) == ZERO_EXTEND)
9794 || (GET_CODE (op0) == SIGN_EXTEND
9795 && GET_CODE (op1) == SIGN_EXTEND))
9796 {
9797 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9798 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9799
9800 if (speed)
9801 {
9802 if (compound_p)
9803 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9804 cost += extra_cost->mult[0].extend_add;
9805 else
9806 /* MUL/SMULL/UMULL. */
9807 cost += extra_cost->mult[0].extend;
9808 }
9809
9810 return cost;
9811 }
9812
9813 /* This is either an integer multiply or a MADD. In both cases
9814 we want to recurse and cost the operands. */
9815 cost += rtx_cost (op0, mode, MULT, 0, speed);
9816 cost += rtx_cost (op1, mode, MULT, 1, speed);
9817
9818 if (speed)
9819 {
9820 if (compound_p)
9821 /* MADD/MSUB. */
9822 cost += extra_cost->mult[mode == DImode].add;
9823 else
9824 /* MUL. */
9825 cost += extra_cost->mult[mode == DImode].simple;
9826 }
9827
9828 return cost;
9829 }
9830 else
9831 {
9832 if (speed)
9833 {
9834 /* Floating-point FMA/FMUL can also support negations of the
9835 operands, unless the rounding mode is upward or downward in
9836 	     which case FNMUL is different from FMUL with operand negation.  */
9837 bool neg0 = GET_CODE (op0) == NEG;
9838 bool neg1 = GET_CODE (op1) == NEG;
9839 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9840 {
9841 if (neg0)
9842 op0 = XEXP (op0, 0);
9843 if (neg1)
9844 op1 = XEXP (op1, 0);
9845 }
9846
9847 if (compound_p)
9848 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9849 cost += extra_cost->fp[mode == DFmode].fma;
9850 else
9851 /* FMUL/FNMUL. */
9852 cost += extra_cost->fp[mode == DFmode].mult;
9853 }
9854
9855 cost += rtx_cost (op0, mode, MULT, 0, speed);
9856 cost += rtx_cost (op1, mode, MULT, 1, speed);
9857 return cost;
9858 }
9859 }
9860
9861 static int
9862 aarch64_address_cost (rtx x,
9863 machine_mode mode,
9864 addr_space_t as ATTRIBUTE_UNUSED,
9865 bool speed)
9866 {
9867 enum rtx_code c = GET_CODE (x);
9868 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9869 struct aarch64_address_info info;
9870 int cost = 0;
9871 info.shift = 0;
9872
9873 if (!aarch64_classify_address (&info, x, mode, false))
9874 {
9875 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9876 {
9877 /* This is a CONST or SYMBOL ref which will be split
9878 in a different way depending on the code model in use.
9879 Cost it through the generic infrastructure. */
9880 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9881 /* Divide through by the cost of one instruction to
9882 bring it to the same units as the address costs. */
9883 cost_symbol_ref /= COSTS_N_INSNS (1);
9884 /* The cost is then the cost of preparing the address,
9885 followed by an immediate (possibly 0) offset. */
9886 return cost_symbol_ref + addr_cost->imm_offset;
9887 }
9888 else
9889 {
9890 /* This is most likely a jump table from a case
9891 statement. */
9892 return addr_cost->register_offset;
9893 }
9894 }
9895
9896 switch (info.type)
9897 {
9898 case ADDRESS_LO_SUM:
9899 case ADDRESS_SYMBOLIC:
9900 case ADDRESS_REG_IMM:
9901 cost += addr_cost->imm_offset;
9902 break;
9903
9904 case ADDRESS_REG_WB:
9905 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9906 cost += addr_cost->pre_modify;
9907 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9908 cost += addr_cost->post_modify;
9909 else
9910 gcc_unreachable ();
9911
9912 break;
9913
9914 case ADDRESS_REG_REG:
9915 cost += addr_cost->register_offset;
9916 break;
9917
9918 case ADDRESS_REG_SXTW:
9919 cost += addr_cost->register_sextend;
9920 break;
9921
9922 case ADDRESS_REG_UXTW:
9923 cost += addr_cost->register_zextend;
9924 break;
9925
9926 default:
9927 gcc_unreachable ();
9928 }
9929
9930
9931 if (info.shift > 0)
9932 {
9933 /* For the sake of calculating the cost of the shifted register
9934 component, we can treat same sized modes in the same way. */
9935 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9936 cost += addr_cost->addr_scale_costs.hi;
9937 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9938 cost += addr_cost->addr_scale_costs.si;
9939 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9940 cost += addr_cost->addr_scale_costs.di;
9941 else
9942 /* We can't tell, or this is a 128-bit vector. */
9943 cost += addr_cost->addr_scale_costs.ti;
9944 }
9945
9946 return cost;
9947 }
9948
9949 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9950 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9951 to be taken. */
9952
9953 int
9954 aarch64_branch_cost (bool speed_p, bool predictable_p)
9955 {
9956 /* When optimizing for speed, use the cost of unpredictable branches. */
9957 const struct cpu_branch_cost *branch_costs =
9958 aarch64_tune_params.branch_costs;
9959
9960 if (!speed_p || predictable_p)
9961 return branch_costs->predictable;
9962 else
9963 return branch_costs->unpredictable;
9964 }
9965
9966 /* Return true if the RTX X in mode MODE is a zero or sign extract
9967 usable in an ADD or SUB (extended register) instruction. */
9968 static bool
9969 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9970 {
9971 /* Catch add with a sign extract.
9972 This is add_<optab><mode>_multp2. */
9973 if (GET_CODE (x) == SIGN_EXTRACT
9974 || GET_CODE (x) == ZERO_EXTRACT)
9975 {
9976 rtx op0 = XEXP (x, 0);
9977 rtx op1 = XEXP (x, 1);
9978 rtx op2 = XEXP (x, 2);
9979
9980 if (GET_CODE (op0) == MULT
9981 && CONST_INT_P (op1)
9982 && op2 == const0_rtx
9983 && CONST_INT_P (XEXP (op0, 1))
9984 && aarch64_is_extend_from_extract (mode,
9985 XEXP (op0, 1),
9986 op1))
9987 {
9988 return true;
9989 }
9990 }
9991 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9992 No shift. */
9993 else if (GET_CODE (x) == SIGN_EXTEND
9994 || GET_CODE (x) == ZERO_EXTEND)
9995 return REG_P (XEXP (x, 0));
9996
9997 return false;
9998 }
9999
10000 static bool
10001 aarch64_frint_unspec_p (unsigned int u)
10002 {
10003 switch (u)
10004 {
10005 case UNSPEC_FRINTZ:
10006 case UNSPEC_FRINTP:
10007 case UNSPEC_FRINTM:
10008 case UNSPEC_FRINTA:
10009 case UNSPEC_FRINTN:
10010 case UNSPEC_FRINTX:
10011 case UNSPEC_FRINTI:
10012 return true;
10013
10014 default:
10015 return false;
10016 }
10017 }
10018
10019 /* Return true iff X is an rtx that will match an extr instruction
10020 i.e. as described in the *extr<mode>5_insn family of patterns.
10021 OP0 and OP1 will be set to the operands of the shifts involved
10022 on success and will be NULL_RTX otherwise. */
10023
10024 static bool
10025 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10026 {
10027 rtx op0, op1;
10028 scalar_int_mode mode;
10029 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10030 return false;
10031
10032 *res_op0 = NULL_RTX;
10033 *res_op1 = NULL_RTX;
10034
10035 if (GET_CODE (x) != IOR)
10036 return false;
10037
10038 op0 = XEXP (x, 0);
10039 op1 = XEXP (x, 1);
10040
10041 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10042 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10043 {
10044 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10045 if (GET_CODE (op1) == ASHIFT)
10046 std::swap (op0, op1);
10047
10048 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10049 return false;
10050
10051 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10052 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10053
10054 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10055 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10056 {
10057 *res_op0 = XEXP (op0, 0);
10058 *res_op1 = XEXP (op1, 0);
10059 return true;
10060 }
10061 }
10062
10063 return false;
10064 }
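
/* Editorial sketch (not part of aarch64.c): the identity that makes the
   IOR-of-opposite-shifts form above match EXTR, checked numerically on
   32-bit values.  The sketch_* names are made up for illustration.  */
#include <assert.h>
#include <stdint.h>

/* EXTR on W registers: shift the 64-bit concatenation HI:LO right by LSB
   bits and keep the low 32.  */
static uint32_t
sketch_extr32 (uint32_t hi, uint32_t lo, unsigned int lsb)
{
  return (uint32_t) ((((uint64_t) hi << 32) | lo) >> lsb);
}

static void
sketch_extr_selfcheck (void)
{
  uint32_t x = 0x12345678, y = 0x9abcdef0;
  unsigned int a = 12;
  /* (x << a) | (y >> (32 - a)) is EXTR with lsb = 32 - a ...  */
  assert ((uint32_t) ((x << a) | (y >> (32 - a))) == sketch_extr32 (x, y, 32 - a));
  /* ... and with both inputs equal it degenerates into ROR #(32 - a).  */
  assert ((uint32_t) ((x << a) | (x >> (32 - a))) == sketch_extr32 (x, x, 32 - a));
}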
10065
10066 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10067 storing it in *COST. Result is true if the total cost of the operation
10068 has now been calculated. */
10069 static bool
10070 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10071 {
10072 rtx inner;
10073 rtx comparator;
10074 enum rtx_code cmpcode;
10075
10076 if (COMPARISON_P (op0))
10077 {
10078 inner = XEXP (op0, 0);
10079 comparator = XEXP (op0, 1);
10080 cmpcode = GET_CODE (op0);
10081 }
10082 else
10083 {
10084 inner = op0;
10085 comparator = const0_rtx;
10086 cmpcode = NE;
10087 }
10088
10089 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10090 {
10091 /* Conditional branch. */
10092 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10093 return true;
10094 else
10095 {
10096 if (cmpcode == NE || cmpcode == EQ)
10097 {
10098 if (comparator == const0_rtx)
10099 {
10100 /* TBZ/TBNZ/CBZ/CBNZ. */
10101 if (GET_CODE (inner) == ZERO_EXTRACT)
10102 /* TBZ/TBNZ. */
10103 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10104 ZERO_EXTRACT, 0, speed);
10105 else
10106 /* CBZ/CBNZ. */
10107 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10108
10109 return true;
10110 }
10111 }
10112 else if (cmpcode == LT || cmpcode == GE)
10113 {
10114 /* TBZ/TBNZ. */
10115 if (comparator == const0_rtx)
10116 return true;
10117 }
10118 }
10119 }
10120 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10121 {
10122 /* CCMP. */
10123 if (GET_CODE (op1) == COMPARE)
10124 {
10125 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10126 if (XEXP (op1, 1) == const0_rtx)
10127 *cost += 1;
10128 if (speed)
10129 {
10130 machine_mode mode = GET_MODE (XEXP (op1, 0));
10131 const struct cpu_cost_table *extra_cost
10132 = aarch64_tune_params.insn_extra_cost;
10133
10134 if (GET_MODE_CLASS (mode) == MODE_INT)
10135 *cost += extra_cost->alu.arith;
10136 else
10137 *cost += extra_cost->fp[mode == DFmode].compare;
10138 }
10139 return true;
10140 }
10141
10142 /* It's a conditional operation based on the status flags,
10143 so it must be some flavor of CSEL. */
10144
10145 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10146 if (GET_CODE (op1) == NEG
10147 || GET_CODE (op1) == NOT
10148 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10149 op1 = XEXP (op1, 0);
10150 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10151 {
10152 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10153 op1 = XEXP (op1, 0);
10154 op2 = XEXP (op2, 0);
10155 }
10156
10157 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10158 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10159 return true;
10160 }
10161
10162 /* We don't know what this is, cost all operands. */
10163 return false;
10164 }
10165
10166 /* Check whether X is a bitfield operation of the form shift + extend that
10167 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10168 operand to which the bitfield operation is applied. Otherwise return
10169 NULL_RTX. */
10170
10171 static rtx
10172 aarch64_extend_bitfield_pattern_p (rtx x)
10173 {
10174 rtx_code outer_code = GET_CODE (x);
10175 machine_mode outer_mode = GET_MODE (x);
10176
10177 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10178 && outer_mode != SImode && outer_mode != DImode)
10179 return NULL_RTX;
10180
10181 rtx inner = XEXP (x, 0);
10182 rtx_code inner_code = GET_CODE (inner);
10183 machine_mode inner_mode = GET_MODE (inner);
10184 rtx op = NULL_RTX;
10185
10186 switch (inner_code)
10187 {
10188 case ASHIFT:
10189 if (CONST_INT_P (XEXP (inner, 1))
10190 && (inner_mode == QImode || inner_mode == HImode))
10191 op = XEXP (inner, 0);
10192 break;
10193 case LSHIFTRT:
10194 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10195 && (inner_mode == QImode || inner_mode == HImode))
10196 op = XEXP (inner, 0);
10197 break;
10198 case ASHIFTRT:
10199 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10200 && (inner_mode == QImode || inner_mode == HImode))
10201 op = XEXP (inner, 0);
10202 break;
10203 default:
10204 break;
10205 }
10206
10207 return op;
10208 }
10209
10210 /* Return true if the mask and a shift amount from an RTX of the form
10211 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10212 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10213
10214 bool
10215 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10216 rtx shft_amnt)
10217 {
10218 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10219 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10220 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10221 && (INTVAL (mask)
10222 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10223 }
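
/* Editorial sketch (not part of aarch64.c): the UBFIZ legality test above on
   plain 32-bit values.  The mask must have no bits below SHIFT and must be a
   contiguous run of ones starting at bit SHIFT, i.e. it covers exactly the
   bits that (x << SHIFT) can populate from a narrower field.  The sketch_*
   names are made up for illustration.  */
#include <assert.h>
#include <stdint.h>

static int
sketch_ubfiz_ok32 (uint32_t mask, unsigned int shift)
{
  if (shift >= 32)
    return 0;
  if ((mask & ((1u << shift) - 1)) != 0)	/* Bits below the shift point.  */
    return 0;
  uint32_t t = (mask >> shift) + 1;		/* Power of two iff contiguous.  */
  return t != 0 && (t & (t - 1)) == 0;
}

static void
sketch_ubfiz_selfcheck (void)
{
  assert (sketch_ubfiz_ok32 (0x3f0, 4));	/* UBFIZ width 6, lsb 4.  */
  assert (!sketch_ubfiz_ok32 (0x3f8, 4));	/* Stray bit below the shift.  */
  assert (!sketch_ubfiz_ok32 (0x5f0, 4));	/* Not contiguous.  */
}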
10224
10225 /* Return true if the masks and a shift amount from an RTX of the form
10226 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10227    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
10228
10229 bool
10230 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10231 unsigned HOST_WIDE_INT mask1,
10232 unsigned HOST_WIDE_INT shft_amnt,
10233 unsigned HOST_WIDE_INT mask2)
10234 {
10235 unsigned HOST_WIDE_INT t;
10236
10237 /* Verify that there is no overlap in what bits are set in the two masks. */
10238 if (mask1 != ~mask2)
10239 return false;
10240
10241 /* Verify that mask2 is not all zeros or ones. */
10242 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10243 return false;
10244
10245 /* The shift amount should always be less than the mode size. */
10246 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10247
10248 /* Verify that the mask being shifted is contiguous and would be in the
10249 least significant bits after shifting by shft_amnt. */
10250 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10251 return (t == (t & -t));
10252 }
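
/* Editorial sketch (not part of aarch64.c): why the final power-of-two test
   above proves MASK2 is a contiguous field based at bit SHFT_AMNT.  Adding
   1 << SHFT_AMNT to such a mask carries all the way up to a single set bit,
   while any hole or stray bit leaves more than one bit set.  The sketch_*
   names are made up for illustration.  */
#include <assert.h>
#include <stdint.h>

static int
sketch_contiguous_from (uint64_t mask2, unsigned int shft_amnt)
{
  uint64_t t = mask2 + ((uint64_t) 1 << shft_amnt);
  return t == (t & -t);				/* At most one bit set.  */
}

static void
sketch_bfi_selfcheck (void)
{
  assert (sketch_contiguous_from (0x0ff0, 4));	/* 8-bit field at lsb 4: BFI.  */
  assert (!sketch_contiguous_from (0x0f0f, 4));	/* Bits below lsb 4.  */
  assert (!sketch_contiguous_from (0x0d70, 4));	/* Hole in the field.  */
}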
10253
10254 /* Calculate the cost of calculating X, storing it in *COST. Result
10255 is true if the total cost of the operation has now been calculated. */
10256 static bool
10257 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10258 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10259 {
10260 rtx op0, op1, op2;
10261 const struct cpu_cost_table *extra_cost
10262 = aarch64_tune_params.insn_extra_cost;
10263 int code = GET_CODE (x);
10264 scalar_int_mode int_mode;
10265
10266 /* By default, assume that everything has equivalent cost to the
10267 cheapest instruction. Any additional costs are applied as a delta
10268 above this default. */
10269 *cost = COSTS_N_INSNS (1);
10270
10271 switch (code)
10272 {
10273 case SET:
10274 /* The cost depends entirely on the operands to SET. */
10275 *cost = 0;
10276 op0 = SET_DEST (x);
10277 op1 = SET_SRC (x);
10278
10279 switch (GET_CODE (op0))
10280 {
10281 case MEM:
10282 if (speed)
10283 {
10284 rtx address = XEXP (op0, 0);
10285 if (VECTOR_MODE_P (mode))
10286 *cost += extra_cost->ldst.storev;
10287 else if (GET_MODE_CLASS (mode) == MODE_INT)
10288 *cost += extra_cost->ldst.store;
10289 else if (mode == SFmode)
10290 *cost += extra_cost->ldst.storef;
10291 else if (mode == DFmode)
10292 *cost += extra_cost->ldst.stored;
10293
10294 *cost +=
10295 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10296 0, speed));
10297 }
10298
10299 *cost += rtx_cost (op1, mode, SET, 1, speed);
10300 return true;
10301
10302 case SUBREG:
10303 if (! REG_P (SUBREG_REG (op0)))
10304 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10305
10306 /* Fall through. */
10307 case REG:
10308 /* The cost is one per vector-register copied. */
10309 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10310 {
10311 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10312 *cost = COSTS_N_INSNS (nregs);
10313 }
10314 /* const0_rtx is in general free, but we will use an
10315 instruction to set a register to 0. */
10316 else if (REG_P (op1) || op1 == const0_rtx)
10317 {
10318 /* The cost is 1 per register copied. */
10319 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10320 *cost = COSTS_N_INSNS (nregs);
10321 }
10322 else
10323 /* Cost is just the cost of the RHS of the set. */
10324 *cost += rtx_cost (op1, mode, SET, 1, speed);
10325 return true;
10326
10327 case ZERO_EXTRACT:
10328 case SIGN_EXTRACT:
10329 /* Bit-field insertion. Strip any redundant widening of
10330 the RHS to meet the width of the target. */
10331 if (GET_CODE (op1) == SUBREG)
10332 op1 = SUBREG_REG (op1);
10333 if ((GET_CODE (op1) == ZERO_EXTEND
10334 || GET_CODE (op1) == SIGN_EXTEND)
10335 && CONST_INT_P (XEXP (op0, 1))
10336 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10337 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10338 op1 = XEXP (op1, 0);
10339
10340 if (CONST_INT_P (op1))
10341 {
10342 /* MOV immediate is assumed to always be cheap. */
10343 *cost = COSTS_N_INSNS (1);
10344 }
10345 else
10346 {
10347 /* BFM. */
10348 if (speed)
10349 *cost += extra_cost->alu.bfi;
10350 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10351 }
10352
10353 return true;
10354
10355 default:
10356 /* We can't make sense of this, assume default cost. */
10357 *cost = COSTS_N_INSNS (1);
10358 return false;
10359 }
10360 return false;
10361
10362 case CONST_INT:
10363 /* If an instruction can incorporate a constant within the
10364 instruction, the instruction's expression avoids calling
10365 rtx_cost() on the constant. If rtx_cost() is called on a
10366 constant, then it is usually because the constant must be
10367 moved into a register by one or more instructions.
10368
10369 The exception is constant 0, which can be expressed
10370 	 as XZR/WZR and is therefore free.  The exception to that, in turn,
10371 	 is a (set (reg) (const0_rtx)), in which case we must cost
10372 the move. However, we can catch that when we cost the SET, so
10373 we don't need to consider that here. */
10374 if (x == const0_rtx)
10375 *cost = 0;
10376 else
10377 {
10378 /* To an approximation, building any other constant is
10379 proportionally expensive to the number of instructions
10380 required to build that constant. This is true whether we
10381 are compiling for SPEED or otherwise. */
10382 if (!is_a <scalar_int_mode> (mode, &int_mode))
10383 int_mode = word_mode;
10384 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10385 (NULL_RTX, x, false, int_mode));
10386 }
10387 return true;
10388
10389 case CONST_DOUBLE:
10390
10391 /* First determine number of instructions to do the move
10392 as an integer constant. */
10393 if (!aarch64_float_const_representable_p (x)
10394 && !aarch64_can_const_movi_rtx_p (x, mode)
10395 && aarch64_float_const_rtx_p (x))
10396 {
10397 unsigned HOST_WIDE_INT ival;
10398 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10399 gcc_assert (succeed);
10400
10401 scalar_int_mode imode = (mode == HFmode
10402 ? SImode
10403 : int_mode_for_mode (mode).require ());
10404 int ncost = aarch64_internal_mov_immediate
10405 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10406 *cost += COSTS_N_INSNS (ncost);
10407 return true;
10408 }
10409
10410 if (speed)
10411 {
10412 /* mov[df,sf]_aarch64. */
10413 if (aarch64_float_const_representable_p (x))
10414 /* FMOV (scalar immediate). */
10415 *cost += extra_cost->fp[mode == DFmode].fpconst;
10416 else if (!aarch64_float_const_zero_rtx_p (x))
10417 {
10418 /* This will be a load from memory. */
10419 if (mode == DFmode)
10420 *cost += extra_cost->ldst.loadd;
10421 else
10422 *cost += extra_cost->ldst.loadf;
10423 }
10424 else
10425 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10426 	       or MOV v0.s[0], wzr - neither of which is modeled by the
10427 cost tables. Just use the default cost. */
10428 {
10429 }
10430 }
10431
10432 return true;
10433
10434 case MEM:
10435 if (speed)
10436 {
10437 /* For loads we want the base cost of a load, plus an
10438 approximation for the additional cost of the addressing
10439 mode. */
10440 rtx address = XEXP (x, 0);
10441 if (VECTOR_MODE_P (mode))
10442 *cost += extra_cost->ldst.loadv;
10443 else if (GET_MODE_CLASS (mode) == MODE_INT)
10444 *cost += extra_cost->ldst.load;
10445 else if (mode == SFmode)
10446 *cost += extra_cost->ldst.loadf;
10447 else if (mode == DFmode)
10448 *cost += extra_cost->ldst.loadd;
10449
10450 *cost +=
10451 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10452 0, speed));
10453 }
10454
10455 return true;
10456
10457 case NEG:
10458 op0 = XEXP (x, 0);
10459
10460 if (VECTOR_MODE_P (mode))
10461 {
10462 if (speed)
10463 {
10464 /* FNEG. */
10465 *cost += extra_cost->vect.alu;
10466 }
10467 return false;
10468 }
10469
10470 if (GET_MODE_CLASS (mode) == MODE_INT)
10471 {
10472 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10473 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10474 {
10475 /* CSETM. */
10476 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10477 return true;
10478 }
10479
10480 /* Cost this as SUB wzr, X. */
10481 op0 = CONST0_RTX (mode);
10482 op1 = XEXP (x, 0);
10483 goto cost_minus;
10484 }
10485
10486 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10487 {
10488 /* Support (neg(fma...)) as a single instruction only if
10489 sign of zeros is unimportant. This matches the decision
10490 making in aarch64.md. */
10491 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10492 {
10493 /* FNMADD. */
10494 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10495 return true;
10496 }
10497 if (GET_CODE (op0) == MULT)
10498 {
10499 /* FNMUL. */
10500 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10501 return true;
10502 }
10503 if (speed)
10504 /* FNEG. */
10505 *cost += extra_cost->fp[mode == DFmode].neg;
10506 return false;
10507 }
10508
10509 return false;
10510
10511 case CLRSB:
10512 case CLZ:
10513 if (speed)
10514 {
10515 if (VECTOR_MODE_P (mode))
10516 *cost += extra_cost->vect.alu;
10517 else
10518 *cost += extra_cost->alu.clz;
10519 }
10520
10521 return false;
10522
10523 case COMPARE:
10524 op0 = XEXP (x, 0);
10525 op1 = XEXP (x, 1);
10526
10527 if (op1 == const0_rtx
10528 && GET_CODE (op0) == AND)
10529 {
10530 x = op0;
10531 mode = GET_MODE (op0);
10532 goto cost_logic;
10533 }
10534
10535 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10536 {
10537 /* TODO: A write to the CC flags possibly costs extra, this
10538 needs encoding in the cost tables. */
10539
10540 mode = GET_MODE (op0);
10541 /* ANDS. */
10542 if (GET_CODE (op0) == AND)
10543 {
10544 x = op0;
10545 goto cost_logic;
10546 }
10547
10548 if (GET_CODE (op0) == PLUS)
10549 {
10550 /* ADDS (and CMN alias). */
10551 x = op0;
10552 goto cost_plus;
10553 }
10554
10555 if (GET_CODE (op0) == MINUS)
10556 {
10557 /* SUBS. */
10558 x = op0;
10559 goto cost_minus;
10560 }
10561
10562 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10563 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10564 && CONST_INT_P (XEXP (op0, 2)))
10565 {
10566 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10567 Handle it here directly rather than going to cost_logic
10568 since we know the immediate generated for the TST is valid
10569 so we can avoid creating an intermediate rtx for it only
10570 for costing purposes. */
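	      /* For illustration (hypothetical operands): a compare such as
		 (compare:CC_NZ (zero_extract:DI (reg) (const_int 8)
						 (const_int 0))
				(const_int 0))
		 corresponds to TST x, #0xff.  */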
10571 if (speed)
10572 *cost += extra_cost->alu.logical;
10573
10574 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10575 ZERO_EXTRACT, 0, speed);
10576 return true;
10577 }
10578
10579 if (GET_CODE (op1) == NEG)
10580 {
10581 /* CMN. */
10582 if (speed)
10583 *cost += extra_cost->alu.arith;
10584
10585 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10586 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10587 return true;
10588 }
10589
10590 /* CMP.
10591
10592 Compare can freely swap the order of operands, and
10593 canonicalization puts the more complex operation first.
10594 But the integer MINUS logic expects the shift/extend
10595 operation in op1. */
10596 if (! (REG_P (op0)
10597 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10598 {
10599 op0 = XEXP (x, 1);
10600 op1 = XEXP (x, 0);
10601 }
10602 goto cost_minus;
10603 }
10604
10605 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10606 {
10607 /* FCMP. */
10608 if (speed)
10609 *cost += extra_cost->fp[mode == DFmode].compare;
10610
10611 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10612 {
10613 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10614 /* FCMP supports constant 0.0 for no extra cost. */
10615 return true;
10616 }
10617 return false;
10618 }
10619
10620 if (VECTOR_MODE_P (mode))
10621 {
10622 /* Vector compare. */
10623 if (speed)
10624 *cost += extra_cost->vect.alu;
10625
10626 if (aarch64_float_const_zero_rtx_p (op1))
10627 {
10628 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10629 cost. */
10630 return true;
10631 }
10632 return false;
10633 }
10634 return false;
10635
10636 case MINUS:
10637 {
10638 op0 = XEXP (x, 0);
10639 op1 = XEXP (x, 1);
10640
10641 cost_minus:
10642 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10643
10644 /* Detect valid immediates. */
10645 if ((GET_MODE_CLASS (mode) == MODE_INT
10646 || (GET_MODE_CLASS (mode) == MODE_CC
10647 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10648 && CONST_INT_P (op1)
10649 && aarch64_uimm12_shift (INTVAL (op1)))
10650 {
10651 if (speed)
10652 /* SUB(S) (immediate). */
10653 *cost += extra_cost->alu.arith;
10654 return true;
10655 }
10656
10657 /* Look for SUB (extended register). */
10658 if (is_a <scalar_int_mode> (mode, &int_mode)
10659 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10660 {
10661 if (speed)
10662 *cost += extra_cost->alu.extend_arith;
10663
10664 op1 = aarch64_strip_extend (op1, true);
10665 *cost += rtx_cost (op1, VOIDmode,
10666 (enum rtx_code) GET_CODE (op1), 0, speed);
10667 return true;
10668 }
10669
10670 rtx new_op1 = aarch64_strip_extend (op1, false);
10671
10672 /* Cost this as an FMA-alike operation. */
10673 if ((GET_CODE (new_op1) == MULT
10674 || aarch64_shift_p (GET_CODE (new_op1)))
10675 && code != COMPARE)
10676 {
10677 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10678 (enum rtx_code) code,
10679 speed);
10680 return true;
10681 }
10682
10683 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10684
10685 if (speed)
10686 {
10687 if (VECTOR_MODE_P (mode))
10688 {
10689 /* Vector SUB. */
10690 *cost += extra_cost->vect.alu;
10691 }
10692 else if (GET_MODE_CLASS (mode) == MODE_INT)
10693 {
10694 /* SUB(S). */
10695 *cost += extra_cost->alu.arith;
10696 }
10697 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10698 {
10699 /* FSUB. */
10700 *cost += extra_cost->fp[mode == DFmode].addsub;
10701 }
10702 }
10703 return true;
10704 }
10705
10706 case PLUS:
10707 {
10708 rtx new_op0;
10709
10710 op0 = XEXP (x, 0);
10711 op1 = XEXP (x, 1);
10712
10713 cost_plus:
10714 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10715 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10716 {
10717 /* CSINC. */
10718 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10719 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10720 return true;
10721 }
10722
10723 if (GET_MODE_CLASS (mode) == MODE_INT
10724 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10725 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10726 {
10727 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10728
10729 if (speed)
10730 /* ADD (immediate). */
10731 *cost += extra_cost->alu.arith;
10732 return true;
10733 }
10734
10735 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10736
10737 /* Look for ADD (extended register). */
10738 if (is_a <scalar_int_mode> (mode, &int_mode)
10739 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10740 {
10741 if (speed)
10742 *cost += extra_cost->alu.extend_arith;
10743
10744 op0 = aarch64_strip_extend (op0, true);
10745 *cost += rtx_cost (op0, VOIDmode,
10746 (enum rtx_code) GET_CODE (op0), 0, speed);
10747 return true;
10748 }
10749
10750 /* Strip any extend, leave shifts behind as we will
10751 cost them through mult_cost. */
10752 new_op0 = aarch64_strip_extend (op0, false);
10753
10754 if (GET_CODE (new_op0) == MULT
10755 || aarch64_shift_p (GET_CODE (new_op0)))
10756 {
10757 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10758 speed);
10759 return true;
10760 }
10761
10762 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10763
10764 if (speed)
10765 {
10766 if (VECTOR_MODE_P (mode))
10767 {
10768 /* Vector ADD. */
10769 *cost += extra_cost->vect.alu;
10770 }
10771 else if (GET_MODE_CLASS (mode) == MODE_INT)
10772 {
10773 /* ADD. */
10774 *cost += extra_cost->alu.arith;
10775 }
10776 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10777 {
10778 /* FADD. */
10779 *cost += extra_cost->fp[mode == DFmode].addsub;
10780 }
10781 }
10782 return true;
10783 }
10784
10785 case BSWAP:
10786 *cost = COSTS_N_INSNS (1);
10787
10788 if (speed)
10789 {
10790 if (VECTOR_MODE_P (mode))
10791 *cost += extra_cost->vect.alu;
10792 else
10793 *cost += extra_cost->alu.rev;
10794 }
10795 return false;
10796
10797 case IOR:
10798 if (aarch_rev16_p (x))
10799 {
10800 *cost = COSTS_N_INSNS (1);
10801
10802 if (speed)
10803 {
10804 if (VECTOR_MODE_P (mode))
10805 *cost += extra_cost->vect.alu;
10806 else
10807 *cost += extra_cost->alu.rev;
10808 }
10809 return true;
10810 }
10811
10812 if (aarch64_extr_rtx_p (x, &op0, &op1))
10813 {
10814 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10815 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10816 if (speed)
10817 *cost += extra_cost->alu.shift;
10818
10819 return true;
10820 }
10821 /* Fall through. */
10822 case XOR:
10823 case AND:
10824 cost_logic:
10825 op0 = XEXP (x, 0);
10826 op1 = XEXP (x, 1);
10827
10828 if (VECTOR_MODE_P (mode))
10829 {
10830 if (speed)
10831 *cost += extra_cost->vect.alu;
10832 return true;
10833 }
10834
10835 if (code == AND
10836 && GET_CODE (op0) == MULT
10837 && CONST_INT_P (XEXP (op0, 1))
10838 && CONST_INT_P (op1)
10839 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10840 INTVAL (op1)) != 0)
10841 {
10842 /* This is a UBFM/SBFM. */
10843 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10844 if (speed)
10845 *cost += extra_cost->alu.bfx;
10846 return true;
10847 }
10848
10849 if (is_int_mode (mode, &int_mode))
10850 {
10851 if (CONST_INT_P (op1))
10852 {
10853 /* We have a mask + shift version of a UBFIZ
10854 i.e. the *andim_ashift<mode>_bfiz pattern. */
10855 if (GET_CODE (op0) == ASHIFT
10856 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10857 XEXP (op0, 1)))
10858 {
10859 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10860 (enum rtx_code) code, 0, speed);
10861 if (speed)
10862 *cost += extra_cost->alu.bfx;
10863
10864 return true;
10865 }
10866 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10867 {
10868 	      /* We may get the immediate for free; this is not
10869 modelled. */
10870 *cost += rtx_cost (op0, int_mode,
10871 (enum rtx_code) code, 0, speed);
10872 if (speed)
10873 *cost += extra_cost->alu.logical;
10874
10875 return true;
10876 }
10877 }
10878 else
10879 {
10880 rtx new_op0 = op0;
10881
10882 /* Handle ORN, EON, or BIC. */
10883 if (GET_CODE (op0) == NOT)
10884 op0 = XEXP (op0, 0);
10885
10886 new_op0 = aarch64_strip_shift (op0);
10887
10888 /* If we had a shift on op0 then this is a logical-shift-
10889 by-register/immediate operation. Otherwise, this is just
10890 a logical operation. */
10891 if (speed)
10892 {
10893 if (new_op0 != op0)
10894 {
10895 /* Shift by immediate. */
10896 if (CONST_INT_P (XEXP (op0, 1)))
10897 *cost += extra_cost->alu.log_shift;
10898 else
10899 *cost += extra_cost->alu.log_shift_reg;
10900 }
10901 else
10902 *cost += extra_cost->alu.logical;
10903 }
10904
10905 /* In both cases we want to cost both operands. */
10906 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10907 0, speed);
10908 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10909 1, speed);
10910
10911 return true;
10912 }
10913 }
10914 return false;
10915
10916 case NOT:
10917 x = XEXP (x, 0);
10918 op0 = aarch64_strip_shift (x);
10919
10920 if (VECTOR_MODE_P (mode))
10921 {
10922 /* Vector NOT. */
10923 *cost += extra_cost->vect.alu;
10924 return false;
10925 }
10926
10927 /* MVN-shifted-reg. */
10928 if (op0 != x)
10929 {
10930 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10931
10932 if (speed)
10933 *cost += extra_cost->alu.log_shift;
10934
10935 return true;
10936 }
10937 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10938 Handle the second form here taking care that 'a' in the above can
10939 be a shift. */
10940 else if (GET_CODE (op0) == XOR)
10941 {
10942 rtx newop0 = XEXP (op0, 0);
10943 rtx newop1 = XEXP (op0, 1);
10944 rtx op0_stripped = aarch64_strip_shift (newop0);
10945
10946 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10947 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10948
10949 if (speed)
10950 {
10951 if (op0_stripped != newop0)
10952 *cost += extra_cost->alu.log_shift;
10953 else
10954 *cost += extra_cost->alu.logical;
10955 }
10956
10957 return true;
10958 }
10959 /* MVN. */
10960 if (speed)
10961 *cost += extra_cost->alu.logical;
10962
10963 return false;
10964
10965 case ZERO_EXTEND:
10966
10967 op0 = XEXP (x, 0);
10968 /* If a value is written in SI mode, then zero extended to DI
10969 mode, the operation will in general be free as a write to
10970 a 'w' register implicitly zeroes the upper bits of an 'x'
10971 register. However, if this is
10972
10973 (set (reg) (zero_extend (reg)))
10974
10975 we must cost the explicit register move. */
10976 if (mode == DImode
10977 && GET_MODE (op0) == SImode
10978 && outer == SET)
10979 {
10980 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10981
10982 /* If OP_COST is non-zero, then the cost of the zero extend
10983 is effectively the cost of the inner operation. Otherwise
10984 we have a MOV instruction and we take the cost from the MOV
10985 itself. This is true independently of whether we are
10986 optimizing for space or time. */
10987 if (op_cost)
10988 *cost = op_cost;
10989
10990 return true;
10991 }
10992 else if (MEM_P (op0))
10993 {
10994 /* All loads can zero extend to any size for free. */
10995 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10996 return true;
10997 }
10998
10999 op0 = aarch64_extend_bitfield_pattern_p (x);
11000 if (op0)
11001 {
11002 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11003 if (speed)
11004 *cost += extra_cost->alu.bfx;
11005 return true;
11006 }
11007
11008 if (speed)
11009 {
11010 if (VECTOR_MODE_P (mode))
11011 {
11012 /* UMOV. */
11013 *cost += extra_cost->vect.alu;
11014 }
11015 else
11016 {
11017 /* We generate an AND instead of UXTB/UXTH. */
11018 *cost += extra_cost->alu.logical;
11019 }
11020 }
11021 return false;
11022
11023 case SIGN_EXTEND:
11024 if (MEM_P (XEXP (x, 0)))
11025 {
11026 /* LDRSH. */
11027 if (speed)
11028 {
11029 rtx address = XEXP (XEXP (x, 0), 0);
11030 *cost += extra_cost->ldst.load_sign_extend;
11031
11032 *cost +=
11033 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11034 0, speed));
11035 }
11036 return true;
11037 }
11038
11039 op0 = aarch64_extend_bitfield_pattern_p (x);
11040 if (op0)
11041 {
11042 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11043 if (speed)
11044 *cost += extra_cost->alu.bfx;
11045 return true;
11046 }
11047
11048 if (speed)
11049 {
11050 if (VECTOR_MODE_P (mode))
11051 *cost += extra_cost->vect.alu;
11052 else
11053 *cost += extra_cost->alu.extend;
11054 }
11055 return false;
11056
11057 case ASHIFT:
11058 op0 = XEXP (x, 0);
11059 op1 = XEXP (x, 1);
11060
11061 if (CONST_INT_P (op1))
11062 {
11063 if (speed)
11064 {
11065 if (VECTOR_MODE_P (mode))
11066 {
11067 /* Vector shift (immediate). */
11068 *cost += extra_cost->vect.alu;
11069 }
11070 else
11071 {
11072 	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
11073 aliases. */
11074 *cost += extra_cost->alu.shift;
11075 }
11076 }
11077
11078 /* We can incorporate zero/sign extend for free. */
11079 if (GET_CODE (op0) == ZERO_EXTEND
11080 || GET_CODE (op0) == SIGN_EXTEND)
11081 op0 = XEXP (op0, 0);
11082
11083 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11084 return true;
11085 }
11086 else
11087 {
11088 if (VECTOR_MODE_P (mode))
11089 {
11090 if (speed)
11091 /* Vector shift (register). */
11092 *cost += extra_cost->vect.alu;
11093 }
11094 else
11095 {
11096 if (speed)
11097 /* LSLV. */
11098 *cost += extra_cost->alu.shift_reg;
11099
11100 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11101 && CONST_INT_P (XEXP (op1, 1))
11102 && known_eq (INTVAL (XEXP (op1, 1)),
11103 GET_MODE_BITSIZE (mode) - 1))
11104 {
11105 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11106 /* We already demanded XEXP (op1, 0) to be REG_P, so
11107 don't recurse into it. */
11108 return true;
11109 }
11110 }
11111 return false; /* All arguments need to be in registers. */
11112 }
11113
11114 case ROTATE:
11115 case ROTATERT:
11116 case LSHIFTRT:
11117 case ASHIFTRT:
11118 op0 = XEXP (x, 0);
11119 op1 = XEXP (x, 1);
11120
11121 if (CONST_INT_P (op1))
11122 {
11123 /* ASR (immediate) and friends. */
11124 if (speed)
11125 {
11126 if (VECTOR_MODE_P (mode))
11127 *cost += extra_cost->vect.alu;
11128 else
11129 *cost += extra_cost->alu.shift;
11130 }
11131
11132 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11133 return true;
11134 }
11135 else
11136 {
11137 if (VECTOR_MODE_P (mode))
11138 {
11139 if (speed)
11140 /* Vector shift (register). */
11141 *cost += extra_cost->vect.alu;
11142 }
11143 else
11144 {
11145 if (speed)
11146 /* ASR (register) and friends. */
11147 *cost += extra_cost->alu.shift_reg;
11148
11149 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11150 && CONST_INT_P (XEXP (op1, 1))
11151 && known_eq (INTVAL (XEXP (op1, 1)),
11152 GET_MODE_BITSIZE (mode) - 1))
11153 {
11154 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11155 /* We already demanded XEXP (op1, 0) to be REG_P, so
11156 don't recurse into it. */
11157 return true;
11158 }
11159 }
11160 return false; /* All arguments need to be in registers. */
11161 }
11162
11163 case SYMBOL_REF:
11164
11165 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11166 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11167 {
11168 /* LDR. */
11169 if (speed)
11170 *cost += extra_cost->ldst.load;
11171 }
11172 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11173 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11174 {
11175 /* ADRP, followed by ADD. */
11176 *cost += COSTS_N_INSNS (1);
11177 if (speed)
11178 *cost += 2 * extra_cost->alu.arith;
11179 }
11180 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11181 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11182 {
11183 /* ADR. */
11184 if (speed)
11185 *cost += extra_cost->alu.arith;
11186 }
11187
11188 if (flag_pic)
11189 {
11190 /* One extra load instruction, after accessing the GOT. */
11191 *cost += COSTS_N_INSNS (1);
11192 if (speed)
11193 *cost += extra_cost->ldst.load;
11194 }
11195 return true;
11196
11197 case HIGH:
11198 case LO_SUM:
11199 /* ADRP/ADD (immediate). */
11200 if (speed)
11201 *cost += extra_cost->alu.arith;
11202 return true;
11203
11204 case ZERO_EXTRACT:
11205 case SIGN_EXTRACT:
11206 /* UBFX/SBFX. */
11207 if (speed)
11208 {
11209 if (VECTOR_MODE_P (mode))
11210 *cost += extra_cost->vect.alu;
11211 else
11212 *cost += extra_cost->alu.bfx;
11213 }
11214
11215 /* We can trust that the immediates used will be correct (there
11216 are no by-register forms), so we need only cost op0. */
11217 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11218 return true;
11219
11220 case MULT:
11221 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11222 /* aarch64_rtx_mult_cost always handles recursion to its
11223 operands. */
11224 return true;
11225
11226 case MOD:
11227 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11228 	 ANDs and a CSNEG.  Assume here that a CSNEG costs the same as an
11229 	 unconditional negate.  This case should only ever be reached through
11230 the set_smod_pow2_cheap check in expmed.c. */
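      /* For illustration, and assuming the usual expansion, a signed x % 4
	 in SImode becomes a sequence along the lines of
	     negs  w1, w0
	     and   w0, w0, 3
	     and   w1, w1, 3
	     csneg w0, w0, w1, mi
	 i.e. two logical and two arithmetic operations, matching the cost
	 added below.  */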
11231 if (CONST_INT_P (XEXP (x, 1))
11232 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11233 && (mode == SImode || mode == DImode))
11234 {
11235 /* We expand to 4 instructions. Reset the baseline. */
11236 *cost = COSTS_N_INSNS (4);
11237
11238 if (speed)
11239 *cost += 2 * extra_cost->alu.logical
11240 + 2 * extra_cost->alu.arith;
11241
11242 return true;
11243 }
11244
11245 /* Fall-through. */
11246 case UMOD:
11247 if (speed)
11248 {
11249 	/* Slightly prefer UMOD over SMOD.  */
11250 if (VECTOR_MODE_P (mode))
11251 *cost += extra_cost->vect.alu;
11252 else if (GET_MODE_CLASS (mode) == MODE_INT)
11253 *cost += (extra_cost->mult[mode == DImode].add
11254 + extra_cost->mult[mode == DImode].idiv
11255 + (code == MOD ? 1 : 0));
11256 }
11257 return false; /* All arguments need to be in registers. */
11258
11259 case DIV:
11260 case UDIV:
11261 case SQRT:
11262 if (speed)
11263 {
11264 if (VECTOR_MODE_P (mode))
11265 *cost += extra_cost->vect.alu;
11266 else if (GET_MODE_CLASS (mode) == MODE_INT)
11267 /* There is no integer SQRT, so only DIV and UDIV can get
11268 here. */
11269 *cost += (extra_cost->mult[mode == DImode].idiv
11270 		    /* Slightly prefer UDIV over SDIV.  */
11271 + (code == DIV ? 1 : 0));
11272 else
11273 *cost += extra_cost->fp[mode == DFmode].div;
11274 }
11275 return false; /* All arguments need to be in registers. */
11276
11277 case IF_THEN_ELSE:
11278 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11279 XEXP (x, 2), cost, speed);
11280
11281 case EQ:
11282 case NE:
11283 case GT:
11284 case GTU:
11285 case LT:
11286 case LTU:
11287 case GE:
11288 case GEU:
11289 case LE:
11290 case LEU:
11291
11292 return false; /* All arguments must be in registers. */
11293
11294 case FMA:
11295 op0 = XEXP (x, 0);
11296 op1 = XEXP (x, 1);
11297 op2 = XEXP (x, 2);
11298
11299 if (speed)
11300 {
11301 if (VECTOR_MODE_P (mode))
11302 *cost += extra_cost->vect.alu;
11303 else
11304 *cost += extra_cost->fp[mode == DFmode].fma;
11305 }
11306
11307 /* FMSUB, FNMADD, and FNMSUB are free. */
11308 if (GET_CODE (op0) == NEG)
11309 op0 = XEXP (op0, 0);
11310
11311 if (GET_CODE (op2) == NEG)
11312 op2 = XEXP (op2, 0);
11313
11314 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11315 and the by-element operand as operand 0. */
11316 if (GET_CODE (op1) == NEG)
11317 op1 = XEXP (op1, 0);
11318
11319 /* Catch vector-by-element operations. The by-element operand can
11320 either be (vec_duplicate (vec_select (x))) or just
11321 (vec_select (x)), depending on whether we are multiplying by
11322 a vector or a scalar.
11323
11324 Canonicalization is not very good in these cases, FMA4 will put the
11325 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11326 if (GET_CODE (op0) == VEC_DUPLICATE)
11327 op0 = XEXP (op0, 0);
11328 else if (GET_CODE (op1) == VEC_DUPLICATE)
11329 op1 = XEXP (op1, 0);
11330
11331 if (GET_CODE (op0) == VEC_SELECT)
11332 op0 = XEXP (op0, 0);
11333 else if (GET_CODE (op1) == VEC_SELECT)
11334 op1 = XEXP (op1, 0);
11335
11336 /* If the remaining parameters are not registers,
11337 get the cost to put them into registers. */
11338 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11339 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11340 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11341 return true;
11342
11343 case FLOAT:
11344 case UNSIGNED_FLOAT:
11345 if (speed)
11346 *cost += extra_cost->fp[mode == DFmode].fromint;
11347 return false;
11348
11349 case FLOAT_EXTEND:
11350 if (speed)
11351 {
11352 if (VECTOR_MODE_P (mode))
11353 {
11354 	    /* Vector widen.  */
11355 *cost += extra_cost->vect.alu;
11356 }
11357 else
11358 *cost += extra_cost->fp[mode == DFmode].widen;
11359 }
11360 return false;
11361
11362 case FLOAT_TRUNCATE:
11363 if (speed)
11364 {
11365 if (VECTOR_MODE_P (mode))
11366 {
11367 	    /* Vector narrow.  */
11368 *cost += extra_cost->vect.alu;
11369 }
11370 else
11371 *cost += extra_cost->fp[mode == DFmode].narrow;
11372 }
11373 return false;
11374
11375 case FIX:
11376 case UNSIGNED_FIX:
11377 x = XEXP (x, 0);
11378 /* Strip the rounding part. They will all be implemented
11379 by the fcvt* family of instructions anyway. */
11380 if (GET_CODE (x) == UNSPEC)
11381 {
11382 unsigned int uns_code = XINT (x, 1);
11383
11384 if (uns_code == UNSPEC_FRINTA
11385 || uns_code == UNSPEC_FRINTM
11386 || uns_code == UNSPEC_FRINTN
11387 || uns_code == UNSPEC_FRINTP
11388 || uns_code == UNSPEC_FRINTZ)
11389 x = XVECEXP (x, 0, 0);
11390 }
11391
11392 if (speed)
11393 {
11394 if (VECTOR_MODE_P (mode))
11395 *cost += extra_cost->vect.alu;
11396 else
11397 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11398 }
11399
11400 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11401 fixed-point fcvt. */
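      /* For illustration, and assuming the mode supports the fixed-point
	 form of FCVTZS: (fix (mult x 16.0)) can become a single
	     fcvtzs  w0, s0, #4
	 i.e. a convert with four fractional bits.  */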
11402 if (GET_CODE (x) == MULT
11403 && ((VECTOR_MODE_P (mode)
11404 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11405 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11406 {
11407 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11408 0, speed);
11409 return true;
11410 }
11411
11412 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11413 return true;
11414
11415 case ABS:
11416 if (VECTOR_MODE_P (mode))
11417 {
11418 /* ABS (vector). */
11419 if (speed)
11420 *cost += extra_cost->vect.alu;
11421 }
11422 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11423 {
11424 op0 = XEXP (x, 0);
11425
11426 /* FABD, which is analogous to FADD. */
11427 if (GET_CODE (op0) == MINUS)
11428 {
11429 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11430 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11431 if (speed)
11432 *cost += extra_cost->fp[mode == DFmode].addsub;
11433
11434 return true;
11435 }
11436 /* Simple FABS is analogous to FNEG. */
11437 if (speed)
11438 *cost += extra_cost->fp[mode == DFmode].neg;
11439 }
11440 else
11441 {
11442 	  /* Integer ABS will either be split into
11443 two arithmetic instructions, or will be an ABS
11444 (scalar), which we don't model. */
11445 *cost = COSTS_N_INSNS (2);
11446 if (speed)
11447 *cost += 2 * extra_cost->alu.arith;
11448 }
11449 return false;
11450
11451 case SMAX:
11452 case SMIN:
11453 if (speed)
11454 {
11455 if (VECTOR_MODE_P (mode))
11456 *cost += extra_cost->vect.alu;
11457 else
11458 {
11459 /* FMAXNM/FMINNM/FMAX/FMIN.
11460 TODO: This may not be accurate for all implementations, but
11461 we do not model this in the cost tables. */
11462 *cost += extra_cost->fp[mode == DFmode].addsub;
11463 }
11464 }
11465 return false;
11466
11467 case UNSPEC:
11468 /* The floating point round to integer frint* instructions. */
11469 if (aarch64_frint_unspec_p (XINT (x, 1)))
11470 {
11471 if (speed)
11472 *cost += extra_cost->fp[mode == DFmode].roundint;
11473
11474 return false;
11475 }
11476
11477 if (XINT (x, 1) == UNSPEC_RBIT)
11478 {
11479 if (speed)
11480 *cost += extra_cost->alu.rev;
11481
11482 return false;
11483 }
11484 break;
11485
11486 case TRUNCATE:
11487
11488 /* Decompose <su>muldi3_highpart. */
11489 if (/* (truncate:DI */
11490 mode == DImode
11491 /* (lshiftrt:TI */
11492 && GET_MODE (XEXP (x, 0)) == TImode
11493 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11494 /* (mult:TI */
11495 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11496 /* (ANY_EXTEND:TI (reg:DI))
11497 (ANY_EXTEND:TI (reg:DI))) */
11498 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11499 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11500 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11501 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11502 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11503 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11504 /* (const_int 64) */
11505 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11506 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11507 {
11508 /* UMULH/SMULH. */
11509 if (speed)
11510 *cost += extra_cost->mult[mode == DImode].extend;
11511 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11512 mode, MULT, 0, speed);
11513 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11514 mode, MULT, 1, speed);
11515 return true;
11516 }
11517
11518 /* Fall through. */
11519 default:
11520 break;
11521 }
11522
11523 if (dump_file
11524 && flag_aarch64_verbose_cost)
11525 fprintf (dump_file,
11526 "\nFailed to cost RTX. Assuming default cost.\n");
11527
11528 return true;
11529 }
11530
11531 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11532 calculated for X. This cost is stored in *COST. Returns true
11533 if the total cost of X was calculated. */
11534 static bool
11535 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11536 int param, int *cost, bool speed)
11537 {
11538 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11539
11540 if (dump_file
11541 && flag_aarch64_verbose_cost)
11542 {
11543 print_rtl_single (dump_file, x);
11544 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11545 speed ? "Hot" : "Cold",
11546 *cost, result ? "final" : "partial");
11547 }
11548
11549 return result;
11550 }
11551
11552 static int
11553 aarch64_register_move_cost (machine_mode mode,
11554 reg_class_t from_i, reg_class_t to_i)
11555 {
11556 enum reg_class from = (enum reg_class) from_i;
11557 enum reg_class to = (enum reg_class) to_i;
11558 const struct cpu_regmove_cost *regmove_cost
11559 = aarch64_tune_params.regmove_cost;
11560
11561 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11562 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11563 to = GENERAL_REGS;
11564
11565 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11566 from = GENERAL_REGS;
11567
11568 /* Moving between GPR and stack cost is the same as GP2GP. */
11569 if ((from == GENERAL_REGS && to == STACK_REG)
11570 || (to == GENERAL_REGS && from == STACK_REG))
11571 return regmove_cost->GP2GP;
11572
11573 /* To/From the stack register, we move via the gprs. */
11574 if (to == STACK_REG || from == STACK_REG)
11575 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11576 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11577
11578 if (known_eq (GET_MODE_SIZE (mode), 16))
11579 {
11580 /* 128-bit operations on general registers require 2 instructions. */
11581 if (from == GENERAL_REGS && to == GENERAL_REGS)
11582 return regmove_cost->GP2GP * 2;
11583 else if (from == GENERAL_REGS)
11584 return regmove_cost->GP2FP * 2;
11585 else if (to == GENERAL_REGS)
11586 return regmove_cost->FP2GP * 2;
11587
11588 /* When AdvSIMD instructions are disabled it is not possible to move
11589 a 128-bit value directly between Q registers. This is handled in
11590 secondary reload. A general register is used as a scratch to move
11591 the upper DI value and the lower DI value is moved directly,
11592 hence the cost is the sum of three moves. */
11593 if (! TARGET_SIMD)
11594 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11595
11596 return regmove_cost->FP2FP;
11597 }
11598
11599 if (from == GENERAL_REGS && to == GENERAL_REGS)
11600 return regmove_cost->GP2GP;
11601 else if (from == GENERAL_REGS)
11602 return regmove_cost->GP2FP;
11603 else if (to == GENERAL_REGS)
11604 return regmove_cost->FP2GP;
11605
11606 return regmove_cost->FP2FP;
11607 }
11608
11609 static int
11610 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11611 reg_class_t rclass ATTRIBUTE_UNUSED,
11612 bool in ATTRIBUTE_UNUSED)
11613 {
11614 return aarch64_tune_params.memmov_cost;
11615 }
11616
11617 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11618 to optimize 1.0/sqrt. */
11619
11620 static bool
11621 use_rsqrt_p (machine_mode mode)
11622 {
11623 return (!flag_trapping_math
11624 && flag_unsafe_math_optimizations
11625 && ((aarch64_tune_params.approx_modes->recip_sqrt
11626 & AARCH64_APPROX_MODE (mode))
11627 || flag_mrecip_low_precision_sqrt));
11628 }
11629
11630 /* Function to decide when to use the approximate reciprocal square root
11631 builtin. */
11632
11633 static tree
11634 aarch64_builtin_reciprocal (tree fndecl)
11635 {
11636 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11637
11638 if (!use_rsqrt_p (mode))
11639 return NULL_TREE;
11640 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11641 }
11642
11643 /* Emit instruction sequence to compute either the approximate square root
11644 or its approximate reciprocal, depending on the flag RECP, and return
11645 whether the sequence was emitted or not. */
11646
11647 bool
11648 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11649 {
11650 machine_mode mode = GET_MODE (dst);
11651
11652 if (GET_MODE_INNER (mode) == HFmode)
11653 {
11654 gcc_assert (!recp);
11655 return false;
11656 }
11657
11658 if (!recp)
11659 {
11660 if (!(flag_mlow_precision_sqrt
11661 || (aarch64_tune_params.approx_modes->sqrt
11662 & AARCH64_APPROX_MODE (mode))))
11663 return false;
11664
11665 if (flag_finite_math_only
11666 || flag_trapping_math
11667 || !flag_unsafe_math_optimizations
11668 || optimize_function_for_size_p (cfun))
11669 return false;
11670 }
11671 else
11672 /* Caller assumes we cannot fail. */
11673 gcc_assert (use_rsqrt_p (mode));
11674
11675 machine_mode mmsk = mode_for_int_vector (mode).require ();
11676 rtx xmsk = gen_reg_rtx (mmsk);
11677 if (!recp)
11678 /* When calculating the approximate square root, compare the
11679 argument with 0.0 and create a mask. */
11680 emit_insn (gen_rtx_SET (xmsk,
11681 gen_rtx_NEG (mmsk,
11682 gen_rtx_EQ (mmsk, src,
11683 CONST0_RTX (mode)))));
11684
11685 /* Estimate the approximate reciprocal square root. */
11686 rtx xdst = gen_reg_rtx (mode);
11687 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11688
11689 /* Iterate over the series twice for SF and thrice for DF. */
11690 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11691
11692   /* Optionally run one fewer iteration of the series for faster performance
11693      at the cost of some accuracy.  */
11694 if ((recp && flag_mrecip_low_precision_sqrt)
11695 || (!recp && flag_mlow_precision_sqrt))
11696 iterations--;
11697
11698 /* Iterate over the series to calculate the approximate reciprocal square
11699 root. */
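  /* Illustratively, each pass below is the Newton-Raphson step for
     1/sqrt(d), implemented with FRSQRTS (which computes (3 - a*b) / 2):
	 x2 = x * x
	 x1 = (3 - d * x2) / 2
	 x  = x * x1
     with the final multiply by x1 deferred to the finalizing step.  */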
11700 rtx x1 = gen_reg_rtx (mode);
11701 while (iterations--)
11702 {
11703 rtx x2 = gen_reg_rtx (mode);
11704 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11705
11706 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11707
11708 if (iterations > 0)
11709 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11710 }
11711
11712 if (!recp)
11713 {
11714 /* Qualify the approximate reciprocal square root when the argument is
11715 	 0.0 by squashing the intermediate result to 0.0.  */
11716 rtx xtmp = gen_reg_rtx (mmsk);
11717 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11718 gen_rtx_SUBREG (mmsk, xdst, 0)));
11719 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11720
11721 /* Calculate the approximate square root. */
11722 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11723 }
11724
11725 /* Finalize the approximation. */
11726 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11727
11728 return true;
11729 }
11730
11731 /* Emit the instruction sequence to compute the approximation for the division
11732 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11733
11734 bool
11735 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11736 {
11737 machine_mode mode = GET_MODE (quo);
11738
11739 if (GET_MODE_INNER (mode) == HFmode)
11740 return false;
11741
11742 bool use_approx_division_p = (flag_mlow_precision_div
11743 || (aarch64_tune_params.approx_modes->division
11744 & AARCH64_APPROX_MODE (mode)));
11745
11746 if (!flag_finite_math_only
11747 || flag_trapping_math
11748 || !flag_unsafe_math_optimizations
11749 || optimize_function_for_size_p (cfun)
11750 || !use_approx_division_p)
11751 return false;
11752
11753 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11754 return false;
11755
11756 /* Estimate the approximate reciprocal. */
11757 rtx xrcp = gen_reg_rtx (mode);
11758 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11759
11760 /* Iterate over the series twice for SF and thrice for DF. */
11761 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11762
11763   /* Optionally run one fewer iteration of the series for faster performance,
11764      at the cost of some accuracy.  */
11765 if (flag_mlow_precision_div)
11766 iterations--;
11767
11768 /* Iterate over the series to calculate the approximate reciprocal. */
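  /* Illustratively, each pass below is the Newton-Raphson step for 1/d,
     implemented with FRECPS (which computes 2 - a*b):
	 t = 2 - d * x
	 x = x * t
     with the final multiply by t deferred to the finalizing step.  */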
11769 rtx xtmp = gen_reg_rtx (mode);
11770 while (iterations--)
11771 {
11772 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11773
11774 if (iterations > 0)
11775 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11776 }
11777
11778 if (num != CONST1_RTX (mode))
11779 {
11780 /* As the approximate reciprocal of DEN is already calculated, only
11781 calculate the approximate division when NUM is not 1.0. */
11782 rtx xnum = force_reg (mode, num);
11783 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11784 }
11785
11786 /* Finalize the approximation. */
11787 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11788 return true;
11789 }
11790
11791 /* Return the number of instructions that can be issued per cycle. */
11792 static int
11793 aarch64_sched_issue_rate (void)
11794 {
11795 return aarch64_tune_params.issue_rate;
11796 }
11797
11798 static int
11799 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11800 {
11801 int issue_rate = aarch64_sched_issue_rate ();
11802
11803 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11804 }
11805
11806
11807 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11808 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11809 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11810
11811 static int
11812 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11813 int ready_index)
11814 {
11815 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11816 }
11817
11818
11819 /* Vectorizer cost model target hooks. */
11820
11821 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11822 static int
11823 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11824 tree vectype,
11825 int misalign ATTRIBUTE_UNUSED)
11826 {
11827 unsigned elements;
11828 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11829 bool fp = false;
11830
11831 if (vectype != NULL)
11832 fp = FLOAT_TYPE_P (vectype);
11833
11834 switch (type_of_cost)
11835 {
11836 case scalar_stmt:
11837 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11838
11839 case scalar_load:
11840 return costs->scalar_load_cost;
11841
11842 case scalar_store:
11843 return costs->scalar_store_cost;
11844
11845 case vector_stmt:
11846 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11847
11848 case vector_load:
11849 return costs->vec_align_load_cost;
11850
11851 case vector_store:
11852 return costs->vec_store_cost;
11853
11854 case vec_to_scalar:
11855 return costs->vec_to_scalar_cost;
11856
11857 case scalar_to_vec:
11858 return costs->scalar_to_vec_cost;
11859
11860 case unaligned_load:
11861 case vector_gather_load:
11862 return costs->vec_unalign_load_cost;
11863
11864 case unaligned_store:
11865 case vector_scatter_store:
11866 return costs->vec_unalign_store_cost;
11867
11868 case cond_branch_taken:
11869 return costs->cond_taken_branch_cost;
11870
11871 case cond_branch_not_taken:
11872 return costs->cond_not_taken_branch_cost;
11873
11874 case vec_perm:
11875 return costs->vec_permute_cost;
11876
11877 case vec_promote_demote:
11878 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11879
11880 case vec_construct:
11881 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
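	/* As a rough heuristic: for a (hypothetical) 4-element vector this
	   returns 4 / 2 + 1 = 3, i.e. about one statement per pair of
	   elements plus one.  */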
11882 return elements / 2 + 1;
11883
11884 default:
11885 gcc_unreachable ();
11886 }
11887 }
11888
11889 /* Implement targetm.vectorize.add_stmt_cost. */
11890 static unsigned
11891 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11892 struct _stmt_vec_info *stmt_info, int misalign,
11893 enum vect_cost_model_location where)
11894 {
11895 unsigned *cost = (unsigned *) data;
11896 unsigned retval = 0;
11897
11898 if (flag_vect_cost_model)
11899 {
11900 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11901 int stmt_cost =
11902 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11903
11904 /* Statements in an inner loop relative to the loop being
11905 vectorized are weighted more heavily. The value here is
11906 arbitrary and could potentially be improved with analysis. */
11907 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11908 count *= 50; /* FIXME */
11909
11910 retval = (unsigned) (count * stmt_cost);
11911 cost[where] += retval;
11912 }
11913
11914 return retval;
11915 }
11916
11917 static void initialize_aarch64_code_model (struct gcc_options *);
11918
11919 /* Parse the TO_PARSE string and put the architecture struct that it
11920 selects into RES and the architectural features into ISA_FLAGS.
11921 Return an aarch64_parse_opt_result describing the parse result.
11922 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11923 When the TO_PARSE string contains an invalid extension,
11924 a copy of the string is created and stored to INVALID_EXTENSION. */
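/* For illustration, TO_PARSE is an architecture name optionally followed by
   "+extension" suffixes, e.g. "armv8.2-a" or "armv8.2-a+sve".  */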
11925
11926 static enum aarch64_parse_opt_result
11927 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11928 uint64_t *isa_flags, std::string *invalid_extension)
11929 {
11930 const char *ext;
11931 const struct processor *arch;
11932 size_t len;
11933
11934 ext = strchr (to_parse, '+');
11935
11936 if (ext != NULL)
11937 len = ext - to_parse;
11938 else
11939 len = strlen (to_parse);
11940
11941 if (len == 0)
11942 return AARCH64_PARSE_MISSING_ARG;
11943
11944
11945 /* Loop through the list of supported ARCHes to find a match. */
11946 for (arch = all_architectures; arch->name != NULL; arch++)
11947 {
11948 if (strlen (arch->name) == len
11949 && strncmp (arch->name, to_parse, len) == 0)
11950 {
11951 uint64_t isa_temp = arch->flags;
11952
11953 if (ext != NULL)
11954 {
11955 /* TO_PARSE string contains at least one extension. */
11956 enum aarch64_parse_opt_result ext_res
11957 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11958
11959 if (ext_res != AARCH64_PARSE_OK)
11960 return ext_res;
11961 }
11962 /* Extension parsing was successful. Confirm the result
11963 arch and ISA flags. */
11964 *res = arch;
11965 *isa_flags = isa_temp;
11966 return AARCH64_PARSE_OK;
11967 }
11968 }
11969
11970 /* ARCH name not found in list. */
11971 return AARCH64_PARSE_INVALID_ARG;
11972 }
11973
11974 /* Parse the TO_PARSE string and put the result tuning in RES and the
11975 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11976 describing the parse result. If there is an error parsing, RES and
11977 ISA_FLAGS are left unchanged.
11978 When the TO_PARSE string contains an invalid extension,
11979 a copy of the string is created and stored to INVALID_EXTENSION. */
11980
11981 static enum aarch64_parse_opt_result
11982 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11983 uint64_t *isa_flags, std::string *invalid_extension)
11984 {
11985 const char *ext;
11986 const struct processor *cpu;
11987 size_t len;
11988
11989 ext = strchr (to_parse, '+');
11990
11991 if (ext != NULL)
11992 len = ext - to_parse;
11993 else
11994 len = strlen (to_parse);
11995
11996 if (len == 0)
11997 return AARCH64_PARSE_MISSING_ARG;
11998
11999
12000 /* Loop through the list of supported CPUs to find a match. */
12001 for (cpu = all_cores; cpu->name != NULL; cpu++)
12002 {
12003 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12004 {
12005 uint64_t isa_temp = cpu->flags;
12006
12007
12008 if (ext != NULL)
12009 {
12010 /* TO_PARSE string contains at least one extension. */
12011 enum aarch64_parse_opt_result ext_res
12012 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12013
12014 if (ext_res != AARCH64_PARSE_OK)
12015 return ext_res;
12016 }
12017 	  /* Extension parsing was successful.  Confirm the result
12018 cpu and ISA flags. */
12019 *res = cpu;
12020 *isa_flags = isa_temp;
12021 return AARCH64_PARSE_OK;
12022 }
12023 }
12024
12025 /* CPU name not found in list. */
12026 return AARCH64_PARSE_INVALID_ARG;
12027 }
12028
12029 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12030 Return an aarch64_parse_opt_result describing the parse result.
12031    If parsing fails, RES is left unchanged.  */
12032
12033 static enum aarch64_parse_opt_result
12034 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12035 {
12036 const struct processor *cpu;
12037
12038 /* Loop through the list of supported CPUs to find a match. */
12039 for (cpu = all_cores; cpu->name != NULL; cpu++)
12040 {
12041 if (strcmp (cpu->name, to_parse) == 0)
12042 {
12043 *res = cpu;
12044 return AARCH64_PARSE_OK;
12045 }
12046 }
12047
12048 /* CPU name not found in list. */
12049 return AARCH64_PARSE_INVALID_ARG;
12050 }
12051
12052 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12053 described in FLAG. If it is, return the index bit for that fusion type.
12054 If not, error (printing OPTION_NAME) and return zero. */
12055
12056 static unsigned int
12057 aarch64_parse_one_option_token (const char *token,
12058 size_t length,
12059 const struct aarch64_flag_desc *flag,
12060 const char *option_name)
12061 {
12062 for (; flag->name != NULL; flag++)
12063 {
12064 if (length == strlen (flag->name)
12065 && !strncmp (flag->name, token, length))
12066 return flag->flag;
12067 }
12068
12069 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12070 return 0;
12071 }
12072
12073 /* Parse OPTION, which is a dot-separated list of flags to enable.
12074 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12075 default state we inherit from the CPU tuning structures. OPTION_NAME
12076 gives the top-level option we are parsing in the -moverride string,
12077 for use in error messages. */
12078
12079 static unsigned int
12080 aarch64_parse_boolean_options (const char *option,
12081 const struct aarch64_flag_desc *flags,
12082 unsigned int initial_state,
12083 const char *option_name)
12084 {
12085 const char separator = '.';
12086 const char* specs = option;
12087 const char* ntoken = option;
12088 unsigned int found_flags = initial_state;
12089
12090 while ((ntoken = strchr (specs, separator)))
12091 {
12092 size_t token_length = ntoken - specs;
12093 unsigned token_ops = aarch64_parse_one_option_token (specs,
12094 token_length,
12095 flags,
12096 option_name);
12097 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12098 in the token stream, reset the supported operations. So:
12099
12100 adrp+add.cmp+branch.none.adrp+add
12101
12102 would have the result of turning on only adrp+add fusion. */
12103 if (!token_ops)
12104 found_flags = 0;
12105
12106 found_flags |= token_ops;
12107 specs = ++ntoken;
12108 }
12109
12110   /* The string ended with a trailing separator; diagnose it.  */
12111 if (!(*specs))
12112 {
12113 error ("%s string ill-formed\n", option_name);
12114 return 0;
12115 }
12116
12117 /* We still have one more token to parse. */
12118 size_t token_length = strlen (specs);
12119 unsigned token_ops = aarch64_parse_one_option_token (specs,
12120 token_length,
12121 flags,
12122 option_name);
12123 if (!token_ops)
12124 found_flags = 0;
12125
12126 found_flags |= token_ops;
12127 return found_flags;
12128 }
12129
12130 /* Support for overriding instruction fusion. */
12131
12132 static void
12133 aarch64_parse_fuse_string (const char *fuse_string,
12134 struct tune_params *tune)
12135 {
12136 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12137 aarch64_fusible_pairs,
12138 tune->fusible_ops,
12139 "fuse=");
12140 }
12141
12142 /* Support for overriding other tuning flags. */
12143
12144 static void
12145 aarch64_parse_tune_string (const char *tune_string,
12146 struct tune_params *tune)
12147 {
12148 tune->extra_tuning_flags
12149 = aarch64_parse_boolean_options (tune_string,
12150 aarch64_tuning_flags,
12151 tune->extra_tuning_flags,
12152 "tune=");
12153 }
12154
12155 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12156 Accept the valid SVE vector widths allowed by
12157 aarch64_sve_vector_bits_enum and use it to override sve_width
12158 in TUNE. */
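/* For example (illustrative), "-moverride=sve_width=256" sets sve_width to
   SVE_256, i.e. tunes for 256-bit SVE vectors.  */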
12159
12160 static void
12161 aarch64_parse_sve_width_string (const char *tune_string,
12162 struct tune_params *tune)
12163 {
12164 int width = -1;
12165
12166 int n = sscanf (tune_string, "%d", &width);
12167 if (n == EOF)
12168 {
12169 error ("invalid format for sve_width");
12170 return;
12171 }
12172 switch (width)
12173 {
12174 case SVE_128:
12175 case SVE_256:
12176 case SVE_512:
12177 case SVE_1024:
12178 case SVE_2048:
12179 break;
12180 default:
12181 error ("invalid sve_width value: %d", width);
12182 }
12183 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12184 }
12185
12186 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12187    we understand.  If it is, extract the option string and hand off to
12188 the appropriate function. */
12189
12190 void
12191 aarch64_parse_one_override_token (const char* token,
12192 size_t length,
12193 struct tune_params *tune)
12194 {
12195 const struct aarch64_tuning_override_function *fn
12196 = aarch64_tuning_override_functions;
12197
12198 const char *option_part = strchr (token, '=');
12199 if (!option_part)
12200 {
12201 error ("tuning string missing in option (%s)", token);
12202 return;
12203 }
12204
12205 /* Get the length of the option name. */
12206 length = option_part - token;
12207 /* Skip the '=' to get to the option string. */
12208 option_part++;
12209
12210 for (; fn->name != NULL; fn++)
12211 {
12212 if (!strncmp (fn->name, token, length))
12213 {
12214 fn->parse_override (option_part, tune);
12215 return;
12216 }
12217 }
12218
12219   error ("unknown tuning option (%s)", token);
12220 return;
12221 }
12222
12223 /* Set the default TLS size and clamp it to what the code model allows.  */
12224
12225 static void
12226 initialize_aarch64_tls_size (struct gcc_options *opts)
12227 {
12228 if (aarch64_tls_size == 0)
12229 aarch64_tls_size = 24;
12230
12231 switch (opts->x_aarch64_cmodel_var)
12232 {
12233 case AARCH64_CMODEL_TINY:
12234       /* Both the default and maximum TLS size allowed under tiny are 1M, which
12235 	 needs two instructions to address, so we clamp the size to 24 bits.  */
12236 if (aarch64_tls_size > 24)
12237 aarch64_tls_size = 24;
12238 break;
12239 case AARCH64_CMODEL_SMALL:
12240 /* The maximum TLS size allowed under small is 4G. */
12241 if (aarch64_tls_size > 32)
12242 aarch64_tls_size = 32;
12243 break;
12244 case AARCH64_CMODEL_LARGE:
12245 /* The maximum TLS size allowed under large is 16E.
12246 	 FIXME: 16E should be 64-bit; we only support a 48-bit offset now.  */
12247 if (aarch64_tls_size > 48)
12248 aarch64_tls_size = 48;
12249 break;
12250 default:
12251 gcc_unreachable ();
12252 }
12253
12254 return;
12255 }
12256
12257 /* Parse STRING looking for options in the format:
12258 string :: option:string
12259 option :: name=substring
12260 name :: {a-z}
12261 substring :: defined by option. */
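/* An illustrative -moverride string combining the handlers above would be
   "-moverride=fuse=adrp+add.cmp+branch:sve_width=256", which selects two
   fusion pairs and a 256-bit SVE tuning width.  */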
12262
12263 static void
12264 aarch64_parse_override_string (const char* input_string,
12265 struct tune_params* tune)
12266 {
12267 const char separator = ':';
12268 size_t string_length = strlen (input_string) + 1;
12269 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12270 char *string = string_root;
12271 strncpy (string, input_string, string_length);
12272 string[string_length - 1] = '\0';
12273
12274 char* ntoken = string;
12275
12276 while ((ntoken = strchr (string, separator)))
12277 {
12278 size_t token_length = ntoken - string;
12279 /* Make this substring look like a string. */
12280 *ntoken = '\0';
12281 aarch64_parse_one_override_token (string, token_length, tune);
12282 string = ++ntoken;
12283 }
12284
12285 /* One last option to parse. */
12286 aarch64_parse_one_override_token (string, strlen (string), tune);
12287 free (string_root);
12288 }
12289
12290
12291 static void
12292 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12293 {
12294 if (accepted_branch_protection_string)
12295 {
12296 opts->x_aarch64_branch_protection_string
12297 = xstrdup (accepted_branch_protection_string);
12298 }
12299
12300 /* PR 70044: We have to be careful about being called multiple times for the
12301 same function. This means all changes should be repeatable. */
12302
12303 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12304 Disable the frame pointer flag so the mid-end will not use a frame
12305 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12306 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12307 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12308 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12309 if (opts->x_flag_omit_frame_pointer == 0)
12310 opts->x_flag_omit_frame_pointer = 2;
12311
12312 /* If not optimizing for size, set the default
12313 alignment to what the target wants. */
12314 if (!opts->x_optimize_size)
12315 {
12316 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12317 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12318 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12319 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12320 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12321 opts->x_str_align_functions = aarch64_tune_params.function_align;
12322 }
12323
12324 /* We default to no pc-relative literal loads. */
12325
12326 aarch64_pcrelative_literal_loads = false;
12327
12328 /* If -mpc-relative-literal-loads is set on the command line, this
12329 implies that the user asked for PC relative literal loads. */
12330 if (opts->x_pcrelative_literal_loads == 1)
12331 aarch64_pcrelative_literal_loads = true;
12332
12333 /* In the tiny memory model it makes no sense to disallow PC relative
12334 literal pool loads. */
12335 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12336 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12337 aarch64_pcrelative_literal_loads = true;
12338
12339 /* When enabling the lower precision Newton series for the square root, also
12340 enable it for the reciprocal square root, since the latter is an
12341 intermediary step for the former. */
12342 if (flag_mlow_precision_sqrt)
12343 flag_mrecip_low_precision_sqrt = true;
12344 }
12345
12346 /* 'Unpack' the internal tuning structs and update the options
12347 in OPTS. The caller must have set up selected_tune and selected_arch
12348 as all the other target-specific codegen decisions are
12349 derived from them. */
12350
12351 void
12352 aarch64_override_options_internal (struct gcc_options *opts)
12353 {
12354 aarch64_tune_flags = selected_tune->flags;
12355 aarch64_tune = selected_tune->sched_core;
12356 /* Make a copy of the tuning parameters attached to the core, which
12357 we may later overwrite. */
12358 aarch64_tune_params = *(selected_tune->tune);
12359 aarch64_architecture_version = selected_arch->architecture_version;
12360
12361 if (opts->x_aarch64_override_tune_string)
12362 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12363 &aarch64_tune_params);
12364
12365 /* This target defaults to strict volatile bitfields. */
12366 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12367 opts->x_flag_strict_volatile_bitfields = 1;
12368
12369 if (aarch64_stack_protector_guard == SSP_GLOBAL
12370 && opts->x_aarch64_stack_protector_guard_offset_str)
12371 {
12372 error ("incompatible options %<-mstack-protector-guard=global%> and "
12373 "%<-mstack-protector-guard-offset=%s%>",
12374 aarch64_stack_protector_guard_offset_str);
12375 }
12376
12377 if (aarch64_stack_protector_guard == SSP_SYSREG
12378 && !(opts->x_aarch64_stack_protector_guard_offset_str
12379 && opts->x_aarch64_stack_protector_guard_reg_str))
12380 {
12381 error ("both %<-mstack-protector-guard-offset%> and "
12382 "%<-mstack-protector-guard-reg%> must be used "
12383 "with %<-mstack-protector-guard=sysreg%>");
12384 }
12385
12386 if (opts->x_aarch64_stack_protector_guard_reg_str)
12387 {
12388 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12389 error ("specify a system register with a small string length.");
12390 }
12391
12392 if (opts->x_aarch64_stack_protector_guard_offset_str)
12393 {
12394 char *end;
12395 const char *str = aarch64_stack_protector_guard_offset_str;
12396 errno = 0;
12397 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12398 if (!*str || *end || errno)
12399 error ("%qs is not a valid offset in %qs", str,
12400 "-mstack-protector-guard-offset=");
12401 aarch64_stack_protector_guard_offset = offs;
12402 }
12403
12404 initialize_aarch64_code_model (opts);
12405 initialize_aarch64_tls_size (opts);
12406
12407 int queue_depth = 0;
12408 switch (aarch64_tune_params.autoprefetcher_model)
12409 {
12410 case tune_params::AUTOPREFETCHER_OFF:
12411 queue_depth = -1;
12412 break;
12413 case tune_params::AUTOPREFETCHER_WEAK:
12414 queue_depth = 0;
12415 break;
12416 case tune_params::AUTOPREFETCHER_STRONG:
12417 queue_depth = max_insn_queue_index + 1;
12418 break;
12419 default:
12420 gcc_unreachable ();
12421 }
12422
12423 /* We don't mind passing in global_options_set here as we don't use
12424 the *options_set structs anyway. */
12425 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12426 queue_depth,
12427 opts->x_param_values,
12428 global_options_set.x_param_values);
12429
12430 /* Set up parameters to be used in prefetching algorithm. Do not
12431 override the defaults unless we are tuning for a core we have
12432 researched values for. */
12433 if (aarch64_tune_params.prefetch->num_slots > 0)
12434 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12435 aarch64_tune_params.prefetch->num_slots,
12436 opts->x_param_values,
12437 global_options_set.x_param_values);
12438 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12439 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12440 aarch64_tune_params.prefetch->l1_cache_size,
12441 opts->x_param_values,
12442 global_options_set.x_param_values);
12443 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12444 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12445 aarch64_tune_params.prefetch->l1_cache_line_size,
12446 opts->x_param_values,
12447 global_options_set.x_param_values);
12448 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12449 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12450 aarch64_tune_params.prefetch->l2_cache_size,
12451 opts->x_param_values,
12452 global_options_set.x_param_values);
12453 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12454 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12455 0,
12456 opts->x_param_values,
12457 global_options_set.x_param_values);
12458 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12459 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12460 aarch64_tune_params.prefetch->minimum_stride,
12461 opts->x_param_values,
12462 global_options_set.x_param_values);
12463
12464 /* Use the alternative scheduling-pressure algorithm by default. */
12465 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12466 opts->x_param_values,
12467 global_options_set.x_param_values);
12468
12469 /* If the user hasn't changed it via configure, then set the default to 64 KB
12470 for the backend. */
12471 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12472 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12473 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12474 opts->x_param_values,
12475 global_options_set.x_param_values);
12476
12477 /* Validate the guard size. */
12478 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12479
12480 /* Enforce that the probing interval is the same as the guard size so the
12481 mid-end does the right thing. */
12482 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12483 guard_size,
12484 opts->x_param_values,
12485 global_options_set.x_param_values);
12486
12487 /* The maybe_set calls won't update the value if the user has explicitly set
12488 one. Which means we need to validate that probing interval and guard size
12489 are equal. */
12490 int probe_interval
12491 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12492 if (guard_size != probe_interval)
12493 error ("stack clash guard size %<%d%> must be equal to probing interval "
12494 "%<%d%>", guard_size, probe_interval);
12495
12496 /* Enable software prefetching at the specified optimization level for
12497 cores that have prefetch tuning parameters, provided the user has not
12498 set -fprefetch-loop-arrays explicitly and we are not optimizing for size. */
12499 if (opts->x_flag_prefetch_loop_arrays < 0
12500 && !opts->x_optimize_size
12501 && aarch64_tune_params.prefetch->default_opt_level >= 0
12502 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12503 opts->x_flag_prefetch_loop_arrays = 1;
12504
12505 if (opts->x_aarch64_arch_string == NULL)
12506 opts->x_aarch64_arch_string = selected_arch->name;
12507 if (opts->x_aarch64_cpu_string == NULL)
12508 opts->x_aarch64_cpu_string = selected_cpu->name;
12509 if (opts->x_aarch64_tune_string == NULL)
12510 opts->x_aarch64_tune_string = selected_tune->name;
12511
12512 aarch64_override_options_after_change_1 (opts);
12513 }
12514
12515 /* Print a hint with a suggestion for a core or architecture name that
12516 most closely resembles what the user passed in STR. ARCH is true if
12517 the user is asking for an architecture name. ARCH is false if the user
12518 is asking for a core name. */
12519
12520 static void
12521 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12522 {
12523 auto_vec<const char *> candidates;
12524 const struct processor *entry = arch ? all_architectures : all_cores;
12525 for (; entry->name != NULL; entry++)
12526 candidates.safe_push (entry->name);
12527
12528 #ifdef HAVE_LOCAL_CPU_DETECT
12529 /* Also add "native" as a possible value. */
12530 if (arch)
12531 candidates.safe_push ("native");
12532 #endif
12533
12534 char *s;
12535 const char *hint = candidates_list_and_hint (str, s, candidates);
12536 if (hint)
12537 inform (input_location, "valid arguments are: %s;"
12538 " did you mean %qs?", s, hint);
12539 else
12540 inform (input_location, "valid arguments are: %s", s);
12541
12542 XDELETEVEC (s);
12543 }
12544
12545 /* Print a hint with a suggestion for a core name that most closely resembles
12546 what the user passed in STR. */
12547
12548 inline static void
12549 aarch64_print_hint_for_core (const char *str)
12550 {
12551 aarch64_print_hint_for_core_or_arch (str, false);
12552 }
12553
12554 /* Print a hint with a suggestion for an architecture name that most closely
12555 resembles what the user passed in STR. */
12556
12557 inline static void
12558 aarch64_print_hint_for_arch (const char *str)
12559 {
12560 aarch64_print_hint_for_core_or_arch (str, true);
12561 }
12562
12563
12564 /* Print a hint with a suggestion for an extension name
12565 that most closely resembles what the user passed in STR. */
12566
12567 void
12568 aarch64_print_hint_for_extensions (const std::string &str)
12569 {
12570 auto_vec<const char *> candidates;
12571 aarch64_get_all_extension_candidates (&candidates);
12572 char *s;
12573 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12574 if (hint)
12575 inform (input_location, "valid arguments are: %s;"
12576 " did you mean %qs?", s, hint);
12577 else
12578 inform (input_location, "valid arguments are: %s", s);
12579
12580 XDELETEVEC (s);
12581 }
12582
12583 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12584 specified in STR and throw errors if appropriate.  Put the results,
12585 if they are valid, in RES and ISA_FLAGS.  Return whether the option is
12586 valid. */
12587
12588 static bool
12589 aarch64_validate_mcpu (const char *str, const struct processor **res,
12590 uint64_t *isa_flags)
12591 {
12592 std::string invalid_extension;
12593 enum aarch64_parse_opt_result parse_res
12594 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12595
12596 if (parse_res == AARCH64_PARSE_OK)
12597 return true;
12598
12599 switch (parse_res)
12600 {
12601 case AARCH64_PARSE_MISSING_ARG:
12602 error ("missing cpu name in %<-mcpu=%s%>", str);
12603 break;
12604 case AARCH64_PARSE_INVALID_ARG:
12605 error ("unknown value %qs for %<-mcpu%>", str);
12606 aarch64_print_hint_for_core (str);
12607 break;
12608 case AARCH64_PARSE_INVALID_FEATURE:
12609 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12610 invalid_extension.c_str (), str);
12611 aarch64_print_hint_for_extensions (invalid_extension);
12612 break;
12613 default:
12614 gcc_unreachable ();
12615 }
12616
12617 return false;
12618 }
12619
12620 /* Parses CONST_STR for branch protection features specified in
12621 aarch64_branch_protect_types, and sets any global variables required.
12622 Returns the parsing result and copies the last processed token from
12623 CONST_STR into LAST_STR so that it can be used for error reporting. */
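/* For example, a string such as "standard" or "pac-ret+leaf+b-key+bti" is
   split on '+': "pac-ret" matches a top-level type, "leaf" and "b-key"
   match its subtypes, and "bti" matches another top-level type.  The exact
   set of accepted names is defined by aarch64_branch_protect_types.  */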
12624
12625 static enum
12626 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12627 char** last_str)
12628 {
12629 char *str_root = xstrdup (const_str);
12630 char* token_save = NULL;
12631 char *str = strtok_r (str_root, "+", &token_save);
12632 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12633 if (!str)
12634 res = AARCH64_PARSE_MISSING_ARG;
12635 else
12636 {
12637 char *next_str = strtok_r (NULL, "+", &token_save);
12638 /* Reset the branch protection features to their defaults. */
12639 aarch64_handle_no_branch_protection (NULL, NULL);
12640
12641 while (str && res == AARCH64_PARSE_OK)
12642 {
12643 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12644 bool found = false;
12645 /* Search for this type. */
12646 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12647 {
12648 if (strcmp (str, type->name) == 0)
12649 {
12650 found = true;
12651 res = type->handler (str, next_str);
12652 str = next_str;
12653 next_str = strtok_r (NULL, "+", &token_save);
12654 }
12655 else
12656 type++;
12657 }
12658 if (found && res == AARCH64_PARSE_OK)
12659 {
12660 bool found_subtype = true;
12661 /* Loop through each token until we find one that isn't a
12662 subtype. */
12663 while (found_subtype)
12664 {
12665 found_subtype = false;
12666 const aarch64_branch_protect_type *subtype = type->subtypes;
12667 /* Search for the subtype. */
12668 while (str && subtype && subtype->name && !found_subtype
12669 && res == AARCH64_PARSE_OK)
12670 {
12671 if (strcmp (str, subtype->name) == 0)
12672 {
12673 found_subtype = true;
12674 res = subtype->handler (str, next_str);
12675 str = next_str;
12676 next_str = strtok_r (NULL, "+", &token_save);
12677 }
12678 else
12679 subtype++;
12680 }
12681 }
12682 }
12683 else if (!found)
12684 res = AARCH64_PARSE_INVALID_ARG;
12685 }
12686 }
12687 /* Copy the last processed token into the argument to pass it back.
12688 Used by option and attribute validation to print the offending token. */
12689 if (last_str)
12690 {
12691 if (str) strcpy (*last_str, str);
12692 else *last_str = NULL;
12693 }
12694 if (res == AARCH64_PARSE_OK)
12695 {
12696 /* If needed, allocate the accepted string and then copy in const_str.
12697 Used by aarch64_override_options_after_change_1. */
12698 if (!accepted_branch_protection_string)
12699 accepted_branch_protection_string = (char *) xmalloc (
12700 BRANCH_PROTECT_STR_MAX
12701 + 1);
12702 strncpy (accepted_branch_protection_string, const_str,
12703 BRANCH_PROTECT_STR_MAX + 1);
12704 /* Forcibly null-terminate. */
12705 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12706 }
12707 return res;
12708 }
12709
12710 static bool
12711 aarch64_validate_mbranch_protection (const char *const_str)
12712 {
12713 char *str = (char *) xmalloc (strlen (const_str) + 1);
12714 enum aarch64_parse_opt_result res =
12715 aarch64_parse_branch_protection (const_str, &str);
12716 if (res == AARCH64_PARSE_INVALID_ARG)
12717 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12718 else if (res == AARCH64_PARSE_MISSING_ARG)
12719 error ("missing argument for %<-mbranch-protection=%>");
12720 free (str);
12721 return res == AARCH64_PARSE_OK;
12722 }
12723
12724 /* Validate a command-line -march option. Parse the arch and extensions
12725 (if any) specified in STR and throw errors if appropriate. Put the
12726 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12727 option is valid. */
12728
12729 static bool
12730 aarch64_validate_march (const char *str, const struct processor **res,
12731 uint64_t *isa_flags)
12732 {
12733 std::string invalid_extension;
12734 enum aarch64_parse_opt_result parse_res
12735 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12736
12737 if (parse_res == AARCH64_PARSE_OK)
12738 return true;
12739
12740 switch (parse_res)
12741 {
12742 case AARCH64_PARSE_MISSING_ARG:
12743 error ("missing arch name in %<-march=%s%>", str);
12744 break;
12745 case AARCH64_PARSE_INVALID_ARG:
12746 error ("unknown value %qs for %<-march%>", str);
12747 aarch64_print_hint_for_arch (str);
12748 break;
12749 case AARCH64_PARSE_INVALID_FEATURE:
12750 error ("invalid feature modifier %qs in %<-march=%s%>",
12751 invalid_extension.c_str (), str);
12752 aarch64_print_hint_for_extensions (invalid_extension);
12753 break;
12754 default:
12755 gcc_unreachable ();
12756 }
12757
12758 return false;
12759 }
12760
12761 /* Validate a command-line -mtune option. Parse the cpu
12762 specified in STR and throw errors if appropriate. Put the
12763 result, if it is valid, in RES. Return whether the option is
12764 valid. */
12765
12766 static bool
12767 aarch64_validate_mtune (const char *str, const struct processor **res)
12768 {
12769 enum aarch64_parse_opt_result parse_res
12770 = aarch64_parse_tune (str, res);
12771
12772 if (parse_res == AARCH64_PARSE_OK)
12773 return true;
12774
12775 switch (parse_res)
12776 {
12777 case AARCH64_PARSE_MISSING_ARG:
12778 error ("missing cpu name in %<-mtune=%s%>", str);
12779 break;
12780 case AARCH64_PARSE_INVALID_ARG:
12781 error ("unknown value %qs for %<-mtune%>", str);
12782 aarch64_print_hint_for_core (str);
12783 break;
12784 default:
12785 gcc_unreachable ();
12786 }
12787 return false;
12788 }
12789
12790 /* Return the CPU corresponding to the enum CPU.
12791 If it doesn't specify a cpu, return the default. */
12792
12793 static const struct processor *
12794 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12795 {
12796 if (cpu != aarch64_none)
12797 return &all_cores[cpu];
12798
12799 /* The & 0x3f is to extract the bottom 6 bits that encode the
12800 default cpu as selected by the --with-cpu GCC configure option
12801 in config.gcc.
12802 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12803 flags mechanism should be reworked to make it more sane. */
12804 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12805 }
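/* A sketch of the assumed TARGET_CPU_DEFAULT encoding, inferred from the
   "& 0x3f" here and the ">> 6" in aarch64_override_options below: bits
   [5:0] hold the enum aarch64_processor value of the configure-time CPU,
   and the remaining bits hold that CPU's default ISA flags.  */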
12806
12807 /* Return the architecture corresponding to the enum ARCH.
12808 If it doesn't specify a valid architecture, return the default. */
12809
12810 static const struct processor *
12811 aarch64_get_arch (enum aarch64_arch arch)
12812 {
12813 if (arch != aarch64_no_arch)
12814 return &all_architectures[arch];
12815
12816 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12817
12818 return &all_architectures[cpu->arch];
12819 }
12820
12821 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12822
12823 static poly_uint16
12824 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12825 {
12826 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12827 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12828 deciding which .md file patterns to use and when deciding whether
12829 something is a legitimate address or constant. */
12830 if (value == SVE_SCALABLE || value == SVE_128)
12831 return poly_uint16 (2, 2);
12832 else
12833 return (int) value / 64;
12834 }
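/* Examples of the mapping implemented above (VG is the number of 64-bit
   granules in an SVE vector):
     -msve-vector-bits=scalable -> poly_uint16 (2, 2)  (VL-agnostic)
     -msve-vector-bits=128      -> poly_uint16 (2, 2)  (also VL-agnostic)
     -msve-vector-bits=256      -> 4   (256 / 64)
     -msve-vector-bits=512      -> 8   (512 / 64)  */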
12835
12836 /* Implement TARGET_OPTION_OVERRIDE.  This is called once at the beginning
12837 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12838 tuning structs. In particular it must set selected_tune and
12839 aarch64_isa_flags that define the available ISA features and tuning
12840 decisions. It must also set selected_arch as this will be used to
12841 output the .arch asm tags for each function. */
12842
12843 static void
12844 aarch64_override_options (void)
12845 {
12846 uint64_t cpu_isa = 0;
12847 uint64_t arch_isa = 0;
12848 aarch64_isa_flags = 0;
12849
12850 bool valid_cpu = true;
12851 bool valid_tune = true;
12852 bool valid_arch = true;
12853
12854 selected_cpu = NULL;
12855 selected_arch = NULL;
12856 selected_tune = NULL;
12857
12858 if (aarch64_branch_protection_string)
12859 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12860
12861 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12862 If either of -march or -mtune is given, they override their
12863 respective component of -mcpu. */
12864 if (aarch64_cpu_string)
12865 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12866 &cpu_isa);
12867
12868 if (aarch64_arch_string)
12869 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12870 &arch_isa);
12871
12872 if (aarch64_tune_string)
12873 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12874
12875 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12876 SUBTARGET_OVERRIDE_OPTIONS;
12877 #endif
12878
12879 /* If the user did not specify a processor, choose the default
12880 one for them. This will be the CPU set during configuration using
12881 --with-cpu, otherwise it is "generic". */
12882 if (!selected_cpu)
12883 {
12884 if (selected_arch)
12885 {
12886 selected_cpu = &all_cores[selected_arch->ident];
12887 aarch64_isa_flags = arch_isa;
12888 explicit_arch = selected_arch->arch;
12889 }
12890 else
12891 {
12892 /* Get default configure-time CPU. */
12893 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12894 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12895 }
12896
12897 if (selected_tune)
12898 explicit_tune_core = selected_tune->ident;
12899 }
12900 /* If both -mcpu and -march are specified check that they are architecturally
12901 compatible, warn if they're not and prefer the -march ISA flags. */
12902 else if (selected_arch)
12903 {
12904 if (selected_arch->arch != selected_cpu->arch)
12905 {
12906 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12907 all_architectures[selected_cpu->arch].name,
12908 selected_arch->name);
12909 }
12910 aarch64_isa_flags = arch_isa;
12911 explicit_arch = selected_arch->arch;
12912 explicit_tune_core = selected_tune ? selected_tune->ident
12913 : selected_cpu->ident;
12914 }
12915 else
12916 {
12917 /* -mcpu but no -march. */
12918 aarch64_isa_flags = cpu_isa;
12919 explicit_tune_core = selected_tune ? selected_tune->ident
12920 : selected_cpu->ident;
12921 gcc_assert (selected_cpu);
12922 selected_arch = &all_architectures[selected_cpu->arch];
12923 explicit_arch = selected_arch->arch;
12924 }
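/* A sketch of how the cases above combine (the CPU and arch names are only
   examples; Cortex-A57 is an Armv8-A core in all_cores):
     no -mcpu/-march:                   configure-time default, else generic.
     -march=armv8-a only:               selected_cpu comes from the matching
                                        all_cores entry for that arch, ISA
                                        flags come from the arch.
     -mcpu=cortex-a57 only:             arch, tune and ISA all follow the CPU.
     -mcpu=cortex-a57 -march=armv8.2-a: mismatch warning above; the -march
                                        ISA flags and arch win, while tuning
                                        stays with the CPU.  */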
12925
12926 /* Set the arch as well, as we will need it when outputting
12927 the .arch directive in assembly. */
12928 if (!selected_arch)
12929 {
12930 gcc_assert (selected_cpu);
12931 selected_arch = &all_architectures[selected_cpu->arch];
12932 }
12933
12934 if (!selected_tune)
12935 selected_tune = selected_cpu;
12936
12937 if (aarch64_enable_bti == 2)
12938 {
12939 #ifdef TARGET_ENABLE_BTI
12940 aarch64_enable_bti = 1;
12941 #else
12942 aarch64_enable_bti = 0;
12943 #endif
12944 }
12945
12946 /* Return address signing is currently not supported for ILP32 targets. For
12947 LP64 targets use the configured option in the absence of a command-line
12948 option for -mbranch-protection. */
12949 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12950 {
12951 #ifdef TARGET_ENABLE_PAC_RET
12952 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12953 #else
12954 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12955 #endif
12956 }
12957
12958 #ifndef HAVE_AS_MABI_OPTION
12959 /* The compiler may have been configured with 2.23.* binutils, which does
12960 not have support for ILP32. */
12961 if (TARGET_ILP32)
12962 error ("assembler does not support %<-mabi=ilp32%>");
12963 #endif
12964
12965 /* Convert -msve-vector-bits to a VG count. */
12966 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12967
12968 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12969 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12970
12971 /* Make sure we properly set up the explicit options. */
12972 if ((aarch64_cpu_string && valid_cpu)
12973 || (aarch64_tune_string && valid_tune))
12974 gcc_assert (explicit_tune_core != aarch64_none);
12975
12976 if ((aarch64_cpu_string && valid_cpu)
12977 || (aarch64_arch_string && valid_arch))
12978 gcc_assert (explicit_arch != aarch64_no_arch);
12979
12980 /* The pass to insert speculation tracking runs before
12981 shrink-wrapping and the latter does not know how to update the
12982 tracking status. So disable it in this case. */
12983 if (aarch64_track_speculation)
12984 flag_shrink_wrap = 0;
12985
12986 aarch64_override_options_internal (&global_options);
12987
12988 /* Save these options as the default ones in case we push and pop them later
12989 while processing functions with potential target attributes. */
12990 target_option_default_node = target_option_current_node
12991 = build_target_option_node (&global_options);
12992 }
12993
12994 /* Implement targetm.override_options_after_change. */
12995
12996 static void
12997 aarch64_override_options_after_change (void)
12998 {
12999 aarch64_override_options_after_change_1 (&global_options);
13000 }
13001
13002 static struct machine_function *
13003 aarch64_init_machine_status (void)
13004 {
13005 struct machine_function *machine;
13006 machine = ggc_cleared_alloc<machine_function> ();
13007 return machine;
13008 }
13009
13010 void
13011 aarch64_init_expanders (void)
13012 {
13013 init_machine_status = aarch64_init_machine_status;
13014 }
13015
13016 /* Select the code model: set aarch64_cmodel from OPTS, adjusting for PIC. */
13017 static void
13018 initialize_aarch64_code_model (struct gcc_options *opts)
13019 {
13020 if (opts->x_flag_pic)
13021 {
13022 switch (opts->x_aarch64_cmodel_var)
13023 {
13024 case AARCH64_CMODEL_TINY:
13025 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13026 break;
13027 case AARCH64_CMODEL_SMALL:
13028 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13029 aarch64_cmodel = (flag_pic == 2
13030 ? AARCH64_CMODEL_SMALL_PIC
13031 : AARCH64_CMODEL_SMALL_SPIC);
13032 #else
13033 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13034 #endif
13035 break;
13036 case AARCH64_CMODEL_LARGE:
13037 sorry ("code model %qs with %<-f%s%>", "large",
13038 opts->x_flag_pic > 1 ? "PIC" : "pic");
13039 break;
13040 default:
13041 gcc_unreachable ();
13042 }
13043 }
13044 else
13045 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13046 }
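/* Resulting model for the combinations handled above (assuming an assembler
   with HAVE_AS_SMALL_PIC_RELOCS):
     -mcmodel=tiny  -fpic/-fPIC -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=small -fpic       -> AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small -fPIC       -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic/-fPIC -> rejected with sorry ()
     no -fpic/-fPIC             -> the requested model is used unchanged.  */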
13047
13048 /* Implement TARGET_OPTION_SAVE. */
13049
13050 static void
13051 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13052 {
13053 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13054 ptr->x_aarch64_branch_protection_string
13055 = opts->x_aarch64_branch_protection_string;
13056 }
13057
13058 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13059 using the information saved in PTR. */
13060
13061 static void
13062 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13063 {
13064 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13065 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13066 opts->x_explicit_arch = ptr->x_explicit_arch;
13067 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13068 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13069 opts->x_aarch64_branch_protection_string
13070 = ptr->x_aarch64_branch_protection_string;
13071 if (opts->x_aarch64_branch_protection_string)
13072 {
13073 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13074 NULL);
13075 }
13076
13077 aarch64_override_options_internal (opts);
13078 }
13079
13080 /* Implement TARGET_OPTION_PRINT. */
13081
13082 static void
13083 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13084 {
13085 const struct processor *cpu
13086 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13087 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13088 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13089 std::string extension
13090 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13091
13092 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13093 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13094 arch->name, extension.c_str ());
13095 }
13096
13097 static GTY(()) tree aarch64_previous_fndecl;
13098
13099 void
13100 aarch64_reset_previous_fndecl (void)
13101 {
13102 aarch64_previous_fndecl = NULL;
13103 }
13104
13105 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13106 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13107 make sure optab availability predicates are recomputed when necessary. */
13108
13109 void
13110 aarch64_save_restore_target_globals (tree new_tree)
13111 {
13112 if (TREE_TARGET_GLOBALS (new_tree))
13113 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13114 else if (new_tree == target_option_default_node)
13115 restore_target_globals (&default_target_globals);
13116 else
13117 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13118 }
13119
13120 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13121 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13122 of the function, if such exists. This function may be called multiple
13123 times on a single function so use aarch64_previous_fndecl to avoid
13124 setting up identical state. */
13125
13126 static void
13127 aarch64_set_current_function (tree fndecl)
13128 {
13129 if (!fndecl || fndecl == aarch64_previous_fndecl)
13130 return;
13131
13132 tree old_tree = (aarch64_previous_fndecl
13133 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13134 : NULL_TREE);
13135
13136 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13137
13138 /* If current function has no attributes but the previous one did,
13139 use the default node. */
13140 if (!new_tree && old_tree)
13141 new_tree = target_option_default_node;
13142
13143 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13144 the default have been handled by aarch64_save_restore_target_globals from
13145 aarch64_pragma_target_parse. */
13146 if (old_tree == new_tree)
13147 return;
13148
13149 aarch64_previous_fndecl = fndecl;
13150
13151 /* First set the target options. */
13152 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13153
13154 aarch64_save_restore_target_globals (new_tree);
13155 }
13156
13157 /* Enum describing the various ways we can handle attributes.
13158 In many cases we can reuse the generic option handling machinery. */
13159
13160 enum aarch64_attr_opt_type
13161 {
13162 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13163 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13164 aarch64_attr_enum, /* Attribute sets an enum variable. */
13165 aarch64_attr_custom /* Attribute requires a custom handling function. */
13166 };
13167
13168 /* All the information needed to handle a target attribute.
13169 NAME is the name of the attribute.
13170 ATTR_TYPE specifies the type of behavior of the attribute as described
13171 in the definition of enum aarch64_attr_opt_type.
13172 ALLOW_NEG is true if the attribute supports a "no-" form.
13173 HANDLER is the function that takes the attribute string as an argument.
13174 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13175 OPT_NUM is the enum specifying the option that the attribute modifies.
13176 This is needed for attributes that mirror the behavior of a command-line
13177 option, that is, those whose ATTR_TYPE is aarch64_attr_mask,
13178 aarch64_attr_bool or aarch64_attr_enum. */
13179
13180 struct aarch64_attribute_info
13181 {
13182 const char *name;
13183 enum aarch64_attr_opt_type attr_type;
13184 bool allow_neg;
13185 bool (*handler) (const char *);
13186 enum opt_code opt_num;
13187 };
13188
13189 /* Handle the ARCH_STR argument to the arch= target attribute. */
13190
13191 static bool
13192 aarch64_handle_attr_arch (const char *str)
13193 {
13194 const struct processor *tmp_arch = NULL;
13195 std::string invalid_extension;
13196 enum aarch64_parse_opt_result parse_res
13197 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13198
13199 if (parse_res == AARCH64_PARSE_OK)
13200 {
13201 gcc_assert (tmp_arch);
13202 selected_arch = tmp_arch;
13203 explicit_arch = selected_arch->arch;
13204 return true;
13205 }
13206
13207 switch (parse_res)
13208 {
13209 case AARCH64_PARSE_MISSING_ARG:
13210 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13211 break;
13212 case AARCH64_PARSE_INVALID_ARG:
13213 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13214 aarch64_print_hint_for_arch (str);
13215 break;
13216 case AARCH64_PARSE_INVALID_FEATURE:
13217 error ("invalid feature modifier %s of value (\"%s\") in "
13218 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13219 aarch64_print_hint_for_extensions (invalid_extension);
13220 break;
13221 default:
13222 gcc_unreachable ();
13223 }
13224
13225 return false;
13226 }
13227
13228 /* Handle the argument CPU_STR to the cpu= target attribute. */
13229
13230 static bool
13231 aarch64_handle_attr_cpu (const char *str)
13232 {
13233 const struct processor *tmp_cpu = NULL;
13234 std::string invalid_extension;
13235 enum aarch64_parse_opt_result parse_res
13236 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13237
13238 if (parse_res == AARCH64_PARSE_OK)
13239 {
13240 gcc_assert (tmp_cpu);
13241 selected_tune = tmp_cpu;
13242 explicit_tune_core = selected_tune->ident;
13243
13244 selected_arch = &all_architectures[tmp_cpu->arch];
13245 explicit_arch = selected_arch->arch;
13246 return true;
13247 }
13248
13249 switch (parse_res)
13250 {
13251 case AARCH64_PARSE_MISSING_ARG:
13252 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13253 break;
13254 case AARCH64_PARSE_INVALID_ARG:
13255 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13256 aarch64_print_hint_for_core (str);
13257 break;
13258 case AARCH64_PARSE_INVALID_FEATURE:
13259 error ("invalid feature modifier %s of value (\"%s\") in "
13260 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13261 aarch64_print_hint_for_extensions (invalid_extension);
13262 break;
13263 default:
13264 gcc_unreachable ();
13265 }
13266
13267 return false;
13268 }
13269
13270 /* Handle the argument STR to the branch-protection= attribute. */
13271
13272 static bool
13273 aarch64_handle_attr_branch_protection (const char* str)
13274 {
13275 char *err_str = (char *) xmalloc (strlen (str) + 1);
13276 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13277 &err_str);
13278 bool success = false;
13279 switch (res)
13280 {
13281 case AARCH64_PARSE_MISSING_ARG:
13282 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13283 " attribute");
13284 break;
13285 case AARCH64_PARSE_INVALID_ARG:
13286 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13287 "=\")%> pragma or attribute", err_str);
13288 break;
13289 case AARCH64_PARSE_OK:
13290 success = true;
13291 /* Fall through. */
13292 case AARCH64_PARSE_INVALID_FEATURE:
13293 break;
13294 default:
13295 gcc_unreachable ();
13296 }
13297 free (err_str);
13298 return success;
13299 }
13300
13301 /* Handle the argument STR to the tune= target attribute. */
13302
13303 static bool
13304 aarch64_handle_attr_tune (const char *str)
13305 {
13306 const struct processor *tmp_tune = NULL;
13307 enum aarch64_parse_opt_result parse_res
13308 = aarch64_parse_tune (str, &tmp_tune);
13309
13310 if (parse_res == AARCH64_PARSE_OK)
13311 {
13312 gcc_assert (tmp_tune);
13313 selected_tune = tmp_tune;
13314 explicit_tune_core = selected_tune->ident;
13315 return true;
13316 }
13317
13318 switch (parse_res)
13319 {
13320 case AARCH64_PARSE_INVALID_ARG:
13321 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13322 aarch64_print_hint_for_core (str);
13323 break;
13324 default:
13325 gcc_unreachable ();
13326 }
13327
13328 return false;
13329 }
13330
13331 /* Parse an architecture extensions target attribute string specified in STR.
13332 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13333 if successful. Update aarch64_isa_flags to reflect the ISA features
13334 modified. */
13335
13336 static bool
13337 aarch64_handle_attr_isa_flags (char *str)
13338 {
13339 enum aarch64_parse_opt_result parse_res;
13340 uint64_t isa_flags = aarch64_isa_flags;
13341
13342 /* We allow "+nothing" at the beginning to clear out all architectural
13343 features if the user wants to handpick specific features. */
13344 if (strncmp ("+nothing", str, 8) == 0)
13345 {
13346 isa_flags = 0;
13347 str += 8;
13348 }
13349
13350 std::string invalid_extension;
13351 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13352
13353 if (parse_res == AARCH64_PARSE_OK)
13354 {
13355 aarch64_isa_flags = isa_flags;
13356 return true;
13357 }
13358
13359 switch (parse_res)
13360 {
13361 case AARCH64_PARSE_MISSING_ARG:
13362 error ("missing value in %<target()%> pragma or attribute");
13363 break;
13364
13365 case AARCH64_PARSE_INVALID_FEATURE:
13366 error ("invalid feature modifier %s of value (\"%s\") in "
13367 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13368 break;
13369
13370 default:
13371 gcc_unreachable ();
13372 }
13373
13374 return false;
13375 }
13376
13377 /* The target attributes that we support. On top of these we also support just
13378 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13379 handled explicitly in aarch64_process_one_target_attr. */
13380
13381 static const struct aarch64_attribute_info aarch64_attributes[] =
13382 {
13383 { "general-regs-only", aarch64_attr_mask, false, NULL,
13384 OPT_mgeneral_regs_only },
13385 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13386 OPT_mfix_cortex_a53_835769 },
13387 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13388 OPT_mfix_cortex_a53_843419 },
13389 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13390 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13391 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13392 OPT_momit_leaf_frame_pointer },
13393 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13394 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13395 OPT_march_ },
13396 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13397 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13398 OPT_mtune_ },
13399 { "branch-protection", aarch64_attr_custom, false,
13400 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13401 { "sign-return-address", aarch64_attr_enum, false, NULL,
13402 OPT_msign_return_address_ },
13403 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13404 };
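/* Some illustrative uses of the entries above:
     __attribute__ ((target ("arch=armv8-a+crc")))  custom handler (arch=).
     __attribute__ ((target ("no-strict-align")))   mask attribute, negated.
     __attribute__ ((target ("cmodel=small")))      enum attribute.
     __attribute__ ((target ("+crc")))              bare ISA extension, handled
                                                    separately in
                                                    aarch64_process_one_target_attr.  */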
13405
13406 /* Parse ARG_STR which contains the definition of one target attribute.
13407 Show appropriate errors if any or return true if the attribute is valid. */
13408
13409 static bool
13410 aarch64_process_one_target_attr (char *arg_str)
13411 {
13412 bool invert = false;
13413
13414 size_t len = strlen (arg_str);
13415
13416 if (len == 0)
13417 {
13418 error ("malformed %<target()%> pragma or attribute");
13419 return false;
13420 }
13421
13422 char *str_to_check = (char *) alloca (len + 1);
13423 strcpy (str_to_check, arg_str);
13424
13425 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13426 It is easier to detect and handle it explicitly here rather than going
13427 through the machinery for the rest of the target attributes in this
13428 function. */
13429 if (*str_to_check == '+')
13430 return aarch64_handle_attr_isa_flags (str_to_check);
13431
13432 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13433 {
13434 invert = true;
13435 str_to_check += 3;
13436 }
13437 char *arg = strchr (str_to_check, '=');
13438
13439 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13440 and point ARG to "foo". */
13441 if (arg)
13442 {
13443 *arg = '\0';
13444 arg++;
13445 }
13446 const struct aarch64_attribute_info *p_attr;
13447 bool found = false;
13448 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13449 {
13450 /* If the names don't match up, or the user has given an argument
13451 to an attribute that doesn't accept one, or didn't give an argument
13452 to an attribute that expects one, fail to match. */
13453 if (strcmp (str_to_check, p_attr->name) != 0)
13454 continue;
13455
13456 found = true;
13457 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13458 || p_attr->attr_type == aarch64_attr_enum;
13459
13460 if (attr_need_arg_p ^ (arg != NULL))
13461 {
13462 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13463 return false;
13464 }
13465
13466 /* If the name matches but the attribute does not allow "no-" versions
13467 then we can't match. */
13468 if (invert && !p_attr->allow_neg)
13469 {
13470 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13471 return false;
13472 }
13473
13474 switch (p_attr->attr_type)
13475 {
13476 /* Has a custom handler registered.
13477 For example, cpu=, arch=, tune=. */
13478 case aarch64_attr_custom:
13479 gcc_assert (p_attr->handler);
13480 if (!p_attr->handler (arg))
13481 return false;
13482 break;
13483
13484 /* Either set or unset a boolean option. */
13485 case aarch64_attr_bool:
13486 {
13487 struct cl_decoded_option decoded;
13488
13489 generate_option (p_attr->opt_num, NULL, !invert,
13490 CL_TARGET, &decoded);
13491 aarch64_handle_option (&global_options, &global_options_set,
13492 &decoded, input_location);
13493 break;
13494 }
13495 /* Set or unset a bit in the target_flags. aarch64_handle_option
13496 should know what mask to apply given the option number. */
13497 case aarch64_attr_mask:
13498 {
13499 struct cl_decoded_option decoded;
13500 /* We only need to specify the option number.
13501 aarch64_handle_option will know which mask to apply. */
13502 decoded.opt_index = p_attr->opt_num;
13503 decoded.value = !invert;
13504 aarch64_handle_option (&global_options, &global_options_set,
13505 &decoded, input_location);
13506 break;
13507 }
13508 /* Use the option setting machinery to set an option to an enum. */
13509 case aarch64_attr_enum:
13510 {
13511 gcc_assert (arg);
13512 bool valid;
13513 int value;
13514 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13515 &value, CL_TARGET);
13516 if (valid)
13517 {
13518 set_option (&global_options, NULL, p_attr->opt_num, value,
13519 NULL, DK_UNSPECIFIED, input_location,
13520 global_dc);
13521 }
13522 else
13523 {
13524 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13525 }
13526 break;
13527 }
13528 default:
13529 gcc_unreachable ();
13530 }
13531 }
13532
13533 /* If we reached here we either have found an attribute and validated
13534 it or didn't match any. If we matched an attribute but its arguments
13535 were malformed we will have returned false already. */
13536 return found;
13537 }
13538
13539 /* Count how many times the character C appears in
13540 NULL-terminated string STR. */
13541
13542 static unsigned int
13543 num_occurences_in_str (char c, char *str)
13544 {
13545 unsigned int res = 0;
13546 while (*str != '\0')
13547 {
13548 if (*str == c)
13549 res++;
13550
13551 str++;
13552 }
13553
13554 return res;
13555 }
13556
13557 /* Parse the tree in ARGS that contains the target attribute information
13558 and update the global target options space. */
13559
13560 bool
13561 aarch64_process_target_attr (tree args)
13562 {
13563 if (TREE_CODE (args) == TREE_LIST)
13564 {
13565 do
13566 {
13567 tree head = TREE_VALUE (args);
13568 if (head)
13569 {
13570 if (!aarch64_process_target_attr (head))
13571 return false;
13572 }
13573 args = TREE_CHAIN (args);
13574 } while (args);
13575
13576 return true;
13577 }
13578
13579 if (TREE_CODE (args) != STRING_CST)
13580 {
13581 error ("attribute %<target%> argument not a string");
13582 return false;
13583 }
13584
13585 size_t len = strlen (TREE_STRING_POINTER (args));
13586 char *str_to_check = (char *) alloca (len + 1);
13587 strcpy (str_to_check, TREE_STRING_POINTER (args));
13588
13589 if (len == 0)
13590 {
13591 error ("malformed %<target()%> pragma or attribute");
13592 return false;
13593 }
13594
13595 /* Used to catch empty spaces between commas, e.g.
13596 attribute ((target ("attr1,,attr2"))). */
13597 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13598
13599 /* Handle multiple target attributes separated by ','. */
13600 char *token = strtok_r (str_to_check, ",", &str_to_check);
13601
13602 unsigned int num_attrs = 0;
13603 while (token)
13604 {
13605 num_attrs++;
13606 if (!aarch64_process_one_target_attr (token))
13607 {
13608 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13609 return false;
13610 }
13611
13612 token = strtok_r (NULL, ",", &str_to_check);
13613 }
13614
13615 if (num_attrs != num_commas + 1)
13616 {
13617 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13618 return false;
13619 }
13620
13621 return true;
13622 }
13623
13624 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13625 process attribute ((target ("..."))). */
13626
13627 static bool
13628 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13629 {
13630 struct cl_target_option cur_target;
13631 bool ret;
13632 tree old_optimize;
13633 tree new_target, new_optimize;
13634 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13635
13636 /* If what we're processing is the current pragma string then the
13637 target option node is already stored in target_option_current_node
13638 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13639 having to re-parse the string. This is especially useful to keep
13640 arm_neon.h compile times down since that header contains a lot
13641 of intrinsics enclosed in pragmas. */
13642 if (!existing_target && args == current_target_pragma)
13643 {
13644 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13645 return true;
13646 }
13647 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13648
13649 old_optimize = build_optimization_node (&global_options);
13650 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13651
13652 /* If the function changed the optimization levels as well as setting
13653 target options, start with the optimizations specified. */
13654 if (func_optimize && func_optimize != old_optimize)
13655 cl_optimization_restore (&global_options,
13656 TREE_OPTIMIZATION (func_optimize));
13657
13658 /* Save the current target options to restore at the end. */
13659 cl_target_option_save (&cur_target, &global_options);
13660
13661 /* If fndecl already has some target attributes applied to it, unpack
13662 them so that we add this attribute on top of them, rather than
13663 overwriting them. */
13664 if (existing_target)
13665 {
13666 struct cl_target_option *existing_options
13667 = TREE_TARGET_OPTION (existing_target);
13668
13669 if (existing_options)
13670 cl_target_option_restore (&global_options, existing_options);
13671 }
13672 else
13673 cl_target_option_restore (&global_options,
13674 TREE_TARGET_OPTION (target_option_current_node));
13675
13676 ret = aarch64_process_target_attr (args);
13677
13678 /* Set up any additional state. */
13679 if (ret)
13680 {
13681 aarch64_override_options_internal (&global_options);
13682 /* Initialize SIMD builtins if we haven't already.
13683 Set current_target_pragma to NULL for the duration so that
13684 the builtin initialization code doesn't try to tag the functions
13685 being built with the attributes specified by any current pragma, thus
13686 going into an infinite recursion. */
13687 if (TARGET_SIMD)
13688 {
13689 tree saved_current_target_pragma = current_target_pragma;
13690 current_target_pragma = NULL;
13691 aarch64_init_simd_builtins ();
13692 current_target_pragma = saved_current_target_pragma;
13693 }
13694 new_target = build_target_option_node (&global_options);
13695 }
13696 else
13697 new_target = NULL;
13698
13699 new_optimize = build_optimization_node (&global_options);
13700
13701 if (fndecl && ret)
13702 {
13703 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13704
13705 if (old_optimize != new_optimize)
13706 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13707 }
13708
13709 cl_target_option_restore (&global_options, &cur_target);
13710
13711 if (old_optimize != new_optimize)
13712 cl_optimization_restore (&global_options,
13713 TREE_OPTIMIZATION (old_optimize));
13714 return ret;
13715 }
13716
13717 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13718 tri-bool options (yes, no, don't care) and the default value is
13719 DEF, determine whether to reject inlining. */
13720
13721 static bool
13722 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13723 int dont_care, int def)
13724 {
13725 /* If the callee doesn't care, always allow inlining. */
13726 if (callee == dont_care)
13727 return true;
13728
13729 /* If the caller doesn't care, always allow inlining. */
13730 if (caller == dont_care)
13731 return true;
13732
13733 /* Otherwise, allow inlining if either the callee and caller values
13734 agree, or if the callee is using the default value. */
13735 return (callee == caller || callee == def);
13736 }
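/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for the
   -momit-leaf-frame-pointer check below): caller=1/callee=2 and
   caller=0/callee=1 both allow inlining, whereas caller=1/callee=0 rejects
   it, because the callee explicitly asked for the non-default setting.  */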
13737
13738 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13739 to inline CALLEE into CALLER based on target-specific info.
13740 Make sure that the caller and callee have compatible architectural
13741 features. Then go through the other possible target attributes
13742 and see if they can block inlining. Try not to reject always_inline
13743 callees unless they are incompatible architecturally. */
13744
13745 static bool
13746 aarch64_can_inline_p (tree caller, tree callee)
13747 {
13748 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13749 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13750
13751 struct cl_target_option *caller_opts
13752 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13753 : target_option_default_node);
13754
13755 struct cl_target_option *callee_opts
13756 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13757 : target_option_default_node);
13758
13759 /* Callee's ISA flags should be a subset of the caller's. */
13760 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13761 != callee_opts->x_aarch64_isa_flags)
13762 return false;
13763
13764 /* Allow non-strict aligned functions inlining into strict
13765 aligned ones. */
13766 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13767 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13768 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13769 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13770 return false;
13771
13772 bool always_inline = lookup_attribute ("always_inline",
13773 DECL_ATTRIBUTES (callee));
13774
13775 /* If the architectural features match up and the callee is always_inline
13776 then the other attributes don't matter. */
13777 if (always_inline)
13778 return true;
13779
13780 if (caller_opts->x_aarch64_cmodel_var
13781 != callee_opts->x_aarch64_cmodel_var)
13782 return false;
13783
13784 if (caller_opts->x_aarch64_tls_dialect
13785 != callee_opts->x_aarch64_tls_dialect)
13786 return false;
13787
13788 /* Honour explicit requests to workaround errata. */
13789 if (!aarch64_tribools_ok_for_inlining_p (
13790 caller_opts->x_aarch64_fix_a53_err835769,
13791 callee_opts->x_aarch64_fix_a53_err835769,
13792 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13793 return false;
13794
13795 if (!aarch64_tribools_ok_for_inlining_p (
13796 caller_opts->x_aarch64_fix_a53_err843419,
13797 callee_opts->x_aarch64_fix_a53_err843419,
13798 2, TARGET_FIX_ERR_A53_843419))
13799 return false;
13800
13801 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13802 caller and callee and they don't match up, reject inlining. */
13803 if (!aarch64_tribools_ok_for_inlining_p (
13804 caller_opts->x_flag_omit_leaf_frame_pointer,
13805 callee_opts->x_flag_omit_leaf_frame_pointer,
13806 2, 1))
13807 return false;
13808
13809 /* If the callee has specific tuning overrides, respect them. */
13810 if (callee_opts->x_aarch64_override_tune_string != NULL
13811 && caller_opts->x_aarch64_override_tune_string == NULL)
13812 return false;
13813
13814 /* If the user specified tuning override strings for the
13815 caller and callee and they don't match up, reject inlining.
13816 We just do a string compare here, we don't analyze the meaning
13817 of the string, as it would be too costly for little gain. */
13818 if (callee_opts->x_aarch64_override_tune_string
13819 && caller_opts->x_aarch64_override_tune_string
13820 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13821 caller_opts->x_aarch64_override_tune_string) != 0))
13822 return false;
13823
13824 return true;
13825 }
13826
13827 /* Return true if SYMBOL_REF X binds locally. */
13828
13829 static bool
13830 aarch64_symbol_binds_local_p (const_rtx x)
13831 {
13832 return (SYMBOL_REF_DECL (x)
13833 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13834 : SYMBOL_REF_LOCAL_P (x));
13835 }
13836
13837 /* Return true if SYMBOL_REF X is thread local. */
13838 static bool
13839 aarch64_tls_symbol_p (rtx x)
13840 {
13841 if (! TARGET_HAVE_TLS)
13842 return false;
13843
13844 if (GET_CODE (x) != SYMBOL_REF)
13845 return false;
13846
13847 return SYMBOL_REF_TLS_MODEL (x) != 0;
13848 }
13849
13850 /* Classify a TLS symbol into one of the TLS kinds. */
13851 enum aarch64_symbol_type
13852 aarch64_classify_tls_symbol (rtx x)
13853 {
13854 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13855
13856 switch (tls_kind)
13857 {
13858 case TLS_MODEL_GLOBAL_DYNAMIC:
13859 case TLS_MODEL_LOCAL_DYNAMIC:
13860 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13861
13862 case TLS_MODEL_INITIAL_EXEC:
13863 switch (aarch64_cmodel)
13864 {
13865 case AARCH64_CMODEL_TINY:
13866 case AARCH64_CMODEL_TINY_PIC:
13867 return SYMBOL_TINY_TLSIE;
13868 default:
13869 return SYMBOL_SMALL_TLSIE;
13870 }
13871
13872 case TLS_MODEL_LOCAL_EXEC:
13873 if (aarch64_tls_size == 12)
13874 return SYMBOL_TLSLE12;
13875 else if (aarch64_tls_size == 24)
13876 return SYMBOL_TLSLE24;
13877 else if (aarch64_tls_size == 32)
13878 return SYMBOL_TLSLE32;
13879 else if (aarch64_tls_size == 48)
13880 return SYMBOL_TLSLE48;
13881 else
13882 gcc_unreachable ();
13883
13884 case TLS_MODEL_EMULATED:
13885 case TLS_MODEL_NONE:
13886 return SYMBOL_FORCE_TO_MEM;
13887
13888 default:
13889 gcc_unreachable ();
13890 }
13891 }
13892
13893 /* Return the correct method for accessing X + OFFSET, where X is either
13894 a SYMBOL_REF or LABEL_REF. */
13895
13896 enum aarch64_symbol_type
13897 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13898 {
13899 if (GET_CODE (x) == LABEL_REF)
13900 {
13901 switch (aarch64_cmodel)
13902 {
13903 case AARCH64_CMODEL_LARGE:
13904 return SYMBOL_FORCE_TO_MEM;
13905
13906 case AARCH64_CMODEL_TINY_PIC:
13907 case AARCH64_CMODEL_TINY:
13908 return SYMBOL_TINY_ABSOLUTE;
13909
13910 case AARCH64_CMODEL_SMALL_SPIC:
13911 case AARCH64_CMODEL_SMALL_PIC:
13912 case AARCH64_CMODEL_SMALL:
13913 return SYMBOL_SMALL_ABSOLUTE;
13914
13915 default:
13916 gcc_unreachable ();
13917 }
13918 }
13919
13920 if (GET_CODE (x) == SYMBOL_REF)
13921 {
13922 if (aarch64_tls_symbol_p (x))
13923 return aarch64_classify_tls_symbol (x);
13924
13925 switch (aarch64_cmodel)
13926 {
13927 case AARCH64_CMODEL_TINY:
13928 /* When we retrieve symbol + offset address, we have to make sure
13929 the offset does not cause overflow of the final address. But
13930 we have no way of knowing the address of symbol at compile time
13931 so we can't accurately say if the distance between the PC and
13932 symbol + offset is outside the addressable range of +/-1M in the
13933 TINY code model.  So we rely on images not being greater than 1M
13934 and cap the offset at 1M; anything beyond that will have to be
13935 loaded using an alternative mechanism.  Furthermore, if the
13936 symbol is a weak reference to something that isn't known to
13937 resolve to a symbol in this module, then force to memory. */
13938 if ((SYMBOL_REF_WEAK (x)
13939 && !aarch64_symbol_binds_local_p (x))
13940 || !IN_RANGE (offset, -1048575, 1048575))
13941 return SYMBOL_FORCE_TO_MEM;
13942 return SYMBOL_TINY_ABSOLUTE;
13943
13944 case AARCH64_CMODEL_SMALL:
13945 /* Same reasoning as the tiny code model, but the offset cap here is
13946 4G. */
13947 if ((SYMBOL_REF_WEAK (x)
13948 && !aarch64_symbol_binds_local_p (x))
13949 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13950 HOST_WIDE_INT_C (4294967264)))
13951 return SYMBOL_FORCE_TO_MEM;
13952 return SYMBOL_SMALL_ABSOLUTE;
13953
13954 case AARCH64_CMODEL_TINY_PIC:
13955 if (!aarch64_symbol_binds_local_p (x))
13956 return SYMBOL_TINY_GOT;
13957 return SYMBOL_TINY_ABSOLUTE;
13958
13959 case AARCH64_CMODEL_SMALL_SPIC:
13960 case AARCH64_CMODEL_SMALL_PIC:
13961 if (!aarch64_symbol_binds_local_p (x))
13962 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13963 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13964 return SYMBOL_SMALL_ABSOLUTE;
13965
13966 case AARCH64_CMODEL_LARGE:
13967 /* This is alright even in PIC code as the constant
13968 pool reference is always PC relative and within
13969 the same translation unit. */
13970 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13971 return SYMBOL_SMALL_ABSOLUTE;
13972 else
13973 return SYMBOL_FORCE_TO_MEM;
13974
13975 default:
13976 gcc_unreachable ();
13977 }
13978 }
13979
13980 /* By default push everything into the constant pool. */
13981 return SYMBOL_FORCE_TO_MEM;
13982 }
13983
13984 bool
13985 aarch64_constant_address_p (rtx x)
13986 {
13987 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13988 }
13989
13990 bool
13991 aarch64_legitimate_pic_operand_p (rtx x)
13992 {
13993 if (GET_CODE (x) == SYMBOL_REF
13994 || (GET_CODE (x) == CONST
13995 && GET_CODE (XEXP (x, 0)) == PLUS
13996 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13997 return false;
13998
13999 return true;
14000 }
14001
14002 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14003 that should be rematerialized rather than spilled. */
14004
14005 static bool
14006 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14007 {
14008 /* Support CSE and rematerialization of common constants. */
14009 if (CONST_INT_P (x)
14010 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14011 || GET_CODE (x) == CONST_VECTOR)
14012 return true;
14013
14014 /* Do not allow vector struct mode constants for Advanced SIMD.
14015 We could support 0 and -1 easily, but they need support in
14016 aarch64-simd.md. */
14017 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14018 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14019 return false;
14020
14021 /* Only accept variable-length vector constants if they can be
14022 handled directly.
14023
14024 ??? It would be possible to handle rematerialization of other
14025 constants via secondary reloads. */
14026 if (vec_flags & VEC_ANY_SVE)
14027 return aarch64_simd_valid_immediate (x, NULL);
14028
14029 if (GET_CODE (x) == HIGH)
14030 x = XEXP (x, 0);
14031
14032 /* Accept polynomial constants that can be calculated by using the
14033 destination of a move as the sole temporary. Constants that
14034 require a second temporary cannot be rematerialized (they can't be
14035 forced to memory and also aren't legitimate constants). */
14036 poly_int64 offset;
14037 if (poly_int_rtx_p (x, &offset))
14038 return aarch64_offset_temporaries (false, offset) <= 1;
14039
14040 /* If an offset is being added to something else, we need to allow the
14041 base to be moved into the destination register, meaning that there
14042 are no free temporaries for the offset. */
14043 x = strip_offset (x, &offset);
14044 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14045 return false;
14046
14047 /* Do not allow const (plus (anchor_symbol, const_int)). */
14048 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14049 return false;
14050
14051 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14052 so spilling them is better than rematerialization. */
14053 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14054 return true;
14055
14056 /* Label references are always constant. */
14057 if (GET_CODE (x) == LABEL_REF)
14058 return true;
14059
14060 return false;
14061 }
14062
14063 rtx
14064 aarch64_load_tp (rtx target)
14065 {
14066 if (!target
14067 || GET_MODE (target) != Pmode
14068 || !register_operand (target, Pmode))
14069 target = gen_reg_rtx (Pmode);
14070
14071 /* Can return in any reg. */
14072 emit_insn (gen_aarch64_load_tp_hard (target));
14073 return target;
14074 }
14075
14076 /* On AAPCS systems, this is the "struct __va_list". */
14077 static GTY(()) tree va_list_type;
14078
14079 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14080 Return the type to use as __builtin_va_list.
14081
14082 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14083
14084 struct __va_list
14085 {
14086 void *__stack;
14087 void *__gr_top;
14088 void *__vr_top;
14089 int __gr_offs;
14090 int __vr_offs;
14091 }; */
14092
14093 static tree
14094 aarch64_build_builtin_va_list (void)
14095 {
14096 tree va_list_name;
14097 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14098
14099 /* Create the type. */
14100 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14101 /* Give it the required name. */
14102 va_list_name = build_decl (BUILTINS_LOCATION,
14103 TYPE_DECL,
14104 get_identifier ("__va_list"),
14105 va_list_type);
14106 DECL_ARTIFICIAL (va_list_name) = 1;
14107 TYPE_NAME (va_list_type) = va_list_name;
14108 TYPE_STUB_DECL (va_list_type) = va_list_name;
14109
14110 /* Create the fields. */
14111 f_stack = build_decl (BUILTINS_LOCATION,
14112 FIELD_DECL, get_identifier ("__stack"),
14113 ptr_type_node);
14114 f_grtop = build_decl (BUILTINS_LOCATION,
14115 FIELD_DECL, get_identifier ("__gr_top"),
14116 ptr_type_node);
14117 f_vrtop = build_decl (BUILTINS_LOCATION,
14118 FIELD_DECL, get_identifier ("__vr_top"),
14119 ptr_type_node);
14120 f_groff = build_decl (BUILTINS_LOCATION,
14121 FIELD_DECL, get_identifier ("__gr_offs"),
14122 integer_type_node);
14123 f_vroff = build_decl (BUILTINS_LOCATION,
14124 FIELD_DECL, get_identifier ("__vr_offs"),
14125 integer_type_node);
14126
14127 /* Tell tree-stdarg pass about our internal offset fields.
14128 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14129 purposes, to identify whether the code is updating the va_list internal
14130 offset fields in an irregular way. */
14131 va_list_gpr_counter_field = f_groff;
14132 va_list_fpr_counter_field = f_vroff;
14133
14134 DECL_ARTIFICIAL (f_stack) = 1;
14135 DECL_ARTIFICIAL (f_grtop) = 1;
14136 DECL_ARTIFICIAL (f_vrtop) = 1;
14137 DECL_ARTIFICIAL (f_groff) = 1;
14138 DECL_ARTIFICIAL (f_vroff) = 1;
14139
14140 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14141 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14142 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14143 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14144 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14145
14146 TYPE_FIELDS (va_list_type) = f_stack;
14147 DECL_CHAIN (f_stack) = f_grtop;
14148 DECL_CHAIN (f_grtop) = f_vrtop;
14149 DECL_CHAIN (f_vrtop) = f_groff;
14150 DECL_CHAIN (f_groff) = f_vroff;
14151
14152 /* Compute its layout. */
14153 layout_type (va_list_type);
14154
14155 return va_list_type;
14156 }
14157
14158 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14159 static void
14160 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14161 {
14162 const CUMULATIVE_ARGS *cum;
14163 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14164 tree stack, grtop, vrtop, groff, vroff;
14165 tree t;
14166 int gr_save_area_size = cfun->va_list_gpr_size;
14167 int vr_save_area_size = cfun->va_list_fpr_size;
14168 int vr_offset;
14169
14170 cum = &crtl->args.info;
14171 if (cfun->va_list_gpr_size)
14172 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14173 cfun->va_list_gpr_size);
14174 if (cfun->va_list_fpr_size)
14175 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14176 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14177
14178 if (!TARGET_FLOAT)
14179 {
14180 gcc_assert (cum->aapcs_nvrn == 0);
14181 vr_save_area_size = 0;
14182 }
14183
14184 f_stack = TYPE_FIELDS (va_list_type_node);
14185 f_grtop = DECL_CHAIN (f_stack);
14186 f_vrtop = DECL_CHAIN (f_grtop);
14187 f_groff = DECL_CHAIN (f_vrtop);
14188 f_vroff = DECL_CHAIN (f_groff);
14189
14190 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14191 NULL_TREE);
14192 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14193 NULL_TREE);
14194 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14195 NULL_TREE);
14196 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14197 NULL_TREE);
14198 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14199 NULL_TREE);
14200
14201 /* Emit code to initialize STACK, which points to the next varargs stack
14202 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14203 by named arguments. STACK is 8-byte aligned. */
14204 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14205 if (cum->aapcs_stack_size > 0)
14206 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14207 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14209
14210 /* Emit code to initialize GRTOP, the top of the GR save area.
14211 virtual_incoming_args_rtx should have been 16 byte aligned. */
14212 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14213 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14214 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14215
14216 /* Emit code to initialize VRTOP, the top of the VR save area.
14217 This address is gr_save_area_bytes below GRTOP, rounded
14218 down to the next 16-byte boundary. */
14219 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14220 vr_offset = ROUND_UP (gr_save_area_size,
14221 STACK_BOUNDARY / BITS_PER_UNIT);
14222
14223 if (vr_offset)
14224 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14225 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14226 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14227
14228 /* Emit code to initialize GROFF, the offset from GRTOP of the
14229 next GPR argument. */
14230 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14231 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14233
14234 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14235 of the next VR argument. */
14236 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14237 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14238 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14239 }
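/* A worked example of the expansion above (illustrative only; it assumes
   the usual AAPCS64 values NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16, full-sized save areas,
   and TARGET_FLOAT):

     int f (int a, int b, ...);   // named arguments use only x0 and x1

   va_start then initializes the __va_list fields as:

     __stack   = virtual_incoming_args_rtx (no named stack arguments)
     __gr_top  = virtual_incoming_args_rtx
     __vr_top  = __gr_top - ROUND_UP ((8 - 2) * 8, 16) = __gr_top - 48
     __gr_offs = -(8 - 2) * 8  = -48
     __vr_offs = -(8 - 0) * 16 = -128  */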
14240
14241 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14242
14243 static tree
14244 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14245 gimple_seq *post_p ATTRIBUTE_UNUSED)
14246 {
14247 tree addr;
14248 bool indirect_p;
14249 bool is_ha; /* is HFA or HVA. */
14250 bool dw_align; /* double-word align. */
14251 machine_mode ag_mode = VOIDmode;
14252 int nregs;
14253 machine_mode mode;
14254
14255 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14256 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14257 HOST_WIDE_INT size, rsize, adjust, align;
14258 tree t, u, cond1, cond2;
14259
14260 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14261 if (indirect_p)
14262 type = build_pointer_type (type);
14263
14264 mode = TYPE_MODE (type);
14265
14266 f_stack = TYPE_FIELDS (va_list_type_node);
14267 f_grtop = DECL_CHAIN (f_stack);
14268 f_vrtop = DECL_CHAIN (f_grtop);
14269 f_groff = DECL_CHAIN (f_vrtop);
14270 f_vroff = DECL_CHAIN (f_groff);
14271
14272 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14273 f_stack, NULL_TREE);
14274 size = int_size_in_bytes (type);
14275
14276 bool abi_break;
14277 align
14278 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14279
14280 dw_align = false;
14281 adjust = 0;
14282 if (aarch64_vfp_is_call_or_return_candidate (mode,
14283 type,
14284 &ag_mode,
14285 &nregs,
14286 &is_ha))
14287 {
14288 /* No frontends can create types with variable-sized modes, so we
14289 shouldn't be asked to pass or return them. */
14290 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14291
14292 /* TYPE passed in fp/simd registers. */
14293 if (!TARGET_FLOAT)
14294 aarch64_err_no_fpadvsimd (mode);
14295
14296 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14297 unshare_expr (valist), f_vrtop, NULL_TREE);
14298 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14299 unshare_expr (valist), f_vroff, NULL_TREE);
14300
14301 rsize = nregs * UNITS_PER_VREG;
14302
14303 if (is_ha)
14304 {
14305 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14306 adjust = UNITS_PER_VREG - ag_size;
14307 }
14308 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14309 && size < UNITS_PER_VREG)
14310 {
14311 adjust = UNITS_PER_VREG - size;
14312 }
14313 }
14314 else
14315 {
14316 /* TYPE passed in general registers. */
14317 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14318 unshare_expr (valist), f_grtop, NULL_TREE);
14319 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14320 unshare_expr (valist), f_groff, NULL_TREE);
14321 rsize = ROUND_UP (size, UNITS_PER_WORD);
14322 nregs = rsize / UNITS_PER_WORD;
14323
14324 if (align > 8)
14325 {
14326 if (abi_break && warn_psabi)
14327 inform (input_location, "parameter passing for argument of type "
14328 "%qT changed in GCC 9.1", type);
14329 dw_align = true;
14330 }
14331
14332 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14333 && size < UNITS_PER_WORD)
14334 {
14335 adjust = UNITS_PER_WORD - size;
14336 }
14337 }
14338
14339 /* Get a local temporary for the field value. */
14340 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14341
14342 /* Emit code to branch if off >= 0. */
14343 t = build2 (GE_EXPR, boolean_type_node, off,
14344 build_int_cst (TREE_TYPE (off), 0));
14345 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14346
14347 if (dw_align)
14348 {
14349 /* Emit: offs = (offs + 15) & -16. */
14350 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14351 build_int_cst (TREE_TYPE (off), 15));
14352 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14353 build_int_cst (TREE_TYPE (off), -16));
14354 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14355 }
14356 else
14357 roundup = NULL;
14358
14359 /* Update ap.__[g|v]r_offs */
14360 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14361 build_int_cst (TREE_TYPE (off), rsize));
14362 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14363
14364 /* Chain the expressions together. */
14365 if (roundup)
14366 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14367
14368 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14369 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14370 build_int_cst (TREE_TYPE (f_off), 0));
14371 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14372
14373 /* Chain up: make sure the assignment happens before the use. */
14374 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14375 COND_EXPR_ELSE (cond1) = t;
14376
14377 /* Prepare the trees handling the argument that is passed on the stack;
14378 the top-level node is stored in ON_STACK. */
14379 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14380 if (align > 8)
14381 {
14382 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14383 t = fold_build_pointer_plus_hwi (arg, 15);
14384 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14385 build_int_cst (TREE_TYPE (t), -16));
14386 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14387 }
14388 else
14389 roundup = NULL;
14390 /* Advance ap.__stack */
14391 t = fold_build_pointer_plus_hwi (arg, size + 7);
14392 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14393 build_int_cst (TREE_TYPE (t), -8));
14394 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14395 /* Chain the roundup and the advance together. */
14396 if (roundup)
14397 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14398 /* Chain up with ARG. */
14399 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14400 /* Big-endianness related address adjustment. */
14401 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14402 && size < UNITS_PER_WORD)
14403 {
14404 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14405 size_int (UNITS_PER_WORD - size));
14406 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14407 }
14408
14409 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14410 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14411
14412 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14413 t = off;
14414 if (adjust)
14415 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14416 build_int_cst (TREE_TYPE (off), adjust));
14417
14418 t = fold_convert (sizetype, t);
14419 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14420
14421 if (is_ha)
14422 {
14423 /* type ha; // treat as "struct {ftype field[n];}"
14424 ... [computing offs]
14425 for (i = 0; i < nregs; ++i, offs += 16)
14426 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14427 return ha; */
14428 int i;
14429 tree tmp_ha, field_t, field_ptr_t;
14430
14431 /* Declare a local variable. */
14432 tmp_ha = create_tmp_var_raw (type, "ha");
14433 gimple_add_tmp_var (tmp_ha);
14434
14435 /* Establish the base type. */
14436 switch (ag_mode)
14437 {
14438 case E_SFmode:
14439 field_t = float_type_node;
14440 field_ptr_t = float_ptr_type_node;
14441 break;
14442 case E_DFmode:
14443 field_t = double_type_node;
14444 field_ptr_t = double_ptr_type_node;
14445 break;
14446 case E_TFmode:
14447 field_t = long_double_type_node;
14448 field_ptr_t = long_double_ptr_type_node;
14449 break;
14450 case E_HFmode:
14451 field_t = aarch64_fp16_type_node;
14452 field_ptr_t = aarch64_fp16_ptr_type_node;
14453 break;
14454 case E_V2SImode:
14455 case E_V4SImode:
14456 {
14457 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14458 field_t = build_vector_type_for_mode (innertype, ag_mode);
14459 field_ptr_t = build_pointer_type (field_t);
14460 }
14461 break;
14462 default:
14463 gcc_assert (0);
14464 }
14465
14466 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14467 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14468 addr = t;
14469 t = fold_convert (field_ptr_t, addr);
14470 t = build2 (MODIFY_EXPR, field_t,
14471 build1 (INDIRECT_REF, field_t, tmp_ha),
14472 build1 (INDIRECT_REF, field_t, t));
14473
14474 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14475 for (i = 1; i < nregs; ++i)
14476 {
14477 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14478 u = fold_convert (field_ptr_t, addr);
14479 u = build2 (MODIFY_EXPR, field_t,
14480 build2 (MEM_REF, field_t, tmp_ha,
14481 build_int_cst (field_ptr_t,
14482 (i *
14483 int_size_in_bytes (field_t)))),
14484 build1 (INDIRECT_REF, field_t, u));
14485 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14486 }
14487
14488 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14489 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14490 }
14491
14492 COND_EXPR_ELSE (cond2) = t;
14493 addr = fold_convert (build_pointer_type (type), cond1);
14494 addr = build_va_arg_indirect_ref (addr);
14495
14496 if (indirect_p)
14497 addr = build_va_arg_indirect_ref (addr);
14498
14499 return addr;
14500 }
14501
14502 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14503
14504 static void
14505 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14506 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14507 int no_rtl)
14508 {
14509 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14510 CUMULATIVE_ARGS local_cum;
14511 int gr_saved = cfun->va_list_gpr_size;
14512 int vr_saved = cfun->va_list_fpr_size;
14513
14514 /* The caller has advanced CUM up to, but not beyond, the last named
14515 argument. Advance a local copy of CUM past the last "real" named
14516 argument, to find out how many registers are left over. */
14517 local_cum = *cum;
14518 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14519
14520 /* Find out how many registers we need to save.
14521 Honor the tree-stdarg analysis results. */
14522 if (cfun->va_list_gpr_size)
14523 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14524 cfun->va_list_gpr_size / UNITS_PER_WORD);
14525 if (cfun->va_list_fpr_size)
14526 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14527 cfun->va_list_fpr_size / UNITS_PER_VREG);
14528
14529 if (!TARGET_FLOAT)
14530 {
14531 gcc_assert (local_cum.aapcs_nvrn == 0);
14532 vr_saved = 0;
14533 }
14534
14535 if (!no_rtl)
14536 {
14537 if (gr_saved > 0)
14538 {
14539 rtx ptr, mem;
14540
14541 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14542 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14543 - gr_saved * UNITS_PER_WORD);
14544 mem = gen_frame_mem (BLKmode, ptr);
14545 set_mem_alias_set (mem, get_varargs_alias_set ());
14546
14547 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14548 mem, gr_saved);
14549 }
14550 if (vr_saved > 0)
14551 {
14552 /* We can't use move_block_from_reg, because it will use
14553 the wrong mode, storing D regs only. */
14554 machine_mode mode = TImode;
14555 int off, i, vr_start;
14556
14557 /* Set OFF to the offset from virtual_incoming_args_rtx of
14558 the first vector register. The VR save area lies below
14559 the GR one, and is aligned to 16 bytes. */
14560 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14561 STACK_BOUNDARY / BITS_PER_UNIT);
14562 off -= vr_saved * UNITS_PER_VREG;
14563
14564 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14565 for (i = 0; i < vr_saved; ++i)
14566 {
14567 rtx ptr, mem;
14568
14569 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14570 mem = gen_frame_mem (mode, ptr);
14571 set_mem_alias_set (mem, get_varargs_alias_set ());
14572 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14573 off += UNITS_PER_VREG;
14574 }
14575 }
14576 }
14577
14578 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14579 any complication of having crtl->args.pretend_args_size changed. */
14580 cfun->machine->frame.saved_varargs_size
14581 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14582 STACK_BOUNDARY / BITS_PER_UNIT)
14583 + vr_saved * UNITS_PER_VREG);
14584 }
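/* Continuing the illustrative example above (x0 and x1 named, no named
   FP arguments): gr_saved == 6, so x2-x7 are dumped into the 48 bytes
   ending at virtual_incoming_args_rtx, and vr_saved == 8, so q0-q7 are
   stored (as TImode values) into the 128 bytes immediately below that,
   occupying offsets [-176, -48).  frame.saved_varargs_size is therefore
   48 + 128 == 176.  */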
14585
14586 static void
14587 aarch64_conditional_register_usage (void)
14588 {
14589 int i;
14590 if (!TARGET_FLOAT)
14591 {
14592 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14593 {
14594 fixed_regs[i] = 1;
14595 call_used_regs[i] = 1;
14596 }
14597 }
14598 if (!TARGET_SVE)
14599 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14600 {
14601 fixed_regs[i] = 1;
14602 call_used_regs[i] = 1;
14603 }
14604
14605 /* When tracking speculation, we need a couple of call-clobbered registers
14606 to track the speculation state. It would be nice to just use
14607 IP0 and IP1, but currently there are numerous places that just
14608 assume these registers are free for other uses (eg pointer
14609 authentication). */
14610 if (aarch64_track_speculation)
14611 {
14612 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14613 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14614 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14615 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14616 }
14617 }
14618
14619 /* Walk down the type tree of TYPE counting consecutive base elements.
14620 If *MODEP is VOIDmode, then set it to the first valid floating point
14621 type. If a non-floating point type is found, or if a floating point
14622 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14623 otherwise return the count in the sub-tree. */
14624 static int
14625 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14626 {
14627 machine_mode mode;
14628 HOST_WIDE_INT size;
14629
14630 switch (TREE_CODE (type))
14631 {
14632 case REAL_TYPE:
14633 mode = TYPE_MODE (type);
14634 if (mode != DFmode && mode != SFmode
14635 && mode != TFmode && mode != HFmode)
14636 return -1;
14637
14638 if (*modep == VOIDmode)
14639 *modep = mode;
14640
14641 if (*modep == mode)
14642 return 1;
14643
14644 break;
14645
14646 case COMPLEX_TYPE:
14647 mode = TYPE_MODE (TREE_TYPE (type));
14648 if (mode != DFmode && mode != SFmode
14649 && mode != TFmode && mode != HFmode)
14650 return -1;
14651
14652 if (*modep == VOIDmode)
14653 *modep = mode;
14654
14655 if (*modep == mode)
14656 return 2;
14657
14658 break;
14659
14660 case VECTOR_TYPE:
14661 /* Use V2SImode and V4SImode as representatives of all 64-bit
14662 and 128-bit vector types. */
14663 size = int_size_in_bytes (type);
14664 switch (size)
14665 {
14666 case 8:
14667 mode = V2SImode;
14668 break;
14669 case 16:
14670 mode = V4SImode;
14671 break;
14672 default:
14673 return -1;
14674 }
14675
14676 if (*modep == VOIDmode)
14677 *modep = mode;
14678
14679 /* Vector modes are considered to be opaque: two vectors are
14680 equivalent for the purposes of being homogeneous aggregates
14681 if they are the same size. */
14682 if (*modep == mode)
14683 return 1;
14684
14685 break;
14686
14687 case ARRAY_TYPE:
14688 {
14689 int count;
14690 tree index = TYPE_DOMAIN (type);
14691
14692 /* Can't handle incomplete types nor sizes that are not
14693 fixed. */
14694 if (!COMPLETE_TYPE_P (type)
14695 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14696 return -1;
14697
14698 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14699 if (count == -1
14700 || !index
14701 || !TYPE_MAX_VALUE (index)
14702 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14703 || !TYPE_MIN_VALUE (index)
14704 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14705 || count < 0)
14706 return -1;
14707
14708 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14709 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14710
14711 /* There must be no padding. */
14712 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14713 count * GET_MODE_BITSIZE (*modep)))
14714 return -1;
14715
14716 return count;
14717 }
14718
14719 case RECORD_TYPE:
14720 {
14721 int count = 0;
14722 int sub_count;
14723 tree field;
14724
14725 /* Can't handle incomplete types nor sizes that are not
14726 fixed. */
14727 if (!COMPLETE_TYPE_P (type)
14728 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14729 return -1;
14730
14731 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14732 {
14733 if (TREE_CODE (field) != FIELD_DECL)
14734 continue;
14735
14736 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14737 if (sub_count < 0)
14738 return -1;
14739 count += sub_count;
14740 }
14741
14742 /* There must be no padding. */
14743 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14744 count * GET_MODE_BITSIZE (*modep)))
14745 return -1;
14746
14747 return count;
14748 }
14749
14750 case UNION_TYPE:
14751 case QUAL_UNION_TYPE:
14752 {
14753 /* These aren't very interesting except in a degenerate case. */
14754 int count = 0;
14755 int sub_count;
14756 tree field;
14757
14758 /* Can't handle incomplete types nor sizes that are not
14759 fixed. */
14760 if (!COMPLETE_TYPE_P (type)
14761 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14762 return -1;
14763
14764 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14765 {
14766 if (TREE_CODE (field) != FIELD_DECL)
14767 continue;
14768
14769 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14770 if (sub_count < 0)
14771 return -1;
14772 count = count > sub_count ? count : sub_count;
14773 }
14774
14775 /* There must be no padding. */
14776 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14777 count * GET_MODE_BITSIZE (*modep)))
14778 return -1;
14779
14780 return count;
14781 }
14782
14783 default:
14784 break;
14785 }
14786
14787 return -1;
14788 }
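/* Some illustrative results of the walk above:

     struct { float x, y, z; }          -> 3, *MODEP == SFmode
     struct { double r; double i[3]; }  -> 4, *MODEP == DFmode
     struct { _Complex double c; }      -> 2, *MODEP == DFmode
     struct { float f; double d; }      -> -1 (mixed base types)
     struct { float f[5]; }             -> 5, which the caller then rejects
                                           because it exceeds HA_MAX_NUM_FLDS.  */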
14789
14790 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14791 type as described in AAPCS64 \S 4.1.2.
14792
14793 See the comment above aarch64_composite_type_p for the notes on MODE. */
14794
14795 static bool
14796 aarch64_short_vector_p (const_tree type,
14797 machine_mode mode)
14798 {
14799 poly_int64 size = -1;
14800
14801 if (type && TREE_CODE (type) == VECTOR_TYPE)
14802 size = int_size_in_bytes (type);
14803 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14804 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14805 size = GET_MODE_SIZE (mode);
14806
14807 return known_eq (size, 8) || known_eq (size, 16);
14808 }
14809
14810 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14811 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14812 array types. The C99 floating-point complex types are also considered
14813 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14814 types, which are GCC extensions and out of the scope of AAPCS64, are
14815 treated as composite types here as well.
14816
14817 Note that MODE itself is not sufficient in determining whether a type
14818 is such a composite type or not. This is because
14819 stor-layout.c:compute_record_mode may have already changed the MODE
14820 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14821 structure with only one field may have its MODE set to the mode of the
14822 field. Also an integer mode whose size matches the size of the
14823 RECORD_TYPE type may be used to substitute the original mode
14824 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14825 solely relied on. */
14826
14827 static bool
14828 aarch64_composite_type_p (const_tree type,
14829 machine_mode mode)
14830 {
14831 if (aarch64_short_vector_p (type, mode))
14832 return false;
14833
14834 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14835 return true;
14836
14837 if (mode == BLKmode
14838 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14839 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14840 return true;
14841
14842 return false;
14843 }
14844
14845 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14846 shall be passed or returned in simd/fp register(s), provided these
14847 parameter-passing registers are available.
14848
14849 Upon successful return, *COUNT returns the number of needed registers,
14850 *BASE_MODE returns the mode of the individual register and, when IS_HA
14851 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14852 floating-point aggregate or a homogeneous short-vector aggregate. */
14853
14854 static bool
14855 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14856 const_tree type,
14857 machine_mode *base_mode,
14858 int *count,
14859 bool *is_ha)
14860 {
14861 machine_mode new_mode = VOIDmode;
14862 bool composite_p = aarch64_composite_type_p (type, mode);
14863
14864 if (is_ha != NULL) *is_ha = false;
14865
14866 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14867 || aarch64_short_vector_p (type, mode))
14868 {
14869 *count = 1;
14870 new_mode = mode;
14871 }
14872 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14873 {
14874 if (is_ha != NULL) *is_ha = true;
14875 *count = 2;
14876 new_mode = GET_MODE_INNER (mode);
14877 }
14878 else if (type && composite_p)
14879 {
14880 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14881
14882 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14883 {
14884 if (is_ha != NULL) *is_ha = true;
14885 *count = ag_count;
14886 }
14887 else
14888 return false;
14889 }
14890 else
14891 return false;
14892
14893 *base_mode = new_mode;
14894 return true;
14895 }
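/* For illustration, the classification above gives:

     float                         -> *COUNT == 1, *BASE_MODE == SFmode
     float32x4_t (short vector)    -> *COUNT == 1, *BASE_MODE == V4SFmode,
                                      *IS_HA left false
     _Complex double               -> *COUNT == 2, *BASE_MODE == DFmode,
                                      *IS_HA set to true
     struct { float x, y, z; }     -> *COUNT == 3, *BASE_MODE == SFmode,
                                      *IS_HA set to true  */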
14896
14897 /* Implement TARGET_STRUCT_VALUE_RTX. */
14898
14899 static rtx
14900 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14901 int incoming ATTRIBUTE_UNUSED)
14902 {
14903 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14904 }
14905
14906 /* Implements target hook vector_mode_supported_p. */
14907 static bool
14908 aarch64_vector_mode_supported_p (machine_mode mode)
14909 {
14910 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14911 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14912 }
14913
14914 /* Return the full-width SVE vector mode for element mode MODE, if one
14915 exists. */
14916 opt_machine_mode
14917 aarch64_full_sve_mode (scalar_mode mode)
14918 {
14919 switch (mode)
14920 {
14921 case E_DFmode:
14922 return VNx2DFmode;
14923 case E_SFmode:
14924 return VNx4SFmode;
14925 case E_HFmode:
14926 return VNx8HFmode;
14927 case E_DImode:
14928 return VNx2DImode;
14929 case E_SImode:
14930 return VNx4SImode;
14931 case E_HImode:
14932 return VNx8HImode;
14933 case E_QImode:
14934 return VNx16QImode;
14935 default:
14936 return opt_machine_mode ();
14937 }
14938 }
14939
14940 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14941 if it exists. */
14942 opt_machine_mode
14943 aarch64_vq_mode (scalar_mode mode)
14944 {
14945 switch (mode)
14946 {
14947 case E_DFmode:
14948 return V2DFmode;
14949 case E_SFmode:
14950 return V4SFmode;
14951 case E_HFmode:
14952 return V8HFmode;
14953 case E_SImode:
14954 return V4SImode;
14955 case E_HImode:
14956 return V8HImode;
14957 case E_QImode:
14958 return V16QImode;
14959 case E_DImode:
14960 return V2DImode;
14961 default:
14962 return opt_machine_mode ();
14963 }
14964 }
14965
14966 /* Return appropriate SIMD container
14967 for MODE within a vector of WIDTH bits. */
14968 static machine_mode
14969 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14970 {
14971 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14972 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14973
14974 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14975 if (TARGET_SIMD)
14976 {
14977 if (known_eq (width, 128))
14978 return aarch64_vq_mode (mode).else_mode (word_mode);
14979 else
14980 switch (mode)
14981 {
14982 case E_SFmode:
14983 return V2SFmode;
14984 case E_HFmode:
14985 return V4HFmode;
14986 case E_SImode:
14987 return V2SImode;
14988 case E_HImode:
14989 return V4HImode;
14990 case E_QImode:
14991 return V8QImode;
14992 default:
14993 break;
14994 }
14995 }
14996 return word_mode;
14997 }
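/* For example (illustrative): with TARGET_SIMD, (SFmode, 128) yields
   V4SFmode and (SFmode, 64) yields V2SFmode; with TARGET_SVE and WIDTH
   equal to BITS_PER_SVE_VECTOR, SFmode yields VNx4SFmode.  An element
   mode with no matching container (e.g. DFmode at 64 bits) falls back
   to word_mode.  */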
14998
14999 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15000 static machine_mode
15001 aarch64_preferred_simd_mode (scalar_mode mode)
15002 {
15003 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15004 return aarch64_simd_container_mode (mode, bits);
15005 }
15006
15007 /* Return a list of possible vector sizes for the vectorizer
15008 to iterate over. */
15009 static void
15010 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15011 {
15012 if (TARGET_SVE)
15013 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15014 sizes->safe_push (16);
15015 sizes->safe_push (8);
15016 }
15017
15018 /* Implement TARGET_MANGLE_TYPE. */
15019
15020 static const char *
15021 aarch64_mangle_type (const_tree type)
15022 {
15023 /* The AArch64 ABI documents say that "__va_list" has to be
15024 mangled as if it is in the "std" namespace. */
15025 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15026 return "St9__va_list";
15027
15028 /* Half-precision float. */
15029 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15030 return "Dh";
15031
15032 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15033 builtin types. */
15034 if (TYPE_NAME (type) != NULL)
15035 return aarch64_mangle_builtin_type (type);
15036
15037 /* Use the default mangling. */
15038 return NULL;
15039 }
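/* Mangling example (illustrative): under the rules above, a C++
   declaration such as

     void f (__fp16, __builtin_va_list);

   mangles as _Z1fDhSt9__va_list: __fp16 is a 16-bit REAL_TYPE ("Dh") and
   __va_list is mangled as if it were declared in namespace std.  */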
15040
15041 /* Find the first rtx_insn before insn that will generate an assembly
15042 instruction. */
15043
15044 static rtx_insn *
15045 aarch64_prev_real_insn (rtx_insn *insn)
15046 {
15047 if (!insn)
15048 return NULL;
15049
15050 do
15051 {
15052 insn = prev_real_insn (insn);
15053 }
15054 while (insn && recog_memoized (insn) < 0);
15055
15056 return insn;
15057 }
15058
15059 static bool
15060 is_madd_op (enum attr_type t1)
15061 {
15062 unsigned int i;
15063 /* A number of these may be AArch32 only. */
15064 enum attr_type mlatypes[] = {
15065 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15066 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15067 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15068 };
15069
15070 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15071 {
15072 if (t1 == mlatypes[i])
15073 return true;
15074 }
15075
15076 return false;
15077 }
15078
15079 /* Check if there is a register dependency between a load and the insn
15080 for which we hold recog_data. */
15081
15082 static bool
15083 dep_between_memop_and_curr (rtx memop)
15084 {
15085 rtx load_reg;
15086 int opno;
15087
15088 gcc_assert (GET_CODE (memop) == SET);
15089
15090 if (!REG_P (SET_DEST (memop)))
15091 return false;
15092
15093 load_reg = SET_DEST (memop);
15094 for (opno = 1; opno < recog_data.n_operands; opno++)
15095 {
15096 rtx operand = recog_data.operand[opno];
15097 if (REG_P (operand)
15098 && reg_overlap_mentioned_p (load_reg, operand))
15099 return true;
15100
15101 }
15102 return false;
15103 }
15104
15105
15106 /* When working around the Cortex-A53 erratum 835769,
15107 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15108 instruction and has a preceding memory instruction such that a NOP
15109 should be inserted between them. */
15110
15111 bool
15112 aarch64_madd_needs_nop (rtx_insn* insn)
15113 {
15114 enum attr_type attr_type;
15115 rtx_insn *prev;
15116 rtx body;
15117
15118 if (!TARGET_FIX_ERR_A53_835769)
15119 return false;
15120
15121 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15122 return false;
15123
15124 attr_type = get_attr_type (insn);
15125 if (!is_madd_op (attr_type))
15126 return false;
15127
15128 prev = aarch64_prev_real_insn (insn);
15129 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15130 Restore recog state to INSN to avoid state corruption. */
15131 extract_constrain_insn_cached (insn);
15132
15133 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15134 return false;
15135
15136 body = single_set (prev);
15137
15138 /* If the previous insn is a memory op and there is no dependency between
15139 it and the DImode madd, emit a NOP between them. If body is NULL then we
15140 have a complex memory operation, probably a load/store pair.
15141 Be conservative for now and emit a NOP. */
15142 if (GET_MODE (recog_data.operand[0]) == DImode
15143 && (!body || !dep_between_memop_and_curr (body)))
15144 return true;
15145
15146 return false;
15147
15148 }
15149
15150
15151 /* Implement FINAL_PRESCAN_INSN. */
15152
15153 void
15154 aarch64_final_prescan_insn (rtx_insn *insn)
15155 {
15156 if (aarch64_madd_needs_nop (insn))
15157 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15158 }
15159
15160
15161 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15162 instruction. */
15163
15164 bool
15165 aarch64_sve_index_immediate_p (rtx base_or_step)
15166 {
15167 return (CONST_INT_P (base_or_step)
15168 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15169 }
15170
15171 /* Return true if X is a valid immediate for the SVE ADD and SUB
15172 instructions. Negate X first if NEGATE_P is true. */
15173
15174 bool
15175 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15176 {
15177 rtx elt;
15178
15179 if (!const_vec_duplicate_p (x, &elt)
15180 || !CONST_INT_P (elt))
15181 return false;
15182
15183 HOST_WIDE_INT val = INTVAL (elt);
15184 if (negate_p)
15185 val = -val;
15186 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15187
15188 if (val & 0xff)
15189 return IN_RANGE (val, 0, 0xff);
15190 return IN_RANGE (val, 0, 0xff00);
15191 }
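/* For example (illustrative), with repeated 16-bit elements: #7 and
   #0x1f00 are accepted (an 8-bit immediate, optionally shifted left by
   eight), whereas #0x101 is rejected because its low byte is nonzero
   and the value does not fit in 8 bits.  */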
15192
15193 /* Return true if X is a valid immediate operand for an SVE logical
15194 instruction such as AND. */
15195
15196 bool
15197 aarch64_sve_bitmask_immediate_p (rtx x)
15198 {
15199 rtx elt;
15200
15201 return (const_vec_duplicate_p (x, &elt)
15202 && CONST_INT_P (elt)
15203 && aarch64_bitmask_imm (INTVAL (elt),
15204 GET_MODE_INNER (GET_MODE (x))));
15205 }
15206
15207 /* Return true if X is a valid immediate for the SVE DUP and CPY
15208 instructions. */
15209
15210 bool
15211 aarch64_sve_dup_immediate_p (rtx x)
15212 {
15213 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15214 if (!CONST_INT_P (x))
15215 return false;
15216
15217 HOST_WIDE_INT val = INTVAL (x);
15218 if (val & 0xff)
15219 return IN_RANGE (val, -0x80, 0x7f);
15220 return IN_RANGE (val, -0x8000, 0x7f00);
15221 }
15222
15223 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15224 SIGNED_P says whether the operand is signed rather than unsigned. */
15225
15226 bool
15227 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15228 {
15229 rtx elt;
15230
15231 return (const_vec_duplicate_p (x, &elt)
15232 && CONST_INT_P (elt)
15233 && (signed_p
15234 ? IN_RANGE (INTVAL (elt), -16, 15)
15235 : IN_RANGE (INTVAL (elt), 0, 127)));
15236 }
15237
15238 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15239 instruction. Negate X first if NEGATE_P is true. */
15240
15241 bool
15242 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15243 {
15244 rtx elt;
15245 REAL_VALUE_TYPE r;
15246
15247 if (!const_vec_duplicate_p (x, &elt)
15248 || GET_CODE (elt) != CONST_DOUBLE)
15249 return false;
15250
15251 r = *CONST_DOUBLE_REAL_VALUE (elt);
15252
15253 if (negate_p)
15254 r = real_value_negate (&r);
15255
15256 if (real_equal (&r, &dconst1))
15257 return true;
15258 if (real_equal (&r, &dconsthalf))
15259 return true;
15260 return false;
15261 }
15262
15263 /* Return true if X is a valid immediate operand for an SVE FMUL
15264 instruction. */
15265
15266 bool
15267 aarch64_sve_float_mul_immediate_p (rtx x)
15268 {
15269 rtx elt;
15270
15271 return (const_vec_duplicate_p (x, &elt)
15272 && GET_CODE (elt) == CONST_DOUBLE
15273 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15274 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15275 }
15276
15277 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15278 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15279 is nonnull, use it to describe valid immediates. */
15280 static bool
15281 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15282 simd_immediate_info *info,
15283 enum simd_immediate_check which,
15284 simd_immediate_info::insn_type insn)
15285 {
15286 /* Try a 4-byte immediate with LSL. */
15287 for (unsigned int shift = 0; shift < 32; shift += 8)
15288 if ((val32 & (0xff << shift)) == val32)
15289 {
15290 if (info)
15291 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15292 simd_immediate_info::LSL, shift);
15293 return true;
15294 }
15295
15296 /* Try a 2-byte immediate with LSL. */
15297 unsigned int imm16 = val32 & 0xffff;
15298 if (imm16 == (val32 >> 16))
15299 for (unsigned int shift = 0; shift < 16; shift += 8)
15300 if ((imm16 & (0xff << shift)) == imm16)
15301 {
15302 if (info)
15303 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15304 simd_immediate_info::LSL, shift);
15305 return true;
15306 }
15307
15308 /* Try a 4-byte immediate with MSL, except for cases that MVN
15309 can handle. */
15310 if (which == AARCH64_CHECK_MOV)
15311 for (unsigned int shift = 8; shift < 24; shift += 8)
15312 {
15313 unsigned int low = (1 << shift) - 1;
15314 if (((val32 & (0xff << shift)) | low) == val32)
15315 {
15316 if (info)
15317 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15318 simd_immediate_info::MSL, shift);
15319 return true;
15320 }
15321 }
15322
15323 return false;
15324 }
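/* Illustrative examples: VAL32 == 0x00ab0000 is accepted as the SImode
   immediate 0xab with LSL #16; VAL32 == 0x004b004b is accepted as the
   HImode immediate 0x4b with LSL #0 (both halves match); and, for the
   AARCH64_CHECK_MOV case only, VAL32 == 0x0012ffff is accepted as 0x12
   with MSL #16.  */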
15325
15326 /* Return true if replicating VAL64 is a valid immediate for the
15327 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15328 use it to describe valid immediates. */
15329 static bool
15330 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15331 simd_immediate_info *info,
15332 enum simd_immediate_check which)
15333 {
15334 unsigned int val32 = val64 & 0xffffffff;
15335 unsigned int val16 = val64 & 0xffff;
15336 unsigned int val8 = val64 & 0xff;
15337
15338 if (val32 == (val64 >> 32))
15339 {
15340 if ((which & AARCH64_CHECK_ORR) != 0
15341 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15342 simd_immediate_info::MOV))
15343 return true;
15344
15345 if ((which & AARCH64_CHECK_BIC) != 0
15346 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15347 simd_immediate_info::MVN))
15348 return true;
15349
15350 /* Try using a replicated byte. */
15351 if (which == AARCH64_CHECK_MOV
15352 && val16 == (val32 >> 16)
15353 && val8 == (val16 >> 8))
15354 {
15355 if (info)
15356 *info = simd_immediate_info (QImode, val8);
15357 return true;
15358 }
15359 }
15360
15361 /* Try using a bit-to-bytemask. */
15362 if (which == AARCH64_CHECK_MOV)
15363 {
15364 unsigned int i;
15365 for (i = 0; i < 64; i += 8)
15366 {
15367 unsigned char byte = (val64 >> i) & 0xff;
15368 if (byte != 0 && byte != 0xff)
15369 break;
15370 }
15371 if (i == 64)
15372 {
15373 if (info)
15374 *info = simd_immediate_info (DImode, val64);
15375 return true;
15376 }
15377 }
15378 return false;
15379 }
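/* Further illustrative examples, both for AARCH64_CHECK_MOV:
   VAL64 == 0x2323232323232323 is accepted as the replicated QImode byte
   0x23, and VAL64 == 0x00ff0000ffff00ff is accepted as a DImode
   bit-to-bytemask, since every byte is either 0x00 or 0xff.  */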
15380
15381 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15382 instruction. If INFO is nonnull, use it to describe valid immediates. */
15383
15384 static bool
15385 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15386 simd_immediate_info *info)
15387 {
15388 scalar_int_mode mode = DImode;
15389 unsigned int val32 = val64 & 0xffffffff;
15390 if (val32 == (val64 >> 32))
15391 {
15392 mode = SImode;
15393 unsigned int val16 = val32 & 0xffff;
15394 if (val16 == (val32 >> 16))
15395 {
15396 mode = HImode;
15397 unsigned int val8 = val16 & 0xff;
15398 if (val8 == (val16 >> 8))
15399 mode = QImode;
15400 }
15401 }
15402 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15403 if (IN_RANGE (val, -0x80, 0x7f))
15404 {
15405 /* DUP with no shift. */
15406 if (info)
15407 *info = simd_immediate_info (mode, val);
15408 return true;
15409 }
15410 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15411 {
15412 /* DUP with LSL #8. */
15413 if (info)
15414 *info = simd_immediate_info (mode, val);
15415 return true;
15416 }
15417 if (aarch64_bitmask_imm (val64, mode))
15418 {
15419 /* DUPM. */
15420 if (info)
15421 *info = simd_immediate_info (mode, val);
15422 return true;
15423 }
15424 return false;
15425 }
15426
15427 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15428 it to describe valid immediates. */
15429
15430 static bool
15431 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15432 {
15433 if (x == CONST0_RTX (GET_MODE (x)))
15434 {
15435 if (info)
15436 *info = simd_immediate_info (DImode, 0);
15437 return true;
15438 }
15439
15440 /* Analyze the value as a VNx16BImode. This should be relatively
15441 efficient, since rtx_vector_builder has enough built-in capacity
15442 to store all VLA predicate constants without needing the heap. */
15443 rtx_vector_builder builder;
15444 if (!aarch64_get_sve_pred_bits (builder, x))
15445 return false;
15446
15447 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15448 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15449 {
15450 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15451 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15452 if (pattern != AARCH64_NUM_SVPATTERNS)
15453 {
15454 if (info)
15455 {
15456 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15457 *info = simd_immediate_info (int_mode, pattern);
15458 }
15459 return true;
15460 }
15461 }
15462 return false;
15463 }
15464
15465 /* Return true if OP is a valid SIMD immediate for the operation
15466 described by WHICH. If INFO is nonnull, use it to describe valid
15467 immediates. */
15468 bool
15469 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15470 enum simd_immediate_check which)
15471 {
15472 machine_mode mode = GET_MODE (op);
15473 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15474 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15475 return false;
15476
15477 if (vec_flags & VEC_SVE_PRED)
15478 return aarch64_sve_pred_valid_immediate (op, info);
15479
15480 scalar_mode elt_mode = GET_MODE_INNER (mode);
15481 rtx base, step;
15482 unsigned int n_elts;
15483 if (GET_CODE (op) == CONST_VECTOR
15484 && CONST_VECTOR_DUPLICATE_P (op))
15485 n_elts = CONST_VECTOR_NPATTERNS (op);
15486 else if ((vec_flags & VEC_SVE_DATA)
15487 && const_vec_series_p (op, &base, &step))
15488 {
15489 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15490 if (!aarch64_sve_index_immediate_p (base)
15491 || !aarch64_sve_index_immediate_p (step))
15492 return false;
15493
15494 if (info)
15495 *info = simd_immediate_info (elt_mode, base, step);
15496 return true;
15497 }
15498 else if (GET_CODE (op) == CONST_VECTOR
15499 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15500 /* N_ELTS set above. */;
15501 else
15502 return false;
15503
15504 scalar_float_mode elt_float_mode;
15505 if (n_elts == 1
15506 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15507 {
15508 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15509 if (aarch64_float_const_zero_rtx_p (elt)
15510 || aarch64_float_const_representable_p (elt))
15511 {
15512 if (info)
15513 *info = simd_immediate_info (elt_float_mode, elt);
15514 return true;
15515 }
15516 }
15517
15518 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15519 if (elt_size > 8)
15520 return false;
15521
15522 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15523
15524 /* Expand the vector constant out into a byte vector, with the least
15525 significant byte of the register first. */
15526 auto_vec<unsigned char, 16> bytes;
15527 bytes.reserve (n_elts * elt_size);
15528 for (unsigned int i = 0; i < n_elts; i++)
15529 {
15530 /* The vector is provided in GCC's endian-neutral fashion.
15531 For aarch64_be Advanced SIMD, it must be laid out in the vector
15532 register in reverse order. */
15533 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15534 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15535
15536 if (elt_mode != elt_int_mode)
15537 elt = gen_lowpart (elt_int_mode, elt);
15538
15539 if (!CONST_INT_P (elt))
15540 return false;
15541
15542 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15543 for (unsigned int byte = 0; byte < elt_size; byte++)
15544 {
15545 bytes.quick_push (elt_val & 0xff);
15546 elt_val >>= BITS_PER_UNIT;
15547 }
15548 }
15549
15550 /* The immediate must repeat every eight bytes. */
15551 unsigned int nbytes = bytes.length ();
15552 for (unsigned i = 8; i < nbytes; ++i)
15553 if (bytes[i] != bytes[i - 8])
15554 return false;
15555
15556 /* Get the repeating 8-byte value as an integer. No endian correction
15557 is needed here because bytes is already in lsb-first order. */
15558 unsigned HOST_WIDE_INT val64 = 0;
15559 for (unsigned int i = 0; i < 8; i++)
15560 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15561 << (i * BITS_PER_UNIT));
15562
15563 if (vec_flags & VEC_SVE_DATA)
15564 return aarch64_sve_valid_immediate (val64, info);
15565 else
15566 return aarch64_advsimd_valid_immediate (val64, info, which);
15567 }
15568
15569 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15570 has a step in the range of the SVE INDEX instruction. Return the step
15571 if so, otherwise return null. */
15572 rtx
15573 aarch64_check_zero_based_sve_index_immediate (rtx x)
15574 {
15575 rtx base, step;
15576 if (const_vec_series_p (x, &base, &step)
15577 && base == const0_rtx
15578 && aarch64_sve_index_immediate_p (step))
15579 return step;
15580 return NULL_RTX;
15581 }
15582
15583 /* Check whether immediate shift constants are within range. */
15584 bool
15585 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15586 {
15587 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15588 if (left)
15589 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15590 else
15591 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15592 }
15593
15594 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15595 operation of width WIDTH at bit position POS. */
15596
15597 rtx
15598 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15599 {
15600 gcc_assert (CONST_INT_P (width));
15601 gcc_assert (CONST_INT_P (pos));
15602
15603 unsigned HOST_WIDE_INT mask
15604 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15605 return GEN_INT (mask << UINTVAL (pos));
15606 }
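/* For instance (illustrative), WIDTH == 8 and POS == 16 give
   ((unsigned HOST_WIDE_INT) 1 << 8) - 1 == 0xff, shifted left by 16,
   i.e. the mask 0xff0000 selecting the bits of a zero_extract of width
   8 at bit position 16.  */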
15607
15608 bool
15609 aarch64_mov_operand_p (rtx x, machine_mode mode)
15610 {
15611 if (GET_CODE (x) == HIGH
15612 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15613 return true;
15614
15615 if (CONST_INT_P (x))
15616 return true;
15617
15618 if (VECTOR_MODE_P (GET_MODE (x)))
15619 {
15620 /* Require predicate constants to be VNx16BI before RA, so that we
15621 force everything to have a canonical form. */
15622 if (!lra_in_progress
15623 && !reload_completed
15624 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15625 && GET_MODE (x) != VNx16BImode)
15626 return false;
15627
15628 return aarch64_simd_valid_immediate (x, NULL);
15629 }
15630
15631 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15632 return true;
15633
15634 if (aarch64_sve_cnt_immediate_p (x))
15635 return true;
15636
15637 return aarch64_classify_symbolic_expression (x)
15638 == SYMBOL_TINY_ABSOLUTE;
15639 }
15640
15641 /* Return a const_int vector of VAL. */
15642 rtx
15643 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15644 {
15645 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15646 return gen_const_vec_duplicate (mode, c);
15647 }
15648
15649 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15650
15651 bool
15652 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15653 {
15654 machine_mode vmode;
15655
15656 vmode = aarch64_simd_container_mode (mode, 64);
15657 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15658 return aarch64_simd_valid_immediate (op_v, NULL);
15659 }
15660
15661 /* Construct and return a PARALLEL RTX vector with elements numbering the
15662 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15663 the vector - from the perspective of the architecture. This does not
15664 line up with GCC's perspective on lane numbers, so we end up with
15665 different masks depending on our target endian-ness. The diagram
15666 below may help. We must draw the distinction when building masks
15667 which select one half of the vector. An instruction selecting
15668 architectural low-lanes for a big-endian target, must be described using
15669 a mask selecting GCC high-lanes.
15670
15671 Big-Endian Little-Endian
15672
15673 GCC 0 1 2 3 3 2 1 0
15674 | x | x | x | x | | x | x | x | x |
15675 Architecture 3 2 1 0 3 2 1 0
15676
15677 Low Mask: { 2, 3 } { 0, 1 }
15678 High Mask: { 0, 1 } { 2, 3 }
15679
15680 MODE is the mode of the vector and NUNITS is the number of units in it. */
15681
15682 rtx
15683 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15684 {
15685 rtvec v = rtvec_alloc (nunits / 2);
15686 int high_base = nunits / 2;
15687 int low_base = 0;
15688 int base;
15689 rtx t1;
15690 int i;
15691
15692 if (BYTES_BIG_ENDIAN)
15693 base = high ? low_base : high_base;
15694 else
15695 base = high ? high_base : low_base;
15696
15697 for (i = 0; i < nunits / 2; i++)
15698 RTVEC_ELT (v, i) = GEN_INT (base + i);
15699
15700 t1 = gen_rtx_PARALLEL (mode, v);
15701 return t1;
15702 }
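/* Concretely (illustrative): for V4SImode with NUNITS == 4 and
   HIGH == true, this returns the PARALLEL [2, 3] on little-endian but
   [0, 1] on big-endian, matching the table in the comment above.  */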
15703
15704 /* Check OP for validity as a PARALLEL RTX vector with elements
15705 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15706 from the perspective of the architecture. See the diagram above
15707 aarch64_simd_vect_par_cnst_half for more details. */
15708
15709 bool
15710 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15711 bool high)
15712 {
15713 int nelts;
15714 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15715 return false;
15716
15717 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15718 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15719 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15720 int i = 0;
15721
15722 if (count_op != count_ideal)
15723 return false;
15724
15725 for (i = 0; i < count_ideal; i++)
15726 {
15727 rtx elt_op = XVECEXP (op, 0, i);
15728 rtx elt_ideal = XVECEXP (ideal, 0, i);
15729
15730 if (!CONST_INT_P (elt_op)
15731 || INTVAL (elt_ideal) != INTVAL (elt_op))
15732 return false;
15733 }
15734 return true;
15735 }
15736
15737 /* Return a PARALLEL containing NELTS elements, with element I equal
15738 to BASE + I * STEP. */
15739
15740 rtx
15741 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15742 {
15743 rtvec vec = rtvec_alloc (nelts);
15744 for (unsigned int i = 0; i < nelts; ++i)
15745 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15746 return gen_rtx_PARALLEL (VOIDmode, vec);
15747 }
15748
15749 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15750 series with step STEP. */
15751
15752 bool
15753 aarch64_stepped_int_parallel_p (rtx op, int step)
15754 {
15755 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15756 return false;
15757
15758 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15759 for (int i = 1; i < XVECLEN (op, 0); ++i)
15760 if (!CONST_INT_P (XVECEXP (op, 0, i))
15761 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15762 return false;
15763
15764 return true;
15765 }
15766
15767 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15768 HIGH (exclusive). */
15769 void
15770 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15771 const_tree exp)
15772 {
15773 HOST_WIDE_INT lane;
15774 gcc_assert (CONST_INT_P (operand));
15775 lane = INTVAL (operand);
15776
15777 if (lane < low || lane >= high)
15778 {
15779 if (exp)
15780 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15781 else
15782 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15783 }
15784 }
15785
15786 /* Perform endian correction on lane number N, which indexes a vector
15787 of mode MODE, and return the result as an SImode rtx. */
15788
15789 rtx
15790 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15791 {
15792 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15793 }
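/* For example (illustrative, assuming the usual ENDIAN_LANE_N
   definition): GCC lane 0 of a V4SImode vector maps to 0 on
   little-endian but to 3 on big-endian, so that the architectural lane
   number is what gets encoded in the instruction.  */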
15794
15795 /* Return TRUE if OP is a valid vector addressing mode. */
15796
15797 bool
15798 aarch64_simd_mem_operand_p (rtx op)
15799 {
15800 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15801 || REG_P (XEXP (op, 0)));
15802 }
15803
15804 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15805
15806 bool
15807 aarch64_sve_ld1r_operand_p (rtx op)
15808 {
15809 struct aarch64_address_info addr;
15810 scalar_mode mode;
15811
15812 return (MEM_P (op)
15813 && is_a <scalar_mode> (GET_MODE (op), &mode)
15814 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15815 && addr.type == ADDRESS_REG_IMM
15816 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15817 }
15818
15819 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15820 bool
15821 aarch64_sve_ld1rq_operand_p (rtx op)
15822 {
15823 struct aarch64_address_info addr;
15824 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15825 if (!MEM_P (op)
15826 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15827 return false;
15828
15829 if (addr.type == ADDRESS_REG_IMM)
15830 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15831
15832 if (addr.type == ADDRESS_REG_REG)
15833 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15834
15835 return false;
15836 }
15837
15838 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15839 The conditions for STR are the same. */
15840 bool
15841 aarch64_sve_ldr_operand_p (rtx op)
15842 {
15843 struct aarch64_address_info addr;
15844
15845 return (MEM_P (op)
15846 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15847 false, ADDR_QUERY_ANY)
15848 && addr.type == ADDRESS_REG_IMM);
15849 }
15850
15851 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15852 We need to be able to access the individual pieces, so the range
15853 is different from LD[234] and ST[234]. */
15854 bool
15855 aarch64_sve_struct_memory_operand_p (rtx op)
15856 {
15857 if (!MEM_P (op))
15858 return false;
15859
15860 machine_mode mode = GET_MODE (op);
15861 struct aarch64_address_info addr;
15862 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15863 ADDR_QUERY_ANY)
15864 || addr.type != ADDRESS_REG_IMM)
15865 return false;
15866
15867 poly_int64 first = addr.const_offset;
15868 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15869 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15870 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15871 }
15872
15873 /* Emit a register copy from operand to operand, taking care not to
15874 early-clobber source registers in the process.
15875
15876 COUNT is the number of components into which the copy needs to be
15877 decomposed. */
15878 void
15879 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15880 unsigned int count)
15881 {
15882 unsigned int i;
15883 int rdest = REGNO (operands[0]);
15884 int rsrc = REGNO (operands[1]);
15885
15886 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15887 || rdest < rsrc)
15888 for (i = 0; i < count; i++)
15889 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15890 gen_rtx_REG (mode, rsrc + i));
15891 else
15892 for (i = 0; i < count; i++)
15893 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15894 gen_rtx_REG (mode, rsrc + count - i - 1));
15895 }
15896
15897 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15898 one of the VSTRUCT modes: OI, CI, or XI. */
15899 int
15900 aarch64_simd_attr_length_rglist (machine_mode mode)
15901 {
15902 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15903 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15904 }
15905
15906 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15907 alignment of a vector to 128 bits. SVE predicates have an alignment of
15908 16 bits. */
15909 static HOST_WIDE_INT
15910 aarch64_simd_vector_alignment (const_tree type)
15911 {
15912 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15913 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15914 be set for non-predicate vectors of booleans. Modes are the most
15915 direct way we have of identifying real SVE predicate types. */
15916 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15917 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15918 }
15919
15920 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15921 static poly_uint64
15922 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15923 {
15924 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15925 {
15926 /* If the length of the vector is fixed, try to align to that length,
15927 otherwise don't try to align at all. */
15928 HOST_WIDE_INT result;
15929 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15930 result = TYPE_ALIGN (TREE_TYPE (type));
15931 return result;
15932 }
15933 return TYPE_ALIGN (type);
15934 }
15935
15936 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15937 static bool
15938 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15939 {
15940 if (is_packed)
15941 return false;
15942
15943 /* For fixed-length vectors, check that the vectorizer will aim for
15944 full-vector alignment. This isn't true for generic GCC vectors
15945 that are wider than the ABI maximum of 128 bits. */
15946 poly_uint64 preferred_alignment =
15947 aarch64_vectorize_preferred_vector_alignment (type);
15948 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15949 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15950 preferred_alignment))
15951 return false;
15952
15953 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15954 return true;
15955 }
15956
15957 /* Return true if the vector misalignment factor is supported by the
15958 target. */
15959 static bool
15960 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15961 const_tree type, int misalignment,
15962 bool is_packed)
15963 {
15964 if (TARGET_SIMD && STRICT_ALIGNMENT)
15965 {
15966 /* Return if movmisalign pattern is not supported for this mode. */
15967 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15968 return false;
15969
15970 /* Misalignment factor is unknown at compile time. */
15971 if (misalignment == -1)
15972 return false;
15973 }
15974 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15975 is_packed);
15976 }
15977
15978 /* If VALS is a vector constant that can be loaded into a register
15979 using DUP, generate instructions to do so and return an RTX to
15980 assign to the register. Otherwise return NULL_RTX. */
15981 static rtx
15982 aarch64_simd_dup_constant (rtx vals)
15983 {
15984 machine_mode mode = GET_MODE (vals);
15985 machine_mode inner_mode = GET_MODE_INNER (mode);
15986 rtx x;
15987
15988 if (!const_vec_duplicate_p (vals, &x))
15989 return NULL_RTX;
15990
15991 /* We can load this constant by using DUP and a constant in a
15992 single ARM register. This will be cheaper than a vector
15993 load. */
15994 x = copy_to_mode_reg (inner_mode, x);
15995 return gen_vec_duplicate (mode, x);
15996 }
15997
15998
15999 /* Generate code to load VALS, which is a PARALLEL containing only
16000 constants (for vec_init) or CONST_VECTOR, efficiently into a
16001 register. Returns an RTX to copy into the register, or NULL_RTX
16002 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16003 static rtx
16004 aarch64_simd_make_constant (rtx vals)
16005 {
16006 machine_mode mode = GET_MODE (vals);
16007 rtx const_dup;
16008 rtx const_vec = NULL_RTX;
16009 int n_const = 0;
16010 int i;
16011
16012 if (GET_CODE (vals) == CONST_VECTOR)
16013 const_vec = vals;
16014 else if (GET_CODE (vals) == PARALLEL)
16015 {
16016 /* A CONST_VECTOR must contain only CONST_INTs and
16017 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16018 Only store valid constants in a CONST_VECTOR. */
16019 int n_elts = XVECLEN (vals, 0);
16020 for (i = 0; i < n_elts; ++i)
16021 {
16022 rtx x = XVECEXP (vals, 0, i);
16023 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16024 n_const++;
16025 }
16026 if (n_const == n_elts)
16027 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16028 }
16029 else
16030 gcc_unreachable ();
16031
16032 if (const_vec != NULL_RTX
16033 && aarch64_simd_valid_immediate (const_vec, NULL))
16034 /* Load using MOVI/MVNI. */
16035 return const_vec;
16036 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16037 /* Loaded using DUP. */
16038 return const_dup;
16039 else if (const_vec != NULL_RTX)
16040 /* Load from constant pool. We cannot take advantage of single-cycle
16041 LD1 because we need a PC-relative addressing mode. */
16042 return const_vec;
16043 else
16044 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16045 We cannot construct an initializer. */
16046 return NULL_RTX;
16047 }
16048
16049 /* Expand a vector initialisation sequence, such that TARGET is
16050 initialised to contain VALS. */
16051
16052 void
16053 aarch64_expand_vector_init (rtx target, rtx vals)
16054 {
16055 machine_mode mode = GET_MODE (target);
16056 scalar_mode inner_mode = GET_MODE_INNER (mode);
16057 /* The number of vector elements. */
16058 int n_elts = XVECLEN (vals, 0);
16059 /* The number of vector elements which are not constant. */
16060 int n_var = 0;
16061 rtx any_const = NULL_RTX;
16062 /* The first element of vals. */
16063 rtx v0 = XVECEXP (vals, 0, 0);
16064 bool all_same = true;
16065
16066 /* This is a special vec_init<M><N> where N is not an element mode but a
16067 vector mode with half the elements of M. We expect to find two entries
16068 of mode N in VALS and we must put their concatenation into TARGET. */
16069 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16070 {
16071 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16072 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16073 rtx lo = XVECEXP (vals, 0, 0);
16074 rtx hi = XVECEXP (vals, 0, 1);
16075 machine_mode narrow_mode = GET_MODE (lo);
16076 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16077 gcc_assert (narrow_mode == GET_MODE (hi));
16078
16079 /* When we want to concatenate a half-width vector with zeroes we can
16080 use the aarch64_combinez[_be] patterns. Just make sure that the
16081 zeroes are in the right half. */
16082 if (BYTES_BIG_ENDIAN
16083 && aarch64_simd_imm_zero (lo, narrow_mode)
16084 && general_operand (hi, narrow_mode))
16085 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16086 else if (!BYTES_BIG_ENDIAN
16087 && aarch64_simd_imm_zero (hi, narrow_mode)
16088 && general_operand (lo, narrow_mode))
16089 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16090 else
16091 {
16092 /* Else create the two half-width registers and combine them. */
16093 if (!REG_P (lo))
16094 lo = force_reg (GET_MODE (lo), lo);
16095 if (!REG_P (hi))
16096 hi = force_reg (GET_MODE (hi), hi);
16097
16098 if (BYTES_BIG_ENDIAN)
16099 std::swap (lo, hi);
16100 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16101 }
16102 return;
16103 }
16104
16105 /* Count the number of variable elements to initialise. */
16106 for (int i = 0; i < n_elts; ++i)
16107 {
16108 rtx x = XVECEXP (vals, 0, i);
16109 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16110 ++n_var;
16111 else
16112 any_const = x;
16113
16114 all_same &= rtx_equal_p (x, v0);
16115 }
16116
16117 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16118 how best to handle this. */
16119 if (n_var == 0)
16120 {
16121 rtx constant = aarch64_simd_make_constant (vals);
16122 if (constant != NULL_RTX)
16123 {
16124 emit_move_insn (target, constant);
16125 return;
16126 }
16127 }
16128
16129 /* Splat a single non-constant element if we can. */
16130 if (all_same)
16131 {
16132 rtx x = copy_to_mode_reg (inner_mode, v0);
16133 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16134 return;
16135 }
16136
16137 enum insn_code icode = optab_handler (vec_set_optab, mode);
16138 gcc_assert (icode != CODE_FOR_nothing);
16139
16140 /* If there are only variable elements, try to optimize
16141 the insertion using dup for the most common element
16142 followed by insertions. */
16143
16144 /* The algorithm will fill matches[*][0] with the earliest matching element,
16145 and matches[X][1] with the count of duplicate elements (if X is the
16146 earliest element which has duplicates). */
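 /* For example, for VALS = { x, y, x, x } the loop below produces
 matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 }
 and matches[3] = { 0, 0 }, so element 0 is chosen as the most common
 element, with three occurrences. */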
16147
16148 if (n_var == n_elts && n_elts <= 16)
16149 {
16150 int matches[16][2] = {0};
16151 for (int i = 0; i < n_elts; i++)
16152 {
16153 for (int j = 0; j <= i; j++)
16154 {
16155 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16156 {
16157 matches[i][0] = j;
16158 matches[j][1]++;
16159 break;
16160 }
16161 }
16162 }
16163 int maxelement = 0;
16164 int maxv = 0;
16165 for (int i = 0; i < n_elts; i++)
16166 if (matches[i][1] > maxv)
16167 {
16168 maxelement = i;
16169 maxv = matches[i][1];
16170 }
16171
16172 /* Create a duplicate of the most common element, unless all elements
16173 are equally useless to us, in which case just immediately set the
16174 vector register using the first element. */
16175
16176 if (maxv == 1)
16177 {
16178 /* For vectors of two 64-bit elements, we can do even better. */
16179 if (n_elts == 2
16180 && (inner_mode == E_DImode
16181 || inner_mode == E_DFmode))
16182
16183 {
16184 rtx x0 = XVECEXP (vals, 0, 0);
16185 rtx x1 = XVECEXP (vals, 0, 1);
16186 /* Combine can pick up this case, but handling it directly
16187 here leaves clearer RTL.
16188
16189 This is load_pair_lanes<mode>, and also gives us a clean-up
16190 for store_pair_lanes<mode>. */
16191 if (memory_operand (x0, inner_mode)
16192 && memory_operand (x1, inner_mode)
16193 && !STRICT_ALIGNMENT
16194 && rtx_equal_p (XEXP (x1, 0),
16195 plus_constant (Pmode,
16196 XEXP (x0, 0),
16197 GET_MODE_SIZE (inner_mode))))
16198 {
16199 rtx t;
16200 if (inner_mode == DFmode)
16201 t = gen_load_pair_lanesdf (target, x0, x1);
16202 else
16203 t = gen_load_pair_lanesdi (target, x0, x1);
16204 emit_insn (t);
16205 return;
16206 }
16207 }
16208 /* The subreg-move sequence below will move into lane zero of the
16209 vector register. For big-endian we want that position to hold
16210 the last element of VALS. */
16211 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16212 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16213 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16214 }
16215 else
16216 {
16217 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16218 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16219 }
16220
16221 /* Insert the rest. */
16222 for (int i = 0; i < n_elts; i++)
16223 {
16224 rtx x = XVECEXP (vals, 0, i);
16225 if (matches[i][0] == maxelement)
16226 continue;
16227 x = copy_to_mode_reg (inner_mode, x);
16228 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16229 }
16230 return;
16231 }
16232
16233 /* Initialise a vector which is part-variable. We want to first try
16234 to build those lanes which are constant in the most efficient way we
16235 can. */
16236 if (n_var != n_elts)
16237 {
16238 rtx copy = copy_rtx (vals);
16239
16240 /* Load constant part of vector. We really don't care what goes into the
16241 parts we will overwrite, but we're more likely to be able to load the
16242 constant efficiently if it has fewer, larger, repeating parts
16243 (see aarch64_simd_valid_immediate). */
16244 for (int i = 0; i < n_elts; i++)
16245 {
16246 rtx x = XVECEXP (vals, 0, i);
16247 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16248 continue;
16249 rtx subst = any_const;
16250 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16251 {
16252 /* Look in the copied vector, as more elements are const. */
16253 rtx test = XVECEXP (copy, 0, i ^ bit);
16254 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16255 {
16256 subst = test;
16257 break;
16258 }
16259 }
16260 XVECEXP (copy, 0, i) = subst;
16261 }
16262 aarch64_expand_vector_init (target, copy);
16263 }
16264
16265 /* Insert the variable lanes directly. */
16266 for (int i = 0; i < n_elts; i++)
16267 {
16268 rtx x = XVECEXP (vals, 0, i);
16269 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16270 continue;
16271 x = copy_to_mode_reg (inner_mode, x);
16272 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16273 }
16274 }
16275
16276 /* Emit RTL corresponding to:
16277 insr TARGET, ELEM. */
16278
16279 static void
16280 emit_insr (rtx target, rtx elem)
16281 {
16282 machine_mode mode = GET_MODE (target);
16283 scalar_mode elem_mode = GET_MODE_INNER (mode);
16284 elem = force_reg (elem_mode, elem);
16285
16286 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16287 gcc_assert (icode != CODE_FOR_nothing);
16288 emit_insn (GEN_FCN (icode) (target, target, elem));
16289 }
16290
16291 /* Subroutine of aarch64_sve_expand_vector_init for handling
16292 trailing constants.
16293 This function works as follows:
16294 (a) Create a new vector consisting of trailing constants.
16295 (b) Initialize TARGET with the constant vector using emit_move_insn.
16296 (c) Insert remaining elements in TARGET using insr.
16297 NELTS is the total number of elements in the original vector, while
16298 NELTS_REQD is the number of elements that are actually
16299 significant.
16300
16301 ??? The heuristic used is to do the above only if the number of constants
16302 is at least half the total number of elements. May need fine-tuning. */
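/* For example, for BUILDER = { x, y, 1, 2 } (with x and y non-constant)
   and NELTS_REQD == 4, the two trailing constants satisfy the heuristic:
   TARGET is first set from a constant vector whose leading elements are
   { 1, 2, ... }, and the remaining elements are then shifted in from the
   front:
     insr TARGET, y
     insr TARGET, x
   leaving { x, y, 1, 2 } in the first NELTS_REQD lanes. */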
16303
16304 static bool
16305 aarch64_sve_expand_vector_init_handle_trailing_constants
16306 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16307 {
16308 machine_mode mode = GET_MODE (target);
16309 scalar_mode elem_mode = GET_MODE_INNER (mode);
16310 int n_trailing_constants = 0;
16311
16312 for (int i = nelts_reqd - 1;
16313 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16314 i--)
16315 n_trailing_constants++;
16316
16317 if (n_trailing_constants >= nelts_reqd / 2)
16318 {
16319 rtx_vector_builder v (mode, 1, nelts);
16320 for (int i = 0; i < nelts; i++)
16321 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16322 rtx const_vec = v.build ();
16323 emit_move_insn (target, const_vec);
16324
16325 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16326 emit_insr (target, builder.elt (i));
16327
16328 return true;
16329 }
16330
16331 return false;
16332 }
16333
16334 /* Subroutine of aarch64_sve_expand_vector_init.
16335 Works as follows:
16336 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16337 (b) Skip trailing elements from BUILDER, which are the same as
16338 element NELTS_REQD - 1.
16339 (c) Insert earlier elements in reverse order in TARGET using insr. */
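/* For example, for BUILDER = { w, x, y, y } and NELTS_REQD == 4, TARGET is
   first set to dup (y) and the remaining elements are then inserted with:
     insr TARGET, x
     insr TARGET, w
   giving { w, x, y, y }. */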
16340
16341 static void
16342 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16343 const rtx_vector_builder &builder,
16344 int nelts_reqd)
16345 {
16346 machine_mode mode = GET_MODE (target);
16347 scalar_mode elem_mode = GET_MODE_INNER (mode);
16348
16349 struct expand_operand ops[2];
16350 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16351 gcc_assert (icode != CODE_FOR_nothing);
16352
16353 create_output_operand (&ops[0], target, mode);
16354 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16355 expand_insn (icode, 2, ops);
16356
16357 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16358 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16359 emit_insr (target, builder.elt (i));
16360 }
16361
16362 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16363 when all trailing elements of builder are same.
16364 This works as follows:
16365 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16366 (b) Insert remaining elements in TARGET using insr.
16367
16368 ??? The heuristic used is to do the above if the number of identical
16369 trailing elements is at least 3/4 of the total number of elements,
 loosely based on
16370 heuristic from mostly_zeros_p. May need fine-tuning. */
16371
16372 static bool
16373 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16374 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16375 {
16376 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16377 if (ndups >= (3 * nelts_reqd) / 4)
16378 {
16379 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16380 nelts_reqd - ndups + 1);
16381 return true;
16382 }
16383
16384 return false;
16385 }
16386
16387 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16388 of elements in BUILDER.
16389
16390 The function tries to initialize TARGET from BUILDER if it fits one
16391 of the special cases outlined below.
16392
16393 Failing that, the function divides BUILDER into two sub-vectors:
16394 v_even = even elements of BUILDER;
16395 v_odd = odd elements of BUILDER;
16396
16397 and recursively calls itself with v_even and v_odd.
16398
16399 if (recursive call succeeded for v_even or v_odd)
16400 TARGET = zip (v_even, v_odd)
16401
16402 The function returns true if it managed to build TARGET from BUILDER
16403 with one of the special cases, false otherwise.
16404
16405 Example: {a, 1, b, 2, c, 3, d, 4}
16406
16407 The vector gets divided into:
16408 v_even = {a, b, c, d}
16409 v_odd = {1, 2, 3, 4}
16410
16411 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16412 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16413
16414 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16415 4 elements, so we construct tmp1 from v_even using insr:
16416 tmp1 = dup(d)
16417 insr tmp1, c
16418 insr tmp1, b
16419 insr tmp1, a
16420
16421 And finally:
16422 TARGET = zip (tmp1, tmp2)
16423 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16424
16425 static bool
16426 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16427 int nelts, int nelts_reqd)
16428 {
16429 machine_mode mode = GET_MODE (target);
16430
16431 /* Case 1: Vector contains trailing constants. */
16432
16433 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16434 (target, builder, nelts, nelts_reqd))
16435 return true;
16436
16437 /* Case 2: Vector contains leading constants. */
16438
16439 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16440 for (int i = 0; i < nelts_reqd; i++)
16441 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16442 rev_builder.finalize ();
16443
16444 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16445 (target, rev_builder, nelts, nelts_reqd))
16446 {
16447 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16448 return true;
16449 }
16450
16451 /* Case 3: Vector contains trailing same element. */
16452
16453 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16454 (target, builder, nelts_reqd))
16455 return true;
16456
16457 /* Case 4: Vector contains leading same element. */
16458
16459 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16460 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16461 {
16462 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16463 return true;
16464 }
16465
16466 /* Avoid recursing below 4-elements.
16467 ??? The threshold 4 may need fine-tuning. */
16468
16469 if (nelts_reqd <= 4)
16470 return false;
16471
16472 rtx_vector_builder v_even (mode, 1, nelts);
16473 rtx_vector_builder v_odd (mode, 1, nelts);
16474
16475 for (int i = 0; i < nelts * 2; i += 2)
16476 {
16477 v_even.quick_push (builder.elt (i));
16478 v_odd.quick_push (builder.elt (i + 1));
16479 }
16480
16481 v_even.finalize ();
16482 v_odd.finalize ();
16483
16484 rtx tmp1 = gen_reg_rtx (mode);
16485 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16486 nelts, nelts_reqd / 2);
16487
16488 rtx tmp2 = gen_reg_rtx (mode);
16489 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16490 nelts, nelts_reqd / 2);
16491
16492 if (!did_even_p && !did_odd_p)
16493 return false;
16494
16495 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16496 special cases and zip v_even, v_odd. */
16497
16498 if (!did_even_p)
16499 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16500
16501 if (!did_odd_p)
16502 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16503
16504 rtvec v = gen_rtvec (2, tmp1, tmp2);
16505 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16506 return true;
16507 }
16508
16509 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16510
16511 void
16512 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16513 {
16514 machine_mode mode = GET_MODE (target);
16515 int nelts = XVECLEN (vals, 0);
16516
16517 rtx_vector_builder v (mode, 1, nelts);
16518 for (int i = 0; i < nelts; i++)
16519 v.quick_push (XVECEXP (vals, 0, i));
16520 v.finalize ();
16521
16522 /* If neither sub-vector of v could be initialized specially,
16523 then use INSR to insert all elements from v into TARGET.
16524 ??? This might not be optimal for vectors with large
16525 initializers, such as those with 16 or more elements.
16526 For nelts < 4, it probably isn't useful to handle specially. */
16527
16528 if (nelts < 4
16529 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16530 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16531 }
16532
16533 /* Check whether VALUE is a vector constant in which every element
16534 is either a power of 2 or a negated power of 2. If so, return
16535 a constant vector of log2s, and flip CODE between PLUS and MINUS
16536 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
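/* For example, { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with CODE unchanged,
   while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } with CODE flipped
   between PLUS and MINUS. */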
16537
16538 static rtx
16539 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16540 {
16541 if (GET_CODE (value) != CONST_VECTOR)
16542 return NULL_RTX;
16543
16544 rtx_vector_builder builder;
16545 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16546 return NULL_RTX;
16547
16548 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16549 /* 1 if the result of the multiplication must be negated,
16550 0 if it mustn't, or -1 if we don't yet care. */
16551 int negate = -1;
16552 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16553 for (unsigned int i = 0; i < encoded_nelts; ++i)
16554 {
16555 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16556 if (!CONST_SCALAR_INT_P (elt))
16557 return NULL_RTX;
16558 rtx_mode_t val (elt, int_mode);
16559 wide_int pow2 = wi::neg (val);
16560 if (val != pow2)
16561 {
16562 /* It matters whether we negate or not. Make that choice,
16563 and make sure that it's consistent with previous elements. */
16564 if (negate == !wi::neg_p (val))
16565 return NULL_RTX;
16566 negate = wi::neg_p (val);
16567 if (!negate)
16568 pow2 = val;
16569 }
16570 /* POW2 is now the value that we want to be a power of 2. */
16571 int shift = wi::exact_log2 (pow2);
16572 if (shift < 0)
16573 return NULL_RTX;
16574 builder.quick_push (gen_int_mode (shift, int_mode));
16575 }
16576 if (negate == -1)
16577 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16578 code = PLUS;
16579 else if (negate == 1)
16580 code = code == PLUS ? MINUS : PLUS;
16581 return builder.build ();
16582 }
16583
16584 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16585 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16586 operands array, in the same order as for fma_optab. Return true if
16587 the function emitted all the necessary instructions, false if the caller
16588 should generate the pattern normally with the new OPERANDS array. */
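/* For example, A + B * 4 is rewritten as A + (B << 2), and A + B * -4
   as A - (B << 2). */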
16589
16590 bool
16591 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16592 {
16593 machine_mode mode = GET_MODE (operands[0]);
16594 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16595 {
16596 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16597 NULL_RTX, true, OPTAB_DIRECT);
16598 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16599 operands[3], product, operands[0], true,
16600 OPTAB_DIRECT);
16601 return true;
16602 }
16603 operands[2] = force_reg (mode, operands[2]);
16604 return false;
16605 }
16606
16607 /* Likewise, but for a conditional pattern. */
16608
16609 bool
16610 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16611 {
16612 machine_mode mode = GET_MODE (operands[0]);
16613 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16614 {
16615 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16616 NULL_RTX, true, OPTAB_DIRECT);
16617 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16618 operands[4], product, operands[5]));
16619 return true;
16620 }
16621 operands[3] = force_reg (mode, operands[3]);
16622 return false;
16623 }
16624
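/* Return the mask that is implicitly applied to a shift count in MODE,
 or zero if shift counts are not known to be truncated. */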
16625 static unsigned HOST_WIDE_INT
16626 aarch64_shift_truncation_mask (machine_mode mode)
16627 {
16628 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16629 return 0;
16630 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16631 }
16632
16633 /* Select a format to encode pointers in exception handling data. */
16634 int
16635 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16636 {
16637 int type;
16638 switch (aarch64_cmodel)
16639 {
16640 case AARCH64_CMODEL_TINY:
16641 case AARCH64_CMODEL_TINY_PIC:
16642 case AARCH64_CMODEL_SMALL:
16643 case AARCH64_CMODEL_SMALL_PIC:
16644 case AARCH64_CMODEL_SMALL_SPIC:
16645 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16646 for everything. */
16647 type = DW_EH_PE_sdata4;
16648 break;
16649 default:
16650 /* No assumptions here. 8-byte relocs required. */
16651 type = DW_EH_PE_sdata8;
16652 break;
16653 }
16654 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16655 }
16656
16657 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16658
16659 static void
16660 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16661 {
16662 if (aarch64_simd_decl_p (decl))
16663 {
16664 fprintf (stream, "\t.variant_pcs\t");
16665 assemble_name (stream, name);
16666 fprintf (stream, "\n");
16667 }
16668 }
16669
16670 /* The last .arch and .tune assembly strings that we printed. */
16671 static std::string aarch64_last_printed_arch_string;
16672 static std::string aarch64_last_printed_tune_string;
16673
16674 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16675 by the function fndecl. */
16676
16677 void
16678 aarch64_declare_function_name (FILE *stream, const char* name,
16679 tree fndecl)
16680 {
16681 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16682
16683 struct cl_target_option *targ_options;
16684 if (target_parts)
16685 targ_options = TREE_TARGET_OPTION (target_parts);
16686 else
16687 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16688 gcc_assert (targ_options);
16689
16690 const struct processor *this_arch
16691 = aarch64_get_arch (targ_options->x_explicit_arch);
16692
16693 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16694 std::string extension
16695 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16696 this_arch->flags);
16697 /* Only update the assembler .arch string if it is distinct from the last
16698 such string we printed. */
16699 std::string to_print = this_arch->name + extension;
16700 if (to_print != aarch64_last_printed_arch_string)
16701 {
16702 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16703 aarch64_last_printed_arch_string = to_print;
16704 }
16705
16706 /* Print the cpu name we're tuning for in the comments; it might be
16707 useful to readers of the generated asm. Do it only when it changes
16708 from function to function and verbose assembly is requested. */
16709 const struct processor *this_tune
16710 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16711
16712 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16713 {
16714 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16715 this_tune->name);
16716 aarch64_last_printed_tune_string = this_tune->name;
16717 }
16718
16719 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16720
16721 /* Don't forget the type directive for ELF. */
16722 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16723 ASM_OUTPUT_LABEL (stream, name);
16724 }
16725
16726 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16727
16728 void
16729 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16730 {
16731 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16732 const char *value = IDENTIFIER_POINTER (target);
16733 aarch64_asm_output_variant_pcs (stream, decl, name);
16734 ASM_OUTPUT_DEF (stream, name, value);
16735 }
16736
16737 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16738 function symbol references. */
16739
16740 void
16741 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16742 {
16743 default_elf_asm_output_external (stream, decl, name);
16744 aarch64_asm_output_variant_pcs (stream, decl, name);
16745 }
16746
16747 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16748 Used to output the .cfi_b_key_frame directive when signing the current
16749 function with the B key. */
16750
16751 void
16752 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16753 {
16754 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16755 && aarch64_ra_sign_key == AARCH64_KEY_B)
16756 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16757 }
16758
16759 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16760
16761 static void
16762 aarch64_start_file (void)
16763 {
16764 struct cl_target_option *default_options
16765 = TREE_TARGET_OPTION (target_option_default_node);
16766
16767 const struct processor *default_arch
16768 = aarch64_get_arch (default_options->x_explicit_arch);
16769 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16770 std::string extension
16771 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16772 default_arch->flags);
16773
16774 aarch64_last_printed_arch_string = default_arch->name + extension;
16775 aarch64_last_printed_tune_string = "";
16776 asm_fprintf (asm_out_file, "\t.arch %s\n",
16777 aarch64_last_printed_arch_string.c_str ());
16778
16779 default_file_start ();
16780 }
16781
16782 /* Emit load exclusive. */
16783
16784 static void
16785 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16786 rtx mem, rtx model_rtx)
16787 {
16788 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16789 }
16790
16791 /* Emit store exclusive. */
16792
16793 static void
16794 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16795 rtx rval, rtx mem, rtx model_rtx)
16796 {
16797 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16798 }
16799
16800 /* Mark the previous jump instruction as unlikely. */
16801
16802 static void
16803 aarch64_emit_unlikely_jump (rtx insn)
16804 {
16805 rtx_insn *jump = emit_jump_insn (insn);
16806 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16807 }
16808
16809 /* Expand a compare and swap pattern. */
16810
16811 void
16812 aarch64_expand_compare_and_swap (rtx operands[])
16813 {
16814 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16815 machine_mode mode, r_mode;
16816
16817 bval = operands[0];
16818 rval = operands[1];
16819 mem = operands[2];
16820 oldval = operands[3];
16821 newval = operands[4];
16822 is_weak = operands[5];
16823 mod_s = operands[6];
16824 mod_f = operands[7];
16825 mode = GET_MODE (mem);
16826
16827 /* Normally the succ memory model must be stronger than fail, but in the
16828 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16829 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16830 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16831 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16832 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16833
16834 r_mode = mode;
16835 if (mode == QImode || mode == HImode)
16836 {
16837 r_mode = SImode;
16838 rval = gen_reg_rtx (r_mode);
16839 }
16840
16841 if (TARGET_LSE)
16842 {
16843 /* The CAS insn requires oldval and rval overlap, but we need to
16844 have a copy of oldval saved across the operation to tell if
16845 the operation is successful. */
16846 if (reg_overlap_mentioned_p (rval, oldval))
16847 rval = copy_to_mode_reg (r_mode, oldval);
16848 else
16849 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16850
16851 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16852 newval, mod_s));
16853 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16854 }
16855 else
16856 {
16857 /* The oldval predicate varies by mode. Test it and force to reg. */
16858 insn_code code = code_for_aarch64_compare_and_swap (mode);
16859 if (!insn_data[code].operand[2].predicate (oldval, mode))
16860 oldval = force_reg (mode, oldval);
16861
16862 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16863 is_weak, mod_s, mod_f));
16864 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16865 }
16866
16867 if (r_mode != mode)
16868 rval = gen_lowpart (mode, rval);
16869 emit_move_insn (operands[1], rval);
16870
16871 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16872 emit_insn (gen_rtx_SET (bval, x));
16873 }
16874
16875 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16876 sequence implementing an atomic operation. */
16877
16878 static void
16879 aarch64_emit_post_barrier (enum memmodel model)
16880 {
16881 const enum memmodel base_model = memmodel_base (model);
16882
16883 if (is_mm_sync (model)
16884 && (base_model == MEMMODEL_ACQUIRE
16885 || base_model == MEMMODEL_ACQ_REL
16886 || base_model == MEMMODEL_SEQ_CST))
16887 {
16888 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16889 }
16890 }
16891
16892 /* Split a compare and swap pattern. */
16893
16894 void
16895 aarch64_split_compare_and_swap (rtx operands[])
16896 {
16897 rtx rval, mem, oldval, newval, scratch;
16898 machine_mode mode;
16899 bool is_weak;
16900 rtx_code_label *label1, *label2;
16901 rtx x, cond;
16902 enum memmodel model;
16903 rtx model_rtx;
16904
16905 rval = operands[0];
16906 mem = operands[1];
16907 oldval = operands[2];
16908 newval = operands[3];
16909 is_weak = (operands[4] != const0_rtx);
16910 model_rtx = operands[5];
16911 scratch = operands[7];
16912 mode = GET_MODE (mem);
16913 model = memmodel_from_int (INTVAL (model_rtx));
16914
16915 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16916 loop:
16917 .label1:
16918 LD[A]XR rval, [mem]
16919 CBNZ rval, .label2
16920 ST[L]XR scratch, newval, [mem]
16921 CBNZ scratch, .label1
16922 .label2:
16923 CMP rval, 0. */
16924 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16925
16926 label1 = NULL;
16927 if (!is_weak)
16928 {
16929 label1 = gen_label_rtx ();
16930 emit_label (label1);
16931 }
16932 label2 = gen_label_rtx ();
16933
16934 /* The initial load can be relaxed for a __sync operation since a final
16935 barrier will be emitted to stop code hoisting. */
16936 if (is_mm_sync (model))
16937 aarch64_emit_load_exclusive (mode, rval, mem,
16938 GEN_INT (MEMMODEL_RELAXED));
16939 else
16940 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16941
16942 if (strong_zero_p)
16943 {
16944 if (aarch64_track_speculation)
16945 {
16946 /* Emit an explicit compare instruction, so that we can correctly
16947 track the condition codes. */
16948 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16949 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16950 }
16951 else
16952 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16953
16954 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16955 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16956 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16957 }
16958 else
16959 {
16960 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16961 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16962 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16963 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16964 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16965 }
16966
16967 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16968
16969 if (!is_weak)
16970 {
16971 if (aarch64_track_speculation)
16972 {
16973 /* Emit an explicit compare instruction, so that we can correctly
16974 track the condition codes. */
16975 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16976 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16977 }
16978 else
16979 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16980
16981 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16982 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16983 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16984 }
16985 else
16986 {
16987 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16988 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16989 emit_insn (gen_rtx_SET (cond, x));
16990 }
16991
16992 emit_label (label2);
16993 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16994 to set the condition flags. If this is not used it will be removed by
16995 later passes. */
16996 if (strong_zero_p)
16997 {
16998 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16999 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
17000 emit_insn (gen_rtx_SET (cond, x));
17001 }
17002 /* Emit any final barrier needed for a __sync operation. */
17003 if (is_mm_sync (model))
17004 aarch64_emit_post_barrier (model);
17005 }
17006
17007 /* Split an atomic operation. */
17008
17009 void
17010 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17011 rtx value, rtx model_rtx, rtx cond)
17012 {
17013 machine_mode mode = GET_MODE (mem);
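 /* Narrow modes are widened to SImode so that the arithmetic below is
 done on a full W register. */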
17014 machine_mode wmode = (mode == DImode ? DImode : SImode);
17015 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17016 const bool is_sync = is_mm_sync (model);
17017 rtx_code_label *label;
17018 rtx x;
17019
17020 /* Split the atomic operation into a sequence. */
17021 label = gen_label_rtx ();
17022 emit_label (label);
17023
17024 if (new_out)
17025 new_out = gen_lowpart (wmode, new_out);
17026 if (old_out)
17027 old_out = gen_lowpart (wmode, old_out);
17028 else
17029 old_out = new_out;
17030 value = simplify_gen_subreg (wmode, value, mode, 0);
17031
17032 /* The initial load can be relaxed for a __sync operation since a final
17033 barrier will be emitted to stop code hoisting. */
17034 if (is_sync)
17035 aarch64_emit_load_exclusive (mode, old_out, mem,
17036 GEN_INT (MEMMODEL_RELAXED));
17037 else
17038 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17039
17040 switch (code)
17041 {
17042 case SET:
17043 new_out = value;
17044 break;
17045
17046 case NOT:
17047 x = gen_rtx_AND (wmode, old_out, value);
17048 emit_insn (gen_rtx_SET (new_out, x));
17049 x = gen_rtx_NOT (wmode, new_out);
17050 emit_insn (gen_rtx_SET (new_out, x));
17051 break;
17052
17053 case MINUS:
17054 if (CONST_INT_P (value))
17055 {
17056 value = GEN_INT (-INTVAL (value));
17057 code = PLUS;
17058 }
17059 /* Fall through. */
17060
17061 default:
17062 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17063 emit_insn (gen_rtx_SET (new_out, x));
17064 break;
17065 }
17066
17067 aarch64_emit_store_exclusive (mode, cond, mem,
17068 gen_lowpart (mode, new_out), model_rtx);
17069
17070 if (aarch64_track_speculation)
17071 {
17072 /* Emit an explicit compare instruction, so that we can correctly
17073 track the condition codes. */
17074 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17075 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17076 }
17077 else
17078 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17079
17080 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17081 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17082 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17083
17084 /* Emit any final barrier needed for a __sync operation. */
17085 if (is_sync)
17086 aarch64_emit_post_barrier (model);
17087 }
17088
17089 static void
17090 aarch64_init_libfuncs (void)
17091 {
17092 /* Half-precision float operations. The compiler handles all operations
17093 with NULL libfuncs by converting to SFmode. */
17094
17095 /* Conversions. */
17096 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17097 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17098
17099 /* Arithmetic. */
17100 set_optab_libfunc (add_optab, HFmode, NULL);
17101 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17102 set_optab_libfunc (smul_optab, HFmode, NULL);
17103 set_optab_libfunc (neg_optab, HFmode, NULL);
17104 set_optab_libfunc (sub_optab, HFmode, NULL);
17105
17106 /* Comparisons. */
17107 set_optab_libfunc (eq_optab, HFmode, NULL);
17108 set_optab_libfunc (ne_optab, HFmode, NULL);
17109 set_optab_libfunc (lt_optab, HFmode, NULL);
17110 set_optab_libfunc (le_optab, HFmode, NULL);
17111 set_optab_libfunc (ge_optab, HFmode, NULL);
17112 set_optab_libfunc (gt_optab, HFmode, NULL);
17113 set_optab_libfunc (unord_optab, HFmode, NULL);
17114 }
17115
17116 /* Target hook for c_mode_for_suffix. */
17117 static machine_mode
17118 aarch64_c_mode_for_suffix (char suffix)
17119 {
17120 if (suffix == 'q')
17121 return TFmode;
17122
17123 return VOIDmode;
17124 }
17125
17126 /* We can only represent floating point constants which will fit in
17127 "quarter-precision" values. These values are characterised by
17128 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17129 by:
17130
17131 (-1)^s * (n/16) * 2^r
17132
17133 Where:
17134 's' is the sign bit.
17135 'n' is an integer in the range 16 <= n <= 31.
17136 'r' is an integer in the range -3 <= r <= 4. */
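/* For example, 1.0 (16/16 * 2^0), 0.125 (16/16 * 2^-3) and 31.0
   (31/16 * 2^4) are representable, but 0.0 and 1/3 are not. */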
17137
17138 /* Return true iff X can be represented by a quarter-precision
17139 floating point immediate operand. Note, we cannot represent 0.0. */
17140 bool
17141 aarch64_float_const_representable_p (rtx x)
17142 {
17143 /* This represents our current view of how many bits
17144 make up the mantissa. */
17145 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17146 int exponent;
17147 unsigned HOST_WIDE_INT mantissa, mask;
17148 REAL_VALUE_TYPE r, m;
17149 bool fail;
17150
17151 x = unwrap_const_vec_duplicate (x);
17152 if (!CONST_DOUBLE_P (x))
17153 return false;
17154
17155 if (GET_MODE (x) == VOIDmode
17156 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17157 return false;
17158
17159 r = *CONST_DOUBLE_REAL_VALUE (x);
17160
17161 /* We cannot represent infinities, NaNs or +/-zero. We won't
17162 know if we have +zero until we analyse the mantissa, but we
17163 can reject the other invalid values. */
17164 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17165 || REAL_VALUE_MINUS_ZERO (r))
17166 return false;
17167
17168 /* Extract exponent. */
17169 r = real_value_abs (&r);
17170 exponent = REAL_EXP (&r);
17171
17172 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17173 highest (sign) bit, with a fixed binary point at bit point_pos.
17174 The low element of W holds the low part of the mantissa, the high
 element the high part.
17175 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17176 bits for the mantissa, this can fail (low bits will be lost). */
17177 real_ldexp (&m, &r, point_pos - exponent);
17178 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17179
17180 /* If the low part of the mantissa has bits set we cannot represent
17181 the value. */
17182 if (w.ulow () != 0)
17183 return false;
17184 /* We have rejected the lower HOST_WIDE_INT, so update our
17185 understanding of how many bits lie in the mantissa and
17186 look only at the high HOST_WIDE_INT. */
17187 mantissa = w.elt (1);
17188 point_pos -= HOST_BITS_PER_WIDE_INT;
17189
17190 /* We can only represent values with a mantissa of the form 1.xxxx. */
17191 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17192 if ((mantissa & mask) != 0)
17193 return false;
17194
17195 /* Having filtered unrepresentable values, we may now remove all
17196 but the highest 5 bits. */
17197 mantissa >>= point_pos - 5;
17198
17199 /* We cannot represent the value 0.0, so reject it. This is handled
17200 elsewhere. */
17201 if (mantissa == 0)
17202 return false;
17203
17204 /* Then, as bit 4 is always set, we can mask it off, leaving
17205 the mantissa in the range [0, 15]. */
17206 mantissa &= ~(1 << 4);
17207 gcc_assert (mantissa <= 15);
17208
17209 /* GCC internally does not use IEEE754-like encoding (where normalized
17210 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17211 Our mantissa values are shifted 4 places to the left relative to
17212 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17213 by 5 places to correct for GCC's representation. */
17214 exponent = 5 - exponent;
17215
17216 return (exponent >= 0 && exponent <= 7);
17217 }
17218
17219 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17220 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17221 output MOVI/MVNI, ORR or BIC immediate. */
17222 char*
17223 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17224 enum simd_immediate_check which)
17225 {
17226 bool is_valid;
17227 static char templ[40];
17228 const char *mnemonic;
17229 const char *shift_op;
17230 unsigned int lane_count = 0;
17231 char element_char;
17232
17233 struct simd_immediate_info info;
17234
17235 /* This will return true to show const_vector is legal for use as either
17236 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17237 It will also update INFO to show how the immediate should be generated.
17238 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17239 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17240 gcc_assert (is_valid);
17241
17242 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17243 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17244
17245 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17246 {
17247 gcc_assert (info.insn == simd_immediate_info::MOV
17248 && info.u.mov.shift == 0);
17249 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17250 move immediate path. */
17251 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17252 info.u.mov.value = GEN_INT (0);
17253 else
17254 {
17255 const unsigned int buf_size = 20;
17256 char float_buf[buf_size] = {'\0'};
17257 real_to_decimal_for_mode (float_buf,
17258 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17259 buf_size, buf_size, 1, info.elt_mode);
17260
17261 if (lane_count == 1)
17262 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17263 else
17264 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17265 lane_count, element_char, float_buf);
17266 return templ;
17267 }
17268 }
17269
17270 gcc_assert (CONST_INT_P (info.u.mov.value));
17271
17272 if (which == AARCH64_CHECK_MOV)
17273 {
17274 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17275 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17276 ? "msl" : "lsl");
17277 if (lane_count == 1)
17278 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17279 mnemonic, UINTVAL (info.u.mov.value));
17280 else if (info.u.mov.shift)
17281 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17282 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17283 element_char, UINTVAL (info.u.mov.value), shift_op,
17284 info.u.mov.shift);
17285 else
17286 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17287 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17288 element_char, UINTVAL (info.u.mov.value));
17289 }
17290 else
17291 {
17292 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17293 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17294 if (info.u.mov.shift)
17295 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17296 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17297 element_char, UINTVAL (info.u.mov.value), "lsl",
17298 info.u.mov.shift);
17299 else
17300 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17301 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17302 element_char, UINTVAL (info.u.mov.value));
17303 }
17304 return templ;
17305 }
17306
17307 char*
17308 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17309 {
17310
17311 /* If a floating point number was passed and we desire to use it in an
17312 integer mode, do the conversion to integer. */
17313 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17314 {
17315 unsigned HOST_WIDE_INT ival;
17316 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17317 gcc_unreachable ();
17318 immediate = gen_int_mode (ival, mode);
17319 }
17320
17321 machine_mode vmode;
17322 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
17323 a 128-bit vector mode. */
17324 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17325
17326 vmode = aarch64_simd_container_mode (mode, width);
17327 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17328 return aarch64_output_simd_mov_immediate (v_op, width);
17329 }
17330
17331 /* Return the output string to use for moving immediate CONST_VECTOR
17332 into an SVE register. */
17333
17334 char *
17335 aarch64_output_sve_mov_immediate (rtx const_vector)
17336 {
17337 static char templ[40];
17338 struct simd_immediate_info info;
17339 char element_char;
17340
17341 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17342 gcc_assert (is_valid);
17343
17344 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17345
17346 machine_mode vec_mode = GET_MODE (const_vector);
17347 if (aarch64_sve_pred_mode_p (vec_mode))
17348 {
17349 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17350 if (info.insn == simd_immediate_info::MOV)
17351 {
17352 gcc_assert (info.u.mov.value == const0_rtx);
17353 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17354 }
17355 else
17356 {
17357 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17358 unsigned int total_bytes;
17359 if (info.u.pattern == AARCH64_SV_ALL
17360 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17361 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17362 total_bytes / GET_MODE_SIZE (info.elt_mode));
17363 else
17364 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17365 svpattern_token (info.u.pattern));
17366 }
17367 return buf;
17368 }
17369
17370 if (info.insn == simd_immediate_info::INDEX)
17371 {
17372 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17373 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17374 element_char, INTVAL (info.u.index.base),
17375 INTVAL (info.u.index.step));
17376 return templ;
17377 }
17378
17379 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17380 {
17381 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17382 info.u.mov.value = GEN_INT (0);
17383 else
17384 {
17385 const int buf_size = 20;
17386 char float_buf[buf_size] = {};
17387 real_to_decimal_for_mode (float_buf,
17388 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17389 buf_size, buf_size, 1, info.elt_mode);
17390
17391 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17392 element_char, float_buf);
17393 return templ;
17394 }
17395 }
17396
17397 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17398 element_char, INTVAL (info.u.mov.value));
17399 return templ;
17400 }
17401
17402 /* Split operands into moves from op[1] + op[2] into op[0]. */
17403
17404 void
17405 aarch64_split_combinev16qi (rtx operands[3])
17406 {
17407 unsigned int dest = REGNO (operands[0]);
17408 unsigned int src1 = REGNO (operands[1]);
17409 unsigned int src2 = REGNO (operands[2]);
17410 machine_mode halfmode = GET_MODE (operands[1]);
17411 unsigned int halfregs = REG_NREGS (operands[1]);
17412 rtx destlo, desthi;
17413
17414 gcc_assert (halfmode == V16QImode);
17415
17416 if (src1 == dest && src2 == dest + halfregs)
17417 {
17418 /* No-op move. Can't split to nothing; emit something. */
17419 emit_note (NOTE_INSN_DELETED);
17420 return;
17421 }
17422
17423 /* Preserve register attributes for variable tracking. */
17424 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17425 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17426 GET_MODE_SIZE (halfmode));
17427
17428 /* Special case of reversed high/low parts. */
17429 if (reg_overlap_mentioned_p (operands[2], destlo)
17430 && reg_overlap_mentioned_p (operands[1], desthi))
17431 {
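 /* The three XORs below swap operands[1] and operands[2] in place,
 avoiding the need for a scratch register. */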
17432 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17433 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17434 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17435 }
17436 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17437 {
17438 /* Try to avoid unnecessary moves if part of the result
17439 is in the right place already. */
17440 if (src1 != dest)
17441 emit_move_insn (destlo, operands[1]);
17442 if (src2 != dest + halfregs)
17443 emit_move_insn (desthi, operands[2]);
17444 }
17445 else
17446 {
17447 if (src2 != dest + halfregs)
17448 emit_move_insn (desthi, operands[2]);
17449 if (src1 != dest)
17450 emit_move_insn (destlo, operands[1]);
17451 }
17452 }
17453
17454 /* vec_perm support. */
17455
17456 struct expand_vec_perm_d
17457 {
17458 rtx target, op0, op1;
17459 vec_perm_indices perm;
17460 machine_mode vmode;
17461 unsigned int vec_flags;
17462 bool one_vector_p;
17463 bool testing_p;
17464 };
17465
17466 /* Generate a variable permutation. */
17467
17468 static void
17469 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17470 {
17471 machine_mode vmode = GET_MODE (target);
17472 bool one_vector_p = rtx_equal_p (op0, op1);
17473
17474 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17475 gcc_checking_assert (GET_MODE (op0) == vmode);
17476 gcc_checking_assert (GET_MODE (op1) == vmode);
17477 gcc_checking_assert (GET_MODE (sel) == vmode);
17478 gcc_checking_assert (TARGET_SIMD);
17479
17480 if (one_vector_p)
17481 {
17482 if (vmode == V8QImode)
17483 {
17484 /* Expand the argument to a V16QI mode by duplicating it. */
17485 rtx pair = gen_reg_rtx (V16QImode);
17486 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17487 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17488 }
17489 else
17490 {
17491 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17492 }
17493 }
17494 else
17495 {
17496 rtx pair;
17497
17498 if (vmode == V8QImode)
17499 {
17500 pair = gen_reg_rtx (V16QImode);
17501 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17502 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17503 }
17504 else
17505 {
17506 pair = gen_reg_rtx (OImode);
17507 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17508 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17509 }
17510 }
17511 }
17512
17513 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17514 NELT is the number of elements in the vector. */
17515
17516 void
17517 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17518 unsigned int nelt)
17519 {
17520 machine_mode vmode = GET_MODE (target);
17521 bool one_vector_p = rtx_equal_p (op0, op1);
17522 rtx mask;
17523
17524 /* The TBL instruction does not use a modulo index, so we must take care
17525 of that ourselves. */
17526 mask = aarch64_simd_gen_const_vector_dup (vmode,
17527 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17528 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17529
17530 /* For big-endian, we also need to reverse the index within the vector
17531 (but not which vector). */
17532 if (BYTES_BIG_ENDIAN)
17533 {
17534 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17535 if (!one_vector_p)
17536 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17537 sel = expand_simple_binop (vmode, XOR, sel, mask,
17538 NULL, 0, OPTAB_LIB_WIDEN);
17539 }
17540 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17541 }
17542
17543 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17544
17545 static void
17546 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17547 {
17548 emit_insn (gen_rtx_SET (target,
17549 gen_rtx_UNSPEC (GET_MODE (target),
17550 gen_rtvec (2, op0, op1), code)));
17551 }
17552
17553 /* Expand an SVE vec_perm with the given operands. */
17554
17555 void
17556 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17557 {
17558 machine_mode data_mode = GET_MODE (target);
17559 machine_mode sel_mode = GET_MODE (sel);
17560 /* Enforced by the pattern condition. */
17561 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17562
17563 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17564 size of the two value vectors, i.e. the upper bits of the indices
17565 are effectively ignored. SVE TBL instead produces 0 for any
17566 out-of-range indices, so we need to modulo all the vec_perm indices
17567 to ensure they are all in range. */
17568 rtx sel_reg = force_reg (sel_mode, sel);
17569
17570 /* Check if the sel only references the first values vector. */
17571 if (GET_CODE (sel) == CONST_VECTOR
17572 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17573 {
17574 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17575 return;
17576 }
17577
17578 /* Check if the two values vectors are the same. */
17579 if (rtx_equal_p (op0, op1))
17580 {
17581 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17582 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17583 NULL, 0, OPTAB_DIRECT);
17584 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17585 return;
17586 }
17587
17588 /* Run a TBL on each value vector and combine the results. */
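/* For example, with four elements per vector and a selector of
   { 1, 4, 6, 3 }, the first TBL gives { op0[1], 0, 0, op0[3] } and the
   second TBL, indexed by the selector minus 4 (out-of-range values
   select 0), gives { 0, op1[0], op1[2], 0 }; ORing the two produces
   the required { op0[1], op1[0], op1[2], op0[3] }.  */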
17589
17590 rtx res0 = gen_reg_rtx (data_mode);
17591 rtx res1 = gen_reg_rtx (data_mode);
17592 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17593 if (GET_CODE (sel) != CONST_VECTOR
17594 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17595 {
17596 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17597 2 * nunits - 1);
17598 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17599 NULL, 0, OPTAB_DIRECT);
17600 }
17601 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17602 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17603 NULL, 0, OPTAB_DIRECT);
17604 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17605 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17606 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17607 else
17608 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17609 }
17610
17611 /* Recognize patterns suitable for the TRN instructions. */
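/* For example, with V4SImode inputs (indices 4-7 referring to the second
   input), the permute { 0, 4, 2, 6 } maps to TRN1 and { 1, 5, 3, 7 }
   maps to TRN2.  */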
17612 static bool
17613 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17614 {
17615 HOST_WIDE_INT odd;
17616 poly_uint64 nelt = d->perm.length ();
17617 rtx out, in0, in1, x;
17618 machine_mode vmode = d->vmode;
17619
17620 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17621 return false;
17622
17623 /* Note that these are little-endian tests.
17624 We correct for big-endian later. */
17625 if (!d->perm[0].is_constant (&odd)
17626 || (odd != 0 && odd != 1)
17627 || !d->perm.series_p (0, 2, odd, 2)
17628 || !d->perm.series_p (1, 2, nelt + odd, 2))
17629 return false;
17630
17631 /* Success! */
17632 if (d->testing_p)
17633 return true;
17634
17635 in0 = d->op0;
17636 in1 = d->op1;
17637 /* We don't need a big-endian lane correction for SVE; see the comment
17638 at the head of aarch64-sve.md for details. */
17639 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17640 {
17641 x = in0, in0 = in1, in1 = x;
17642 odd = !odd;
17643 }
17644 out = d->target;
17645
17646 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17647 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17648 return true;
17649 }
17650
17651 /* Recognize patterns suitable for the UZP instructions. */
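/* For example, with V4SImode inputs the permute { 0, 2, 4, 6 } maps to
   UZP1 and { 1, 3, 5, 7 } maps to UZP2.  */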
17652 static bool
17653 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17654 {
17655 HOST_WIDE_INT odd;
17656 rtx out, in0, in1, x;
17657 machine_mode vmode = d->vmode;
17658
17659 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17660 return false;
17661
17662 /* Note that these are little-endian tests.
17663 We correct for big-endian later. */
17664 if (!d->perm[0].is_constant (&odd)
17665 || (odd != 0 && odd != 1)
17666 || !d->perm.series_p (0, 1, odd, 2))
17667 return false;
17668
17669 /* Success! */
17670 if (d->testing_p)
17671 return true;
17672
17673 in0 = d->op0;
17674 in1 = d->op1;
17675 /* We don't need a big-endian lane correction for SVE; see the comment
17676 at the head of aarch64-sve.md for details. */
17677 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17678 {
17679 x = in0, in0 = in1, in1 = x;
17680 odd = !odd;
17681 }
17682 out = d->target;
17683
17684 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17685 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17686 return true;
17687 }
17688
17689 /* Recognize patterns suitable for the ZIP instructions. */
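/* For example, with V4SImode inputs the permute { 0, 4, 1, 5 } maps to
   ZIP1 and { 2, 6, 3, 7 } maps to ZIP2.  */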
17690 static bool
17691 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17692 {
17693 unsigned int high;
17694 poly_uint64 nelt = d->perm.length ();
17695 rtx out, in0, in1, x;
17696 machine_mode vmode = d->vmode;
17697
17698 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17699 return false;
17700
17701 /* Note that these are little-endian tests.
17702 We correct for big-endian later. */
17703 poly_uint64 first = d->perm[0];
17704 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17705 || !d->perm.series_p (0, 2, first, 1)
17706 || !d->perm.series_p (1, 2, first + nelt, 1))
17707 return false;
17708 high = maybe_ne (first, 0U);
17709
17710 /* Success! */
17711 if (d->testing_p)
17712 return true;
17713
17714 in0 = d->op0;
17715 in1 = d->op1;
17716 /* We don't need a big-endian lane correction for SVE; see the comment
17717 at the head of aarch64-sve.md for details. */
17718 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17719 {
17720 x = in0, in0 = in1, in1 = x;
17721 high = !high;
17722 }
17723 out = d->target;
17724
17725 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17726 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17727 return true;
17728 }
17729
17730 /* Recognize patterns for the EXT insn. */
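/* For example, with V4SImode inputs the permute { 1, 2, 3, 4 } is a
   single EXT of the two inputs with an element offset of 1 (a byte
   offset of 4).  */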
17731
17732 static bool
17733 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17734 {
17735 HOST_WIDE_INT location;
17736 rtx offset;
17737
17738 /* The first element always refers to the first vector.
17739 Check if the extracted indices are increasing by one. */
17740 if (d->vec_flags == VEC_SVE_PRED
17741 || !d->perm[0].is_constant (&location)
17742 || !d->perm.series_p (0, 1, location, 1))
17743 return false;
17744
17745 /* Success! */
17746 if (d->testing_p)
17747 return true;
17748
17749 /* The case where (location == 0) is a no-op for both big- and little-endian,
17750 and is removed by the mid-end at optimization levels -O1 and higher.
17751
17752 We don't need a big-endian lane correction for SVE; see the comment
17753 at the head of aarch64-sve.md for details. */
17754 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17755 {
17756 /* After setup, we want the high elements of the first vector (stored
17757 at the LSB end of the register), and the low elements of the second
17758 vector (stored at the MSB end of the register). So swap. */
17759 std::swap (d->op0, d->op1);
17760 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17761 to_constant () is safe since this is restricted to Advanced SIMD
17762 vectors. */
17763 location = d->perm.length ().to_constant () - location;
17764 }
17765
17766 offset = GEN_INT (location);
17767 emit_set_insn (d->target,
17768 gen_rtx_UNSPEC (d->vmode,
17769 gen_rtvec (3, d->op0, d->op1, offset),
17770 UNSPEC_EXT));
17771 return true;
17772 }
17773
17774 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17775 within each 64-bit, 32-bit or 16-bit granule. */
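/* For example, a V8HImode permute of { 3, 2, 1, 0, 7, 6, 5, 4 } reverses
   the 16-bit elements within each 64-bit granule and so maps to REV64.  */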
17776
17777 static bool
17778 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17779 {
17780 HOST_WIDE_INT diff;
17781 unsigned int i, size, unspec;
17782 machine_mode pred_mode;
17783
17784 if (d->vec_flags == VEC_SVE_PRED
17785 || !d->one_vector_p
17786 || !d->perm[0].is_constant (&diff))
17787 return false;
17788
17789 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17790 if (size == 8)
17791 {
17792 unspec = UNSPEC_REV64;
17793 pred_mode = VNx2BImode;
17794 }
17795 else if (size == 4)
17796 {
17797 unspec = UNSPEC_REV32;
17798 pred_mode = VNx4BImode;
17799 }
17800 else if (size == 2)
17801 {
17802 unspec = UNSPEC_REV16;
17803 pred_mode = VNx8BImode;
17804 }
17805 else
17806 return false;
17807
17808 unsigned int step = diff + 1;
17809 for (i = 0; i < step; ++i)
17810 if (!d->perm.series_p (i, step, diff - i, step))
17811 return false;
17812
17813 /* Success! */
17814 if (d->testing_p)
17815 return true;
17816
17817 if (d->vec_flags == VEC_SVE_DATA)
17818 {
17819 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17820 rtx target = gen_reg_rtx (int_mode);
17821 if (BYTES_BIG_ENDIAN)
17822 /* The act of taking a subreg between INT_MODE and d->vmode
17823 is itself a reversing operation on big-endian targets;
17824 see the comment at the head of aarch64-sve.md for details.
17825 First reinterpret OP0 as INT_MODE without using a subreg
17826 and without changing the contents. */
17827 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17828 else
17829 {
17830 /* For SVE we use REV[BHW] unspecs derived from the element size
17831 of d->vmode and vector modes whose elements have SIZE bytes.
17832 This ensures that the vector modes match the predicate modes. */
17833 int unspec = aarch64_sve_rev_unspec (d->vmode);
17834 rtx pred = aarch64_ptrue_reg (pred_mode);
17835 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17836 gen_lowpart (int_mode, d->op0)));
17837 }
17838 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17839 return true;
17840 }
17841 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17842 emit_set_insn (d->target, src);
17843 return true;
17844 }
17845
17846 /* Recognize patterns for the REV insn, which reverses elements within
17847 a full vector. */
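/* For example, for an SVE vector of N elements the permute
   { N-1, N-2, ..., 1, 0 } maps to a single REV.  */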
17848
17849 static bool
17850 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17851 {
17852 poly_uint64 nelt = d->perm.length ();
17853
17854 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17855 return false;
17856
17857 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17858 return false;
17859
17860 /* Success! */
17861 if (d->testing_p)
17862 return true;
17863
17864 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17865 emit_set_insn (d->target, src);
17866 return true;
17867 }
17868
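/* Recognize permutes that broadcast a single element of the input vector,
   which can be implemented with a DUP of that lane.  For example, for
   V4SImode the permute { 2, 2, 2, 2 } maps to DUP Vd.4S, Vn.S[2].  */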
17869 static bool
17870 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17871 {
17872 rtx out = d->target;
17873 rtx in0;
17874 HOST_WIDE_INT elt;
17875 machine_mode vmode = d->vmode;
17876 rtx lane;
17877
17878 if (d->vec_flags == VEC_SVE_PRED
17879 || d->perm.encoding ().encoded_nelts () != 1
17880 || !d->perm[0].is_constant (&elt))
17881 return false;
17882
17883 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17884 return false;
17885
17886 /* Success! */
17887 if (d->testing_p)
17888 return true;
17889
17890 /* The generic preparation in aarch64_expand_vec_perm_const_1
17891 swaps the operand order and the permute indices if it finds
17892 d->perm[0] to be in the second operand. Thus, we can always
17893 use d->op0 and need not do any extra arithmetic to get the
17894 correct lane number. */
17895 in0 = d->op0;
17896 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17897
17898 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17899 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17900 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17901 return true;
17902 }
17903
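/* Try to implement D using an Advanced SIMD TBL instruction with a
   constant byte selector.  */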
17904 static bool
17905 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17906 {
17907 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17908 machine_mode vmode = d->vmode;
17909
17910 /* Make sure that the indices are constant. */
17911 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17912 for (unsigned int i = 0; i < encoded_nelts; ++i)
17913 if (!d->perm[i].is_constant ())
17914 return false;
17915
17916 if (d->testing_p)
17917 return true;
17918
17919 /* Generic code will try constant permutation twice: once with the
17920 original mode and again with the elements lowered to QImode.
17921 So wait and don't do the selector expansion ourselves. */
17922 if (vmode != V8QImode && vmode != V16QImode)
17923 return false;
17924
17925 /* to_constant is safe since this routine is specific to Advanced SIMD
17926 vectors. */
17927 unsigned int nelt = d->perm.length ().to_constant ();
17928 for (unsigned int i = 0; i < nelt; ++i)
17929 /* If big-endian and two vectors we end up with a weird mixed-endian
17930 mode on NEON. Reverse the index within each word but not the word
17931 itself. to_constant is safe because we checked is_constant above. */
17932 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17933 ? d->perm[i].to_constant () ^ (nelt - 1)
17934 : d->perm[i].to_constant ());
17935
17936 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17937 sel = force_reg (vmode, sel);
17938
17939 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17940 return true;
17941 }
17942
17943 /* Try to implement D using an SVE TBL instruction. */
17944
17945 static bool
17946 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17947 {
17948 unsigned HOST_WIDE_INT nelt;
17949
17950 /* Permuting two variable-length vectors could overflow the
17951 index range. */
17952 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17953 return false;
17954
17955 if (d->testing_p)
17956 return true;
17957
17958 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17959 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17960 if (d->one_vector_p)
17961 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17962 else
17963 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17964 return true;
17965 }
17966
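/* Try to expand the constant permute described by D, first using the
   single-instruction patterns above and then falling back to TBL.
   Return true on success.  */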
17967 static bool
17968 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17969 {
17970 /* The pattern matching functions above are written to look for a small
17971 number to begin the sequence (0, 1, N/2). If we begin with an index
17972 from the second operand, we can swap the operands. */
17973 poly_int64 nelt = d->perm.length ();
17974 if (known_ge (d->perm[0], nelt))
17975 {
17976 d->perm.rotate_inputs (1);
17977 std::swap (d->op0, d->op1);
17978 }
17979
17980 if ((d->vec_flags == VEC_ADVSIMD
17981 || d->vec_flags == VEC_SVE_DATA
17982 || d->vec_flags == VEC_SVE_PRED)
17983 && known_gt (nelt, 1))
17984 {
17985 if (aarch64_evpc_rev_local (d))
17986 return true;
17987 else if (aarch64_evpc_rev_global (d))
17988 return true;
17989 else if (aarch64_evpc_ext (d))
17990 return true;
17991 else if (aarch64_evpc_dup (d))
17992 return true;
17993 else if (aarch64_evpc_zip (d))
17994 return true;
17995 else if (aarch64_evpc_uzp (d))
17996 return true;
17997 else if (aarch64_evpc_trn (d))
17998 return true;
17999 if (d->vec_flags == VEC_SVE_DATA)
18000 return aarch64_evpc_sve_tbl (d);
18001 else if (d->vec_flags == VEC_ADVSIMD)
18002 return aarch64_evpc_tbl (d);
18003 }
18004 return false;
18005 }
18006
18007 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18008
18009 static bool
18010 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18011 rtx op1, const vec_perm_indices &sel)
18012 {
18013 struct expand_vec_perm_d d;
18014
18015 /* Check whether the mask can be applied to a single vector. */
18016 if (sel.ninputs () == 1
18017 || (op0 && rtx_equal_p (op0, op1)))
18018 d.one_vector_p = true;
18019 else if (sel.all_from_input_p (0))
18020 {
18021 d.one_vector_p = true;
18022 op1 = op0;
18023 }
18024 else if (sel.all_from_input_p (1))
18025 {
18026 d.one_vector_p = true;
18027 op0 = op1;
18028 }
18029 else
18030 d.one_vector_p = false;
18031
18032 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18033 sel.nelts_per_input ());
18034 d.vmode = vmode;
18035 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18036 d.target = target;
18037 d.op0 = op0;
18038 d.op1 = op1;
18039 d.testing_p = !target;
18040
18041 if (!d.testing_p)
18042 return aarch64_expand_vec_perm_const_1 (&d);
18043
18044 rtx_insn *last = get_last_insn ();
18045 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18046 gcc_assert (last == get_last_insn ());
18047
18048 return ret;
18049 }
18050
18051 /* Generate a byte permute mask for a register of mode MODE,
18052 which has NUNITS units. */
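/* For example, for V8HImode (eight 2-byte units) the mask is
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, which swaps
   the two bytes within each unit.  */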
18053
18054 rtx
18055 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18056 {
18057 /* We have to reverse each vector because we don't have
18058 a permuted load that can reverse-load according to ABI rules. */
18059 rtx mask;
18060 rtvec v = rtvec_alloc (16);
18061 unsigned int i, j;
18062 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18063
18064 gcc_assert (BYTES_BIG_ENDIAN);
18065 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18066
18067 for (i = 0; i < nunits; i++)
18068 for (j = 0; j < usize; j++)
18069 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18070 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18071 return force_reg (V16QImode, mask);
18072 }
18073
18074 /* Expand an SVE integer comparison using the SVE equivalent of:
18075
18076 (set TARGET (CODE OP0 OP1)). */
18077
18078 void
18079 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18080 {
18081 machine_mode pred_mode = GET_MODE (target);
18082 machine_mode data_mode = GET_MODE (op0);
18083 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18084 op0, op1);
18085 if (!rtx_equal_p (target, res))
18086 emit_move_insn (target, res);
18087 }
18088
18089 /* Return the UNSPEC_COND_* code for comparison CODE. */
18090
18091 static unsigned int
18092 aarch64_unspec_cond_code (rtx_code code)
18093 {
18094 switch (code)
18095 {
18096 case NE:
18097 return UNSPEC_COND_FCMNE;
18098 case EQ:
18099 return UNSPEC_COND_FCMEQ;
18100 case LT:
18101 return UNSPEC_COND_FCMLT;
18102 case GT:
18103 return UNSPEC_COND_FCMGT;
18104 case LE:
18105 return UNSPEC_COND_FCMLE;
18106 case GE:
18107 return UNSPEC_COND_FCMGE;
18108 case UNORDERED:
18109 return UNSPEC_COND_FCMUO;
18110 default:
18111 gcc_unreachable ();
18112 }
18113 }
18114
18115 /* Emit:
18116
18117 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18118
18119 where <X> is the operation associated with comparison CODE.
18120 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18121
18122 static void
18123 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18124 bool known_ptrue_p, rtx op0, rtx op1)
18125 {
18126 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18127 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18128 gen_rtvec (4, pred, flag, op0, op1),
18129 aarch64_unspec_cond_code (code));
18130 emit_set_insn (target, unspec);
18131 }
18132
18133 /* Emit the SVE equivalent of:
18134
18135 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18136 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18137 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18138
18139 where <Xi> is the operation associated with comparison CODEi.
18140 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18141
18142 static void
18143 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18144 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18145 {
18146 machine_mode pred_mode = GET_MODE (pred);
18147 rtx tmp1 = gen_reg_rtx (pred_mode);
18148 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18149 rtx tmp2 = gen_reg_rtx (pred_mode);
18150 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18151 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18152 }
18153
18154 /* Emit the SVE equivalent of:
18155
18156 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18157 (set TARGET (not TMP))
18158
18159 where <X> is the operation associated with comparison CODE.
18160 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18161
18162 static void
18163 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18164 bool known_ptrue_p, rtx op0, rtx op1)
18165 {
18166 machine_mode pred_mode = GET_MODE (pred);
18167 rtx tmp = gen_reg_rtx (pred_mode);
18168 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18169 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18170 }
18171
18172 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18173
18174 (set TARGET (CODE OP0 OP1))
18175
18176 If CAN_INVERT_P is true, the caller can also handle inverted results;
18177 return true if the result is in fact inverted. */
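/* For example, with -ftrapping-math an UNLT comparison is expanded by
   first computing the predicate of ordered elements (an inverted FCMUO),
   then testing GE under that predicate, and finally inverting the result
   (or reporting the inversion to the caller via the return value).  */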
18178
18179 bool
18180 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18181 rtx op0, rtx op1, bool can_invert_p)
18182 {
18183 machine_mode pred_mode = GET_MODE (target);
18184 machine_mode data_mode = GET_MODE (op0);
18185
18186 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18187 switch (code)
18188 {
18189 case UNORDERED:
18190 /* UNORDERED has no immediate form. */
18191 op1 = force_reg (data_mode, op1);
18192 /* fall through */
18193 case LT:
18194 case LE:
18195 case GT:
18196 case GE:
18197 case EQ:
18198 case NE:
18199 {
18200 /* There is native support for the comparison. */
18201 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18202 return false;
18203 }
18204
18205 case LTGT:
18206 /* This is a trapping operation (LT or GT). */
18207 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18208 return false;
18209
18210 case UNEQ:
18211 if (!flag_trapping_math)
18212 {
18213 /* This would trap for signaling NaNs. */
18214 op1 = force_reg (data_mode, op1);
18215 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18216 ptrue, true, op0, op1);
18217 return false;
18218 }
18219 /* fall through */
18220 case UNLT:
18221 case UNLE:
18222 case UNGT:
18223 case UNGE:
18224 if (flag_trapping_math)
18225 {
18226 /* Work out which elements are ordered. */
18227 rtx ordered = gen_reg_rtx (pred_mode);
18228 op1 = force_reg (data_mode, op1);
18229 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18230 ptrue, true, op0, op1);
18231
18232 /* Test the opposite condition for the ordered elements,
18233 then invert the result. */
18234 if (code == UNEQ)
18235 code = NE;
18236 else
18237 code = reverse_condition_maybe_unordered (code);
18238 if (can_invert_p)
18239 {
18240 aarch64_emit_sve_fp_cond (target, code,
18241 ordered, false, op0, op1);
18242 return true;
18243 }
18244 aarch64_emit_sve_invert_fp_cond (target, code,
18245 ordered, false, op0, op1);
18246 return false;
18247 }
18248 break;
18249
18250 case ORDERED:
18251 /* ORDERED has no immediate form. */
18252 op1 = force_reg (data_mode, op1);
18253 break;
18254
18255 default:
18256 gcc_unreachable ();
18257 }
18258
18259 /* There is native support for the inverse comparison. */
18260 code = reverse_condition_maybe_unordered (code);
18261 if (can_invert_p)
18262 {
18263 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18264 return true;
18265 }
18266 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18267 return false;
18268 }
18269
18270 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18271 of the data being selected and CMP_MODE is the mode of the values being
18272 compared. */
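/* For example, a floating-point vcond becomes a predicated floating-point
   compare followed by a SEL that picks between OPS[1] and OPS[2] under
   the resulting predicate.  */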
18273
18274 void
18275 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18276 rtx *ops)
18277 {
18278 machine_mode pred_mode
18279 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18280 GET_MODE_SIZE (cmp_mode)).require ();
18281 rtx pred = gen_reg_rtx (pred_mode);
18282 if (FLOAT_MODE_P (cmp_mode))
18283 {
18284 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18285 ops[4], ops[5], true))
18286 std::swap (ops[1], ops[2]);
18287 }
18288 else
18289 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18290
18291 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18292 ops[1] = force_reg (data_mode, ops[1]);
18293 /* The "false" value can only be zero if the "true" value is a constant. */
18294 if (register_operand (ops[1], data_mode)
18295 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18296 ops[2] = force_reg (data_mode, ops[2]);
18297
18298 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18299 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18300 }
18301
18302 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18303 true. However, due to issues with register allocation it is preferable
18304 to avoid tying integer scalar and FP scalar modes. Executing integer
18305 operations in general registers is better than treating them as scalar
18306 vector operations. This reduces latency and avoids redundant int<->FP
18307 moves. So tie modes if they are either the same class, or vector modes
18308 with other vector modes, vector structs or any scalar mode. */
18309
18310 static bool
18311 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18312 {
18313 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18314 return true;
18315
18316 /* We specifically want to allow elements of "structure" modes to
18317 be tieable to the structure. This more general condition allows
18318 other rarer situations too. The reason we don't extend this to
18319 predicate modes is that there are no predicate structure modes
18320 nor any specific instructions for extracting part of a predicate
18321 register. */
18322 if (aarch64_vector_data_mode_p (mode1)
18323 && aarch64_vector_data_mode_p (mode2))
18324 return true;
18325
18326 /* Also allow any scalar modes with vectors. */
18327 if (aarch64_vector_mode_supported_p (mode1)
18328 || aarch64_vector_mode_supported_p (mode2))
18329 return true;
18330
18331 return false;
18332 }
18333
18334 /* Return a new RTX holding the result of moving POINTER forward by
18335 AMOUNT bytes. */
18336
18337 static rtx
18338 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18339 {
18340 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18341
18342 return adjust_automodify_address (pointer, GET_MODE (pointer),
18343 next, amount);
18344 }
18345
18346 /* Return a new RTX holding the result of moving POINTER forward by the
18347 size of the mode it points to. */
18348
18349 static rtx
18350 aarch64_progress_pointer (rtx pointer)
18351 {
18352 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18353 }
18354
18355 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18356 MODE bytes. */
18357
18358 static void
18359 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18360 machine_mode mode)
18361 {
18362 rtx reg = gen_reg_rtx (mode);
18363
18364 /* "Cast" the pointers to the correct mode. */
18365 *src = adjust_address (*src, mode, 0);
18366 *dst = adjust_address (*dst, mode, 0);
18367 /* Emit the memcpy. */
18368 emit_move_insn (reg, *src);
18369 emit_move_insn (*dst, reg);
18370 /* Move the pointers forward. */
18371 *src = aarch64_progress_pointer (*src);
18372 *dst = aarch64_progress_pointer (*dst);
18373 }
18374
18375 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18376 we succeed, otherwise return false. */
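/* For example, a 15-byte copy is expanded as two overlapping 8-byte
   copies, roughly (register numbers are purely illustrative; the
   expansion itself uses pseudo registers):

     ldr x2, [x1]
     str x2, [x0]
     ldr x3, [x1, 7]
     str x3, [x0, 7]  */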
18377
18378 bool
18379 aarch64_expand_cpymem (rtx *operands)
18380 {
18381 int n, mode_bits;
18382 rtx dst = operands[0];
18383 rtx src = operands[1];
18384 rtx base;
18385 machine_mode cur_mode = BLKmode, next_mode;
18386 bool speed_p = !optimize_function_for_size_p (cfun);
18387
18388 /* When optimizing for size, give a better estimate of the length of a
18389 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18390 will always require an even number of instructions, and each operation
18391 requires both a load and a store, so divide the maximum number by 2. */
18392 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18393
18394 /* We can't do anything smart if the amount to copy is not constant. */
18395 if (!CONST_INT_P (operands[2]))
18396 return false;
18397
18398 n = INTVAL (operands[2]);
18399
18400 /* Try to keep the number of instructions low. For all cases we will do at
18401 most two moves for the residual amount, since we'll always overlap the
18402 remainder. */
18403 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18404 return false;
18405
18406 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18407 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18408
18409 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18410 src = adjust_automodify_address (src, VOIDmode, base, 0);
18411
18412 /* Convert n to bits to make the rest of the code simpler. */
18413 n = n * BITS_PER_UNIT;
18414
18415 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18416 larger than TImode, but we should not use them for loads/stores here. */
18417 const int copy_limit = GET_MODE_BITSIZE (TImode);
18418
18419 while (n > 0)
18420 {
18421 /* Find the largest mode in which to do the copy without over-reading
18422 or over-writing. */
18423 opt_scalar_int_mode mode_iter;
18424 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18425 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18426 cur_mode = mode_iter.require ();
18427
18428 gcc_assert (cur_mode != BLKmode);
18429
18430 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18431 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18432
18433 n -= mode_bits;
18434
18435 /* Do certain trailing copies as overlapping if it's going to be
18436 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy
18437 it's more efficient to do two overlapping 8-byte copies than
18438 8 + 4 + 2 + 1. */
18439 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18440 {
18441 next_mode = smallest_mode_for_size (n, MODE_INT);
18442 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18443 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18444 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18445 n = n_bits;
18446 }
18447 }
18448
18449 return true;
18450 }
18451
18452 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18453 SImode stores. Handle the case when the constant has identical
18454 bottom and top halves. This is beneficial when the two stores can be
18455 merged into an STP and we avoid synthesising potentially expensive
18456 immediates twice. Return true if such a split is possible. */
18457
18458 bool
18459 aarch64_split_dimode_const_store (rtx dst, rtx src)
18460 {
18461 rtx lo = gen_lowpart (SImode, src);
18462 rtx hi = gen_highpart_mode (SImode, DImode, src);
18463
18464 bool size_p = optimize_function_for_size_p (cfun);
18465
18466 if (!rtx_equal_p (lo, hi))
18467 return false;
18468
18469 unsigned int orig_cost
18470 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18471 unsigned int lo_cost
18472 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18473
18474 /* We want to transform:
18475 MOV x1, 49370
18476 MOVK x1, 0x140, lsl 16
18477 MOVK x1, 0xc0da, lsl 32
18478 MOVK x1, 0x140, lsl 48
18479 STR x1, [x0]
18480 into:
18481 MOV w1, 49370
18482 MOVK w1, 0x140, lsl 16
18483 STP w1, w1, [x0]
18484 So we want to perform this only when we save two instructions
18485 or more. When optimizing for size, however, accept any code size
18486 savings we can. */
18487 if (size_p && orig_cost <= lo_cost)
18488 return false;
18489
18490 if (!size_p
18491 && (orig_cost <= lo_cost + 1))
18492 return false;
18493
18494 rtx mem_lo = adjust_address (dst, SImode, 0);
18495 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18496 return false;
18497
18498 rtx tmp_reg = gen_reg_rtx (SImode);
18499 aarch64_expand_mov_immediate (tmp_reg, lo);
18500 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18501 /* Don't emit an explicit store pair as this may not always be profitable.
18502 Let the sched-fusion logic decide whether to merge them. */
18503 emit_move_insn (mem_lo, tmp_reg);
18504 emit_move_insn (mem_hi, tmp_reg);
18505
18506 return true;
18507 }
18508
18509 /* Generate RTL for a conditional branch with rtx comparison CODE in
18510 mode CC_MODE. The destination of the unlikely conditional branch
18511 is LABEL_REF. */
18512
18513 void
18514 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18515 rtx label_ref)
18516 {
18517 rtx x;
18518 x = gen_rtx_fmt_ee (code, VOIDmode,
18519 gen_rtx_REG (cc_mode, CC_REGNUM),
18520 const0_rtx);
18521
18522 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18523 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18524 pc_rtx);
18525 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18526 }
18527
18528 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18529
18530 OP1 represents the TImode destination operand 1
18531 OP2 represents the TImode destination operand 2
18532 LOW_DEST represents the low half (DImode) of TImode operand 0
18533 LOW_IN1 represents the low half (DImode) of TImode operand 1
18534 LOW_IN2 represents the low half (DImode) of TImode operand 2
18535 HIGH_DEST represents the high half (DImode) of TImode operand 0
18536 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18537 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18538
18539 void
18540 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18541 rtx *low_in1, rtx *low_in2,
18542 rtx *high_dest, rtx *high_in1,
18543 rtx *high_in2)
18544 {
18545 *low_dest = gen_reg_rtx (DImode);
18546 *low_in1 = gen_lowpart (DImode, op1);
18547 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18548 subreg_lowpart_offset (DImode, TImode));
18549 *high_dest = gen_reg_rtx (DImode);
18550 *high_in1 = gen_highpart (DImode, op1);
18551 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18552 subreg_highpart_offset (DImode, TImode));
18553 }
18554
18555 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18556
18557 This function differs from 'aarch64_addti_scratch_regs' in that
18558 OP1 can be an immediate constant (zero). We must call
18559 subreg_highpart_offset with DImode and TImode arguments, otherwise
18560 VOIDmode will be used for the const_int and subreg_size_highpart_offset,
18561 which does not expect a size of zero, will report an internal error.
18562
18563 OP1 represents the TImode destination operand 1
18564 OP2 represents the TImode destination operand 2
18565 LOW_DEST represents the low half (DImode) of TImode operand 0
18566 LOW_IN1 represents the low half (DImode) of TImode operand 1
18567 LOW_IN2 represents the low half (DImode) of TImode operand 2
18568 HIGH_DEST represents the high half (DImode) of TImode operand 0
18569 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18570 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18571
18572
18573 void
18574 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18575 rtx *low_in1, rtx *low_in2,
18576 rtx *high_dest, rtx *high_in1,
18577 rtx *high_in2)
18578 {
18579 *low_dest = gen_reg_rtx (DImode);
18580 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18581 subreg_lowpart_offset (DImode, TImode));
18582
18583 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18584 subreg_lowpart_offset (DImode, TImode));
18585 *high_dest = gen_reg_rtx (DImode);
18586
18587 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18588 subreg_highpart_offset (DImode, TImode));
18589 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18590 subreg_highpart_offset (DImode, TImode));
18591 }
18592
18593 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18594
18595 OP0 represents the TImode destination operand 0
18596 LOW_DEST represents the low half (DImode) of TImode operand 0
18597 LOW_IN1 represents the low half (DImode) of TImode operand 1
18598 LOW_IN2 represents the low half (DImode) of TImode operand 2
18599 HIGH_DEST represents the high half (DImode) of TImode operand 0
18600 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18601 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18602 UNSIGNED_P is true if the operation is being performed on unsigned
18603 values. */
18604 void
18605 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18606 rtx low_in2, rtx high_dest, rtx high_in1,
18607 rtx high_in2, bool unsigned_p)
18608 {
18609 if (low_in2 == const0_rtx)
18610 {
18611 low_dest = low_in1;
18612 high_in2 = force_reg (DImode, high_in2);
18613 if (unsigned_p)
18614 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18615 else
18616 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18617 }
18618 else
18619 {
18620 if (CONST_INT_P (low_in2))
18621 {
18622 high_in2 = force_reg (DImode, high_in2);
18623 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18624 GEN_INT (-INTVAL (low_in2))));
18625 }
18626 else
18627 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18628
18629 if (unsigned_p)
18630 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18631 else
18632 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18633 }
18634
18635 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18636 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18637
18638 }
18639
18640 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
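/* AddressSanitizer computes the shadow address of a byte as
   (address >> 3) + offset, so the values below place the shadow memory
   at 1 << 29 for ILP32 and 1 << 36 for LP64.  */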
18641
18642 static unsigned HOST_WIDE_INT
18643 aarch64_asan_shadow_offset (void)
18644 {
18645 if (TARGET_ILP32)
18646 return (HOST_WIDE_INT_1 << 29);
18647 else
18648 return (HOST_WIDE_INT_1 << 36);
18649 }
18650
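/* Implement TARGET_GEN_CCMP_FIRST.  Generate the first comparison of a
   conditional-compare chain (e.g. the initial CMP or FCMP in a
   CMP + CCMP + branch sequence for a condition such as a < b && c == d).
   Return a comparison rtx of the CC register against zero, or NULL_RTX
   if the comparison cannot be handled.  */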
18651 static rtx
18652 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18653 int code, tree treeop0, tree treeop1)
18654 {
18655 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18656 rtx op0, op1;
18657 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18658 insn_code icode;
18659 struct expand_operand ops[4];
18660
18661 start_sequence ();
18662 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18663
18664 op_mode = GET_MODE (op0);
18665 if (op_mode == VOIDmode)
18666 op_mode = GET_MODE (op1);
18667
18668 switch (op_mode)
18669 {
18670 case E_QImode:
18671 case E_HImode:
18672 case E_SImode:
18673 cmp_mode = SImode;
18674 icode = CODE_FOR_cmpsi;
18675 break;
18676
18677 case E_DImode:
18678 cmp_mode = DImode;
18679 icode = CODE_FOR_cmpdi;
18680 break;
18681
18682 case E_SFmode:
18683 cmp_mode = SFmode;
18684 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18685 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18686 break;
18687
18688 case E_DFmode:
18689 cmp_mode = DFmode;
18690 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18691 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18692 break;
18693
18694 default:
18695 end_sequence ();
18696 return NULL_RTX;
18697 }
18698
18699 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18700 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18701 if (!op0 || !op1)
18702 {
18703 end_sequence ();
18704 return NULL_RTX;
18705 }
18706 *prep_seq = get_insns ();
18707 end_sequence ();
18708
18709 create_fixed_operand (&ops[0], op0);
18710 create_fixed_operand (&ops[1], op1);
18711
18712 start_sequence ();
18713 if (!maybe_expand_insn (icode, 2, ops))
18714 {
18715 end_sequence ();
18716 return NULL_RTX;
18717 }
18718 *gen_seq = get_insns ();
18719 end_sequence ();
18720
18721 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18722 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18723 }
18724
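/* Implement TARGET_GEN_CCMP_NEXT.  Generate a conditional compare
   (CCMP/FCCMP) that combines the result of PREV with the comparison
   CMP_CODE of TREEOP0 and TREEOP1; BIT_CODE says whether the two
   conditions are combined with AND or IOR.  Return the new comparison
   rtx, or NULL_RTX if the comparison cannot be handled.  */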
18725 static rtx
18726 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18727 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18728 {
18729 rtx op0, op1, target;
18730 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18731 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18732 insn_code icode;
18733 struct expand_operand ops[6];
18734 int aarch64_cond;
18735
18736 push_to_sequence (*prep_seq);
18737 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18738
18739 op_mode = GET_MODE (op0);
18740 if (op_mode == VOIDmode)
18741 op_mode = GET_MODE (op1);
18742
18743 switch (op_mode)
18744 {
18745 case E_QImode:
18746 case E_HImode:
18747 case E_SImode:
18748 cmp_mode = SImode;
18749 icode = CODE_FOR_ccmpsi;
18750 break;
18751
18752 case E_DImode:
18753 cmp_mode = DImode;
18754 icode = CODE_FOR_ccmpdi;
18755 break;
18756
18757 case E_SFmode:
18758 cmp_mode = SFmode;
18759 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18760 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18761 break;
18762
18763 case E_DFmode:
18764 cmp_mode = DFmode;
18765 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18766 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18767 break;
18768
18769 default:
18770 end_sequence ();
18771 return NULL_RTX;
18772 }
18773
18774 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18775 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18776 if (!op0 || !op1)
18777 {
18778 end_sequence ();
18779 return NULL_RTX;
18780 }
18781 *prep_seq = get_insns ();
18782 end_sequence ();
18783
18784 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18785 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18786
18787 if (bit_code != AND)
18788 {
18789 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18790 GET_MODE (XEXP (prev, 0))),
18791 VOIDmode, XEXP (prev, 0), const0_rtx);
18792 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18793 }
18794
18795 create_fixed_operand (&ops[0], XEXP (prev, 0));
18796 create_fixed_operand (&ops[1], target);
18797 create_fixed_operand (&ops[2], op0);
18798 create_fixed_operand (&ops[3], op1);
18799 create_fixed_operand (&ops[4], prev);
18800 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18801
18802 push_to_sequence (*gen_seq);
18803 if (!maybe_expand_insn (icode, 6, ops))
18804 {
18805 end_sequence ();
18806 return NULL_RTX;
18807 }
18808
18809 *gen_seq = get_insns ();
18810 end_sequence ();
18811
18812 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18813 }
18814
18815 #undef TARGET_GEN_CCMP_FIRST
18816 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18817
18818 #undef TARGET_GEN_CCMP_NEXT
18819 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18820
18821 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
18822 instruction fusion of some sort. */
18823
18824 static bool
18825 aarch64_macro_fusion_p (void)
18826 {
18827 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18828 }
18829
18830
18831 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18832 should be kept together during scheduling. */
18833
18834 static bool
18835 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18836 {
18837 rtx set_dest;
18838 rtx prev_set = single_set (prev);
18839 rtx curr_set = single_set (curr);
18840 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18841 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18842
18843 if (!aarch64_macro_fusion_p ())
18844 return false;
18845
18846 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18847 {
18848 /* We are trying to match:
18849 prev (mov) == (set (reg r0) (const_int imm16))
18850 curr (movk) == (set (zero_extract (reg r0)
18851 (const_int 16)
18852 (const_int 16))
18853 (const_int imm16_1)) */
18854
18855 set_dest = SET_DEST (curr_set);
18856
18857 if (GET_CODE (set_dest) == ZERO_EXTRACT
18858 && CONST_INT_P (SET_SRC (curr_set))
18859 && CONST_INT_P (SET_SRC (prev_set))
18860 && CONST_INT_P (XEXP (set_dest, 2))
18861 && INTVAL (XEXP (set_dest, 2)) == 16
18862 && REG_P (XEXP (set_dest, 0))
18863 && REG_P (SET_DEST (prev_set))
18864 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18865 {
18866 return true;
18867 }
18868 }
18869
18870 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18871 {
18872
18873 /* We're trying to match:
18874 prev (adrp) == (set (reg r1)
18875 (high (symbol_ref ("SYM"))))
18876 curr (add) == (set (reg r0)
18877 (lo_sum (reg r1)
18878 (symbol_ref ("SYM"))))
18879 Note that r0 need not necessarily be the same as r1, especially
18880 during pre-regalloc scheduling. */
18881
18882 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18883 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18884 {
18885 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18886 && REG_P (XEXP (SET_SRC (curr_set), 0))
18887 && REGNO (XEXP (SET_SRC (curr_set), 0))
18888 == REGNO (SET_DEST (prev_set))
18889 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18890 XEXP (SET_SRC (curr_set), 1)))
18891 return true;
18892 }
18893 }
18894
18895 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18896 {
18897
18898 /* We're trying to match:
18899 prev (movk) == (set (zero_extract (reg r0)
18900 (const_int 16)
18901 (const_int 32))
18902 (const_int imm16_1))
18903 curr (movk) == (set (zero_extract (reg r0)
18904 (const_int 16)
18905 (const_int 48))
18906 (const_int imm16_2)) */
18907
18908 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18909 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18910 && REG_P (XEXP (SET_DEST (prev_set), 0))
18911 && REG_P (XEXP (SET_DEST (curr_set), 0))
18912 && REGNO (XEXP (SET_DEST (prev_set), 0))
18913 == REGNO (XEXP (SET_DEST (curr_set), 0))
18914 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18915 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18916 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18917 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18918 && CONST_INT_P (SET_SRC (prev_set))
18919 && CONST_INT_P (SET_SRC (curr_set)))
18920 return true;
18921
18922 }
18923 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18924 {
18925 /* We're trying to match:
18926 prev (adrp) == (set (reg r0)
18927 (high (symbol_ref ("SYM"))))
18928 curr (ldr) == (set (reg r1)
18929 (mem (lo_sum (reg r0)
18930 (symbol_ref ("SYM")))))
18931 or
18932 curr (ldr) == (set (reg r1)
18933 (zero_extend (mem
18934 (lo_sum (reg r0)
18935 (symbol_ref ("SYM")))))) */
18936 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18937 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18938 {
18939 rtx curr_src = SET_SRC (curr_set);
18940
18941 if (GET_CODE (curr_src) == ZERO_EXTEND)
18942 curr_src = XEXP (curr_src, 0);
18943
18944 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18945 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18946 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18947 == REGNO (SET_DEST (prev_set))
18948 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18949 XEXP (SET_SRC (prev_set), 0)))
18950 return true;
18951 }
18952 }
18953
18954 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18955 && any_condjump_p (curr))
18956 {
18957 unsigned int condreg1, condreg2;
18958 rtx cc_reg_1;
18959 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18960 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18961
18962 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18963 && prev
18964 && modified_in_p (cc_reg_1, prev))
18965 {
18966 enum attr_type prev_type = get_attr_type (prev);
18967
18968 /* FIXME: this misses some instructions which are considered simple
18969 arithmetic on ThunderX. Simple shifts are missed here. */
18970 if (prev_type == TYPE_ALUS_SREG
18971 || prev_type == TYPE_ALUS_IMM
18972 || prev_type == TYPE_LOGICS_REG
18973 || prev_type == TYPE_LOGICS_IMM)
18974 return true;
18975 }
18976 }
18977
18978 if (prev_set
18979 && curr_set
18980 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18981 && any_condjump_p (curr))
18982 {
18983 /* We're trying to match:
18984 prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
18985 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18986 (const_int 0))
18987 (label_ref ("SYM"))
18988 (pc)) */
18989 if (SET_DEST (curr_set) == (pc_rtx)
18990 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18991 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18992 && REG_P (SET_DEST (prev_set))
18993 && REGNO (SET_DEST (prev_set))
18994 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18995 {
18996 /* Fuse ALU operations followed by a conditional branch instruction. */
18997 switch (get_attr_type (prev))
18998 {
18999 case TYPE_ALU_IMM:
19000 case TYPE_ALU_SREG:
19001 case TYPE_ADC_REG:
19002 case TYPE_ADC_IMM:
19003 case TYPE_ADCS_REG:
19004 case TYPE_ADCS_IMM:
19005 case TYPE_LOGIC_REG:
19006 case TYPE_LOGIC_IMM:
19007 case TYPE_CSEL:
19008 case TYPE_ADR:
19009 case TYPE_MOV_IMM:
19010 case TYPE_SHIFT_REG:
19011 case TYPE_SHIFT_IMM:
19012 case TYPE_BFM:
19013 case TYPE_RBIT:
19014 case TYPE_REV:
19015 case TYPE_EXTEND:
19016 return true;
19017
19018 default:;
19019 }
19020 }
19021 }
19022
19023 return false;
19024 }
19025
19026 /* Return true iff the instruction fusion described by OP is enabled. */
19027
19028 bool
19029 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19030 {
19031 return (aarch64_tune_params.fusible_ops & op) != 0;
19032 }
19033
19034 /* If MEM is in the form of [base+offset], extract the two parts
19035 of the address and store them in BASE and OFFSET; otherwise return
19036 false after clearing BASE and OFFSET. */
19037
19038 bool
19039 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19040 {
19041 rtx addr;
19042
19043 gcc_assert (MEM_P (mem));
19044
19045 addr = XEXP (mem, 0);
19046
19047 if (REG_P (addr))
19048 {
19049 *base = addr;
19050 *offset = const0_rtx;
19051 return true;
19052 }
19053
19054 if (GET_CODE (addr) == PLUS
19055 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19056 {
19057 *base = XEXP (addr, 0);
19058 *offset = XEXP (addr, 1);
19059 return true;
19060 }
19061
19062 *base = NULL_RTX;
19063 *offset = NULL_RTX;
19064
19065 return false;
19066 }
19067
19068 /* Types for scheduling fusion. */
19069 enum sched_fusion_type
19070 {
19071 SCHED_FUSION_NONE = 0,
19072 SCHED_FUSION_LD_SIGN_EXTEND,
19073 SCHED_FUSION_LD_ZERO_EXTEND,
19074 SCHED_FUSION_LD,
19075 SCHED_FUSION_ST,
19076 SCHED_FUSION_NUM
19077 };
19078
19079 /* If INSN is a load or store whose address is in the form [base+offset],
19080 extract the two parts and store them in BASE and OFFSET. Return the
19081 scheduling fusion type of INSN. */
19082
19083 static enum sched_fusion_type
19084 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19085 {
19086 rtx x, dest, src;
19087 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19088
19089 gcc_assert (INSN_P (insn));
19090 x = PATTERN (insn);
19091 if (GET_CODE (x) != SET)
19092 return SCHED_FUSION_NONE;
19093
19094 src = SET_SRC (x);
19095 dest = SET_DEST (x);
19096
19097 machine_mode dest_mode = GET_MODE (dest);
19098
19099 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19100 return SCHED_FUSION_NONE;
19101
19102 if (GET_CODE (src) == SIGN_EXTEND)
19103 {
19104 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19105 src = XEXP (src, 0);
19106 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19107 return SCHED_FUSION_NONE;
19108 }
19109 else if (GET_CODE (src) == ZERO_EXTEND)
19110 {
19111 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19112 src = XEXP (src, 0);
19113 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19114 return SCHED_FUSION_NONE;
19115 }
19116
19117 if (GET_CODE (src) == MEM && REG_P (dest))
19118 extract_base_offset_in_addr (src, base, offset);
19119 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19120 {
19121 fusion = SCHED_FUSION_ST;
19122 extract_base_offset_in_addr (dest, base, offset);
19123 }
19124 else
19125 return SCHED_FUSION_NONE;
19126
19127 if (*base == NULL_RTX || *offset == NULL_RTX)
19128 fusion = SCHED_FUSION_NONE;
19129
19130 return fusion;
19131 }
19132
19133 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19134
19135 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19136 and PRI are only calculated for these instructions. For other instructions,
19137 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19138 types of instruction fusion can be added by returning different priorities.
19139
19140 It's important that irrelevant instructions get the largest FUSION_PRI. */
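/* For example, two SImode loads from [x1, 4] and [x1, 8] get the same
   FUSION_PRI (same fusion type and base register) but different PRI
   values, so the scheduler keeps them together and orders them by
   offset, making them candidates for an LDP.  */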
19141
19142 static void
19143 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19144 int *fusion_pri, int *pri)
19145 {
19146 int tmp, off_val;
19147 rtx base, offset;
19148 enum sched_fusion_type fusion;
19149
19150 gcc_assert (INSN_P (insn));
19151
19152 tmp = max_pri - 1;
19153 fusion = fusion_load_store (insn, &base, &offset);
19154 if (fusion == SCHED_FUSION_NONE)
19155 {
19156 *pri = tmp;
19157 *fusion_pri = tmp;
19158 return;
19159 }
19160
19161 /* Set FUSION_PRI according to fusion type and base register. */
19162 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19163
19164 /* Calculate PRI. */
19165 tmp /= 2;
19166
19167 /* INSN with smaller offset goes first. */
19168 off_val = (int)(INTVAL (offset));
19169 if (off_val >= 0)
19170 tmp -= (off_val & 0xfffff);
19171 else
19172 tmp += ((- off_val) & 0xfffff);
19173
19174 *pri = tmp;
19175 return;
19176 }
19177
19178 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19179 Adjust priority of sha1h instructions so they are scheduled before
19180 other SHA1 instructions. */
19181
19182 static int
19183 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19184 {
19185 rtx x = PATTERN (insn);
19186
19187 if (GET_CODE (x) == SET)
19188 {
19189 x = SET_SRC (x);
19190
19191 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19192 return priority + 10;
19193 }
19194
19195 return priority;
19196 }
19197
19198 /* Given OPERANDS of consecutive load/store, check if we can merge
19199 them into ldp/stp. LOAD is true if they are load instructions.
19200 MODE is the mode of memory operands. */
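/* For example, the consecutive loads

     ldr w0, [x2]
     ldr w1, [x2, 4]

   can be merged into a single ldp w0, w1, [x2].  */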
19201
19202 bool
19203 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19204 machine_mode mode)
19205 {
19206 HOST_WIDE_INT offval_1, offval_2, msize;
19207 enum reg_class rclass_1, rclass_2;
19208 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19209
19210 if (load)
19211 {
19212 mem_1 = operands[1];
19213 mem_2 = operands[3];
19214 reg_1 = operands[0];
19215 reg_2 = operands[2];
19216 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19217 if (REGNO (reg_1) == REGNO (reg_2))
19218 return false;
19219 }
19220 else
19221 {
19222 mem_1 = operands[0];
19223 mem_2 = operands[2];
19224 reg_1 = operands[1];
19225 reg_2 = operands[3];
19226 }
19227
19228 /* The mems cannot be volatile. */
19229 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19230 return false;
19231
19232 /* If we have SImode and slow unaligned ldp,
19233 check that the alignment is at least 8 bytes. */
19234 if (mode == SImode
19235 && (aarch64_tune_params.extra_tuning_flags
19236 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19237 && !optimize_size
19238 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19239 return false;
19240
19241 /* Check if the addresses are in the form of [base+offset]. */
19242 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19243 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19244 return false;
19245 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19246 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19247 return false;
19248
19249 /* Check if the bases are the same. */
19250 if (!rtx_equal_p (base_1, base_2))
19251 return false;
19252
19253 /* The operands must be of the same size. */
19254 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19255 GET_MODE_SIZE (GET_MODE (mem_2))));
19256
19257 offval_1 = INTVAL (offset_1);
19258 offval_2 = INTVAL (offset_2);
19259 /* We should only be trying this for fixed-sized modes. There is no
19260 SVE LDP/STP instruction. */
19261 msize = GET_MODE_SIZE (mode).to_constant ();
19262 /* Check if the offsets are consecutive. */
19263 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19264 return false;
19265
19266 /* Check if the addresses are clobbered by the load. */
19267 if (load)
19268 {
19269 if (reg_mentioned_p (reg_1, mem_1))
19270 return false;
19271
19272 /* In increasing order, the last load can clobber the address. */
19273 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19274 return false;
19275 }
19276
19277 /* One of the memory accesses must be a mempair operand.
19278 If it is not the first one, they need to be swapped by the
19279 peephole. */
19280 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19281 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19282 return false;
19283
19284 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19285 rclass_1 = FP_REGS;
19286 else
19287 rclass_1 = GENERAL_REGS;
19288
19289 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19290 rclass_2 = FP_REGS;
19291 else
19292 rclass_2 = GENERAL_REGS;
19293
19294 /* Check if the registers are of the same class. */
19295 if (rclass_1 != rclass_2)
19296 return false;
19297
19298 return true;
19299 }
19300
19301 /* Given OPERANDS of consecutive load/store that can be merged,
19302 swap them if they are not in ascending order. */
19303 void
19304 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19305 {
19306 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19307 HOST_WIDE_INT offval_1, offval_2;
19308
19309 if (load)
19310 {
19311 mem_1 = operands[1];
19312 mem_2 = operands[3];
19313 }
19314 else
19315 {
19316 mem_1 = operands[0];
19317 mem_2 = operands[2];
19318 }
19319
19320 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19321 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19322
19323 offval_1 = INTVAL (offset_1);
19324 offval_2 = INTVAL (offset_2);
19325
19326 if (offval_1 > offval_2)
19327 {
19328 /* Irrespective of whether this is a load or a store,
19329 we do the same swap. */
19330 std::swap (operands[0], operands[2]);
19331 std::swap (operands[1], operands[3]);
19332 }
19333 }
19334
19335 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19336 comparison between the two. */
19337 int
19338 aarch64_host_wide_int_compare (const void *x, const void *y)
19339 {
19340 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19341 * ((const HOST_WIDE_INT *) y));
19342 }
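
/* Minimal usage sketch, mirroring the qsort call further down in
   aarch64_operands_adjust_ok_for_ldpstp:

     HOST_WIDE_INT offs[4] = { 12, 4, 8, 0 };
     qsort (offs, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);

   after which OFFS holds { 0, 4, 8, 12 }.  */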
19343
19344 /* Taking X and Y to be pairs of RTX, where each pair consists of a MEM
19345 rtx and a REG rtx, extract the offset from each MEM address and
19346 compare the two offsets.
19347
19348 Return:
19349
19350 1 iff offset (X) > offset (Y)
19351 0 iff offset (X) == offset (Y)
19352 -1 iff offset (X) < offset (Y) */
19353 int
19354 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19355 {
19356 const rtx * operands_1 = (const rtx *) x;
19357 const rtx * operands_2 = (const rtx *) y;
19358 rtx mem_1, mem_2, base, offset_1, offset_2;
19359
19360 if (MEM_P (operands_1[0]))
19361 mem_1 = operands_1[0];
19362 else
19363 mem_1 = operands_1[1];
19364
19365 if (MEM_P (operands_2[0]))
19366 mem_2 = operands_2[0];
19367 else
19368 mem_2 = operands_2[1];
19369
19370 /* Extract the offsets. */
19371 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19372 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19373
19374 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19375
19376 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19377 }
19378
19379 /* Given OPERANDS of consecutive load/store, check if we can merge
19380 them into ldp/stp by adjusting the offset. LOAD is true if they
19381 are load instructions. MODE is the mode of memory operands.
19382
19383 Given the following consecutive stores:
19384
19385 str w1, [xb, 0x100]
19386 str w1, [xb, 0x104]
19387 str w1, [xb, 0x108]
19388 str w1, [xb, 0x10c]
19389
19390 Though the offsets are out of the range supported by stp, we can
19391 still pair them after adjusting the offset, like:
19392
19393 add scratch, xb, 0x100
19394 stp w1, w1, [scratch]
19395 stp w1, w1, [scratch, 0x8]
19396
19397 The peephole patterns detecting this opportunity should guarantee
19398 the scratch register is available. */
19399
19400 bool
19401 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19402 scalar_mode mode)
19403 {
19404 const int num_insns = 4;
19405 enum reg_class rclass;
19406 HOST_WIDE_INT offvals[num_insns], msize;
19407 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19408
19409 if (load)
19410 {
19411 for (int i = 0; i < num_insns; i++)
19412 {
19413 reg[i] = operands[2 * i];
19414 mem[i] = operands[2 * i + 1];
19415
19416 gcc_assert (REG_P (reg[i]));
19417 }
19418
19419 /* Do not attempt to merge the loads if the loads clobber each other. */
19420 for (int i = 0; i < 8; i += 2)
19421 for (int j = i + 2; j < 8; j += 2)
19422 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19423 return false;
19424 }
19425 else
19426 for (int i = 0; i < num_insns; i++)
19427 {
19428 mem[i] = operands[2 * i];
19429 reg[i] = operands[2 * i + 1];
19430 }
19431
19432 /* Skip if memory operand is by itself valid for ldp/stp. */
19433 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19434 return false;
19435
19436 for (int i = 0; i < num_insns; i++)
19437 {
19438 /* The mems cannot be volatile. */
19439 if (MEM_VOLATILE_P (mem[i]))
19440 return false;
19441
19442 /* Check if the addresses are in the form of [base+offset]. */
19443 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19444 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19445 return false;
19446 }
19447
19448 /* Check if the registers are of the same class. */
19449 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19450 ? FP_REGS : GENERAL_REGS;
19451
19452 for (int i = 1; i < num_insns; i++)
19453 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19454 {
19455 if (rclass != FP_REGS)
19456 return false;
19457 }
19458 else
19459 {
19460 if (rclass != GENERAL_REGS)
19461 return false;
19462 }
19463
19464 /* Only the last register in the order in which they occur
19465 may be clobbered by the load. */
19466 if (rclass == GENERAL_REGS && load)
19467 for (int i = 0; i < num_insns - 1; i++)
19468 if (reg_mentioned_p (reg[i], mem[i]))
19469 return false;
19470
19471 /* Check if the bases are the same. */
19472 for (int i = 0; i < num_insns - 1; i++)
19473 if (!rtx_equal_p (base[i], base[i + 1]))
19474 return false;
19475
19476 for (int i = 0; i < num_insns; i++)
19477 offvals[i] = INTVAL (offset[i]);
19478
19479 msize = GET_MODE_SIZE (mode);
19480
19481 /* Check if the offsets can be put in the right order to do an ldp/stp. */
19482 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19483 aarch64_host_wide_int_compare);
19484
19485 if (!(offvals[1] == offvals[0] + msize
19486 && offvals[3] == offvals[2] + msize))
19487 return false;
19488
19489 /* Check that the offsets are within range of each other. The ldp/stp
19490 instructions have 7-bit immediate offsets, so use 0x80. */
19491 if (offvals[2] - offvals[0] >= msize * 0x80)
19492 return false;
19493
19494 /* The offsets must be aligned with respect to each other. */
19495 if (offvals[0] % msize != offvals[2] % msize)
19496 return false;
19497
19498 /* If we have SImode and slow unaligned ldp,
19499 check that the alignment is at least 8 bytes. */
19500 if (mode == SImode
19501 && (aarch64_tune_params.extra_tuning_flags
19502 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19503 && !optimize_size
19504 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19505 return false;
19506
19507 return true;
19508 }
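
/* Worked example for the checks above, using the four SImode stores from
   the comment before this function (offsets 0x100, 0x104, 0x108, 0x10c
   from base xb): [xb, 0x100] is just outside the signed 7-bit scaled
   range for SImode (maximum +252), so the early bail-out does not
   trigger; the sorted offsets satisfy offvals[1] == offvals[0] + 4 and
   offvals[3] == offvals[2] + 4; the spread offvals[2] - offvals[0] == 8
   is below 4 * 0x80; and 0x100 and 0x108 agree modulo the access size.
   The function therefore returns true.  */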
19509
19510 /* Given OPERANDS of consecutive load/store, this function pairs them
19511 into LDP/STP after adjusting the offset. It depends on the fact
19512 that the operands can be sorted so the offsets are correct for STP.
19513 MODE is the mode of memory operands. CODE is the rtl operator
19514 which should be applied to all memory operands; it is SIGN_EXTEND,
19515 ZERO_EXTEND or UNKNOWN. */
19516
19517 bool
19518 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19519 scalar_mode mode, RTX_CODE code)
19520 {
19521 rtx base, offset_1, offset_3, t1, t2;
19522 rtx mem_1, mem_2, mem_3, mem_4;
19523 rtx temp_operands[8];
19524 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19525 stp_off_upper_limit, stp_off_lower_limit, msize;
19526
19527 /* We make changes on a copy as we may still bail out. */
19528 for (int i = 0; i < 8; i ++)
19529 temp_operands[i] = operands[i];
19530
19531 /* Sort the operands. */
19532 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19533
19534 /* Copy the memory operands so that if we have to bail for some
19535 reason the original addresses are unchanged. */
19536 if (load)
19537 {
19538 mem_1 = copy_rtx (temp_operands[1]);
19539 mem_2 = copy_rtx (temp_operands[3]);
19540 mem_3 = copy_rtx (temp_operands[5]);
19541 mem_4 = copy_rtx (temp_operands[7]);
19542 }
19543 else
19544 {
19545 mem_1 = copy_rtx (temp_operands[0]);
19546 mem_2 = copy_rtx (temp_operands[2]);
19547 mem_3 = copy_rtx (temp_operands[4]);
19548 mem_4 = copy_rtx (temp_operands[6]);
19549 gcc_assert (code == UNKNOWN);
19550 }
19551
19552 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19553 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19554 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19555 && offset_3 != NULL_RTX);
19556
19557 /* Adjust the offset so it fits in an LDP/STP instruction. */
19558 msize = GET_MODE_SIZE (mode);
19559 stp_off_upper_limit = msize * (0x40 - 1);
19560 stp_off_lower_limit = - msize * 0x40;
19561
19562 off_val_1 = INTVAL (offset_1);
19563 off_val_3 = INTVAL (offset_3);
19564
19565 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19566 if (msize <= 4)
19567 base_off = (off_val_1 + off_val_3) / 2;
19568 else
19569 /* However, due to issues with negative LDP/STP offset generation for
19570 larger modes (DF, DI and vector modes), we must not use negative
19571 addresses beyond what 9 signed unadjusted bits can store. Using
19572 the first offset as the base provides the most range in this case. */
19573 base_off = off_val_1;
19574
19575 /* Adjust the base so that it is aligned with the addresses but still
19576 optimal. */
19577 if (base_off % msize != off_val_1 % msize)
19578 /* Fix the offset, bearing in mind we want to make it bigger not
19579 smaller. */
19580 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19581 else if (msize <= 4)
19582 /* The negative range of LDP/STP is one larger than the positive range. */
19583 base_off += msize;
19584
19585 /* Check if base offset is too big or too small. We can attempt to resolve
19586 this issue by setting it to the maximum value and seeing if the offsets
19587 still fit. */
19588 if (base_off >= 0x1000)
19589 {
19590 base_off = 0x1000 - 1;
19591 /* We must still make sure that the base offset is aligned with respect
19592 to the address. But it may not be made any bigger. */
19593 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19594 }
19595
19596 /* Likewise for the case where the base is too small. */
19597 if (base_off <= -0x1000)
19598 {
19599 base_off = -0x1000 + 1;
19600 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19601 }
19602
19603 /* Offset of the first STP/LDP. */
19604 new_off_1 = off_val_1 - base_off;
19605
19606 /* Offset of the second STP/LDP. */
19607 new_off_3 = off_val_3 - base_off;
19608
19609 /* The offsets must be within the range of the LDP/STP instructions. */
19610 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19611 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19612 return false;
19613
19614 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19615 new_off_1), true);
19616 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19617 new_off_1 + msize), true);
19618 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19619 new_off_3), true);
19620 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19621 new_off_3 + msize), true);
19622
19623 if (!aarch64_mem_pair_operand (mem_1, mode)
19624 || !aarch64_mem_pair_operand (mem_3, mode))
19625 return false;
19626
19627 if (code == ZERO_EXTEND)
19628 {
19629 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19630 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19631 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19632 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19633 }
19634 else if (code == SIGN_EXTEND)
19635 {
19636 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19637 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19638 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19639 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19640 }
19641
19642 if (load)
19643 {
19644 operands[0] = temp_operands[0];
19645 operands[1] = mem_1;
19646 operands[2] = temp_operands[2];
19647 operands[3] = mem_2;
19648 operands[4] = temp_operands[4];
19649 operands[5] = mem_3;
19650 operands[6] = temp_operands[6];
19651 operands[7] = mem_4;
19652 }
19653 else
19654 {
19655 operands[0] = mem_1;
19656 operands[1] = temp_operands[1];
19657 operands[2] = mem_2;
19658 operands[3] = temp_operands[3];
19659 operands[4] = mem_3;
19660 operands[5] = temp_operands[5];
19661 operands[6] = mem_4;
19662 operands[7] = temp_operands[7];
19663 }
19664
19665 /* Emit adjusting instruction. */
19666 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19667 /* Emit ldp/stp instructions. */
19668 t1 = gen_rtx_SET (operands[0], operands[1]);
19669 t2 = gen_rtx_SET (operands[2], operands[3]);
19670 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19671 t1 = gen_rtx_SET (operands[4], operands[5]);
19672 t2 = gen_rtx_SET (operands[6], operands[7]);
19673 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19674 return true;
19675 }
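
/* Worked example of the offset adjustment above, continuing the SImode
   stores at 0x100, 0x104, 0x108 and 0x10c (scratch register name is
   arbitrary): off_val_1 = 0x100, off_val_3 = 0x108 and msize = 4, so
   base_off starts at (0x100 + 0x108) / 2 = 0x104, is already correctly
   aligned and is then biased by msize to 0x108.  This gives
   new_off_1 = -8 and new_off_3 = 0, both within [-256, 252], so the
   emitted sequence is roughly

     add x9, xb, 0x108
     stp w1, w1, [x9, -8]
     stp w1, w1, [x9]

   i.e. the scratch base is biased slightly higher than the xb + 0x100
   shown in the introductory comment, exploiting the extra negative
   range of the LDP/STP immediate.  */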
19676
19677 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19678 it isn't worth branching around empty masked ops (including masked
19679 stores). */
19680
19681 static bool
19682 aarch64_empty_mask_is_expensive (unsigned)
19683 {
19684 return false;
19685 }
19686
19687 /* Return true if a pseudo register should be created and used to hold
19688 the GOT address for PIC code. */
19689
19690 bool
19691 aarch64_use_pseudo_pic_reg (void)
19692 {
19693 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19694 }
19695
19696 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19697
19698 static int
19699 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19700 {
19701 switch (XINT (x, 1))
19702 {
19703 case UNSPEC_GOTSMALLPIC:
19704 case UNSPEC_GOTSMALLPIC28K:
19705 case UNSPEC_GOTTINYPIC:
19706 return 0;
19707 default:
19708 break;
19709 }
19710
19711 return default_unspec_may_trap_p (x, flags);
19712 }
19713
19714
19715 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19716 return the log2 of that value. Otherwise return -1. */
19717
19718 int
19719 aarch64_fpconst_pow_of_2 (rtx x)
19720 {
19721 const REAL_VALUE_TYPE *r;
19722
19723 if (!CONST_DOUBLE_P (x))
19724 return -1;
19725
19726 r = CONST_DOUBLE_REAL_VALUE (x);
19727
19728 if (REAL_VALUE_NEGATIVE (*r)
19729 || REAL_VALUE_ISNAN (*r)
19730 || REAL_VALUE_ISINF (*r)
19731 || !real_isinteger (r, DFmode))
19732 return -1;
19733
19734 return exact_log2 (real_to_integer (r));
19735 }
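
/* For illustration: a CONST_DOUBLE of 8.0 yields 3, 1.0 yields 0,
   6.0 yields -1 (an integer but not a power of 2), 0.75 yields -1
   (not an integer) and -4.0 yields -1 (negative).  */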
19736
19737 /* If X is a vector of equal CONST_DOUBLE values and that value is
19738 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19739
19740 int
19741 aarch64_vec_fpconst_pow_of_2 (rtx x)
19742 {
19743 int nelts;
19744 if (GET_CODE (x) != CONST_VECTOR
19745 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19746 return -1;
19747
19748 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19749 return -1;
19750
19751 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19752 if (firstval <= 0)
19753 return -1;
19754
19755 for (int i = 1; i < nelts; i++)
19756 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19757 return -1;
19758
19759 return firstval;
19760 }
19761
19762 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19763 to float.
19764
19765 __fp16 always promotes through this hook.
19766 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19767 through the generic excess precision logic rather than here. */
19768
19769 static tree
19770 aarch64_promoted_type (const_tree t)
19771 {
19772 if (SCALAR_FLOAT_TYPE_P (t)
19773 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19774 return float_type_node;
19775
19776 return NULL_TREE;
19777 }
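
/* Source-level illustration (not compiled here): with this hook in
   effect, arithmetic on the ACLE __fp16 type such as

     __fp16 a, b;
     __fp16 add_fp16 (void) { return a + b; }

   converts A and B to float, adds them in single precision and narrows
   the result back to __fp16, while _Float16 is instead handled by the
   excess-precision logic below.  */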
19778
19779 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19780
19781 static bool
19782 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19783 optimization_type opt_type)
19784 {
19785 switch (op)
19786 {
19787 case rsqrt_optab:
19788 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19789
19790 default:
19791 return true;
19792 }
19793 }
19794
19795 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19796
19797 static unsigned int
19798 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19799 int *offset)
19800 {
19801 /* Polynomial invariant 1 == (VG / 2) - 1. */
19802 gcc_assert (i == 1);
19803 *factor = 2;
19804 *offset = 1;
19805 return AARCH64_DWARF_VG;
19806 }
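
/* For example, the poly_int64 16 + 16x (the byte size of a VNx16QI
   vector) is expressed in DWARF as 16 + 16 * (VG / 2 - 1).  For a
   256-bit vector length VG is 4, so the expression evaluates to
   32 bytes, as expected.  */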
19807
19808 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19809 if MODE is HFmode, and punt to the generic implementation otherwise. */
19810
19811 static bool
19812 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19813 {
19814 return (mode == HFmode
19815 ? true
19816 : default_libgcc_floating_mode_supported_p (mode));
19817 }
19818
19819 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19820 if MODE is HFmode, and punt to the generic implementation otherwise. */
19821
19822 static bool
19823 aarch64_scalar_mode_supported_p (scalar_mode mode)
19824 {
19825 return (mode == HFmode
19826 ? true
19827 : default_scalar_mode_supported_p (mode));
19828 }
19829
19830 /* Set the value of FLT_EVAL_METHOD.
19831 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19832
19833 0: evaluate all operations and constants, whose semantic type has at
19834 most the range and precision of type float, to the range and
19835 precision of float; evaluate all other operations and constants to
19836 the range and precision of the semantic type;
19837
19838 N, where _FloatN is a supported interchange floating type:
19839 evaluate all operations and constants, whose semantic type has at
19840 most the range and precision of _FloatN type, to the range and
19841 precision of the _FloatN type; evaluate all other operations and
19842 constants to the range and precision of the semantic type;
19843
19844 If we have the ARMv8.2-A extensions then we support _Float16 in native
19845 precision, so we should set this to 16. Otherwise, we support the type,
19846 but want to evaluate expressions in float precision, so set this to
19847 0. */
19848
19849 static enum flt_eval_method
19850 aarch64_excess_precision (enum excess_precision_type type)
19851 {
19852 switch (type)
19853 {
19854 case EXCESS_PRECISION_TYPE_FAST:
19855 case EXCESS_PRECISION_TYPE_STANDARD:
19856 /* We can calculate either in 16-bit range and precision or
19857 32-bit range and precision. Make that decision based on whether
19858 we have native support for the ARMv8.2-A 16-bit floating-point
19859 instructions or not. */
19860 return (TARGET_FP_F16INST
19861 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19862 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19863 case EXCESS_PRECISION_TYPE_IMPLICIT:
19864 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19865 default:
19866 gcc_unreachable ();
19867 }
19868 return FLT_EVAL_METHOD_UNPREDICTABLE;
19869 }
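
/* Source-level illustration (not compiled here): given

     _Float16 a, b, c;
     _Float16 fma16 (void) { return a * b + c; }

   a target with the ARMv8.2-A FP16 instructions reports
   FLT_EVAL_METHOD 16 and may evaluate the whole expression in half
   precision, whereas a target without them reports 0, widens A, B and
   C to float, computes in single precision and narrows the result.  */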
19870
19871 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19872 scheduled for speculative execution. Reject the long-running division
19873 and square-root instructions. */
19874
19875 static bool
19876 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19877 {
19878 switch (get_attr_type (insn))
19879 {
19880 case TYPE_SDIV:
19881 case TYPE_UDIV:
19882 case TYPE_FDIVS:
19883 case TYPE_FDIVD:
19884 case TYPE_FSQRTS:
19885 case TYPE_FSQRTD:
19886 case TYPE_NEON_FP_SQRT_S:
19887 case TYPE_NEON_FP_SQRT_D:
19888 case TYPE_NEON_FP_SQRT_S_Q:
19889 case TYPE_NEON_FP_SQRT_D_Q:
19890 case TYPE_NEON_FP_DIV_S:
19891 case TYPE_NEON_FP_DIV_D:
19892 case TYPE_NEON_FP_DIV_S_Q:
19893 case TYPE_NEON_FP_DIV_D_Q:
19894 return false;
19895 default:
19896 return true;
19897 }
19898 }
19899
19900 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19901
19902 static int
19903 aarch64_compute_pressure_classes (reg_class *classes)
19904 {
19905 int i = 0;
19906 classes[i++] = GENERAL_REGS;
19907 classes[i++] = FP_REGS;
19908 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19909 registers need to go in PR_LO_REGS at some point during their
19910 lifetime. Splitting it into two halves has the effect of making
19911 all predicates count against PR_LO_REGS, so that we try whenever
19912 possible to restrict the number of live predicates to 8. This
19913 greatly reduces the amount of spilling in certain loops. */
19914 classes[i++] = PR_LO_REGS;
19915 classes[i++] = PR_HI_REGS;
19916 return i;
19917 }
19918
19919 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19920
19921 static bool
19922 aarch64_can_change_mode_class (machine_mode from,
19923 machine_mode to, reg_class_t)
19924 {
19925 if (BYTES_BIG_ENDIAN)
19926 {
19927 bool from_sve_p = aarch64_sve_data_mode_p (from);
19928 bool to_sve_p = aarch64_sve_data_mode_p (to);
19929
19930 /* Don't allow changes between SVE data modes and non-SVE modes.
19931 See the comment at the head of aarch64-sve.md for details. */
19932 if (from_sve_p != to_sve_p)
19933 return false;
19934
19935 /* Don't allow changes in element size: lane 0 of the new vector
19936 would not then be lane 0 of the old vector. See the comment
19937 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19938 description.
19939
19940 In the worst case, this forces a register to be spilled in
19941 one mode and reloaded in the other, which handles the
19942 endianness correctly. */
19943 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19944 return false;
19945 }
19946 return true;
19947 }
19948
19949 /* Implement TARGET_EARLY_REMAT_MODES. */
19950
19951 static void
19952 aarch64_select_early_remat_modes (sbitmap modes)
19953 {
19954 /* SVE values are not normally live across a call, so it should be
19955 worth doing early rematerialization even in VL-specific mode. */
19956 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19957 {
19958 machine_mode mode = (machine_mode) i;
19959 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19960 if (vec_flags & VEC_ANY_SVE)
19961 bitmap_set_bit (modes, i);
19962 }
19963 }
19964
19965 /* Override the default target speculation_safe_value. */
19966 static rtx
19967 aarch64_speculation_safe_value (machine_mode mode,
19968 rtx result, rtx val, rtx failval)
19969 {
19970 /* Maybe we should warn if falling back to hard barriers. They are
19971 likely to be noticeably more expensive than the alternative below. */
19972 if (!aarch64_track_speculation)
19973 return default_speculation_safe_value (mode, result, val, failval);
19974
19975 if (!REG_P (val))
19976 val = copy_to_mode_reg (mode, val);
19977
19978 if (!aarch64_reg_or_zero (failval, mode))
19979 failval = copy_to_mode_reg (mode, failval);
19980
19981 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19982 return result;
19983 }
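
/* Source-level illustration of the builtin this hook expands (the
   function and array names are arbitrary, not compiled here):

     int
     load_element (int *array, unsigned int idx, unsigned int len)
     {
       if (idx < len)
         return array[__builtin_speculation_safe_value (idx, 0)];
       return 0;
     }

   With -mtrack-speculation the index is forced to the failval 0 when
   the bounds check is executed under misspeculation, using the
   despeculate_copy sequence above; otherwise the generic hard
   speculation barrier is emitted instead.  */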
19984
19985 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19986 Look into the tuning structure for an estimate.
19987 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19988 Advanced SIMD 128 bits. */
19989
19990 static HOST_WIDE_INT
19991 aarch64_estimated_poly_value (poly_int64 val)
19992 {
19993 enum aarch64_sve_vector_bits_enum width_source
19994 = aarch64_tune_params.sve_width;
19995
19996 /* If we still don't have an estimate, use the default. */
19997 if (width_source == SVE_SCALABLE)
19998 return default_estimated_poly_value (val);
19999
20000 HOST_WIDE_INT over_128 = width_source - 128;
20001 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20002 }
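
/* For example, if the tuning structure sets sve_width to SVE_256, then
   over_128 is 128 and a poly_int64 such as 4 + 4x is estimated as
   4 + 4 * 128 / 128 = 8; with SVE_SCALABLE the generic default
   estimate is used instead.  */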
20003
20004
20005 /* Return true for types that could be supported as SIMD return or
20006 argument types. */
20007
20008 static bool
20009 supported_simd_type (tree t)
20010 {
20011 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20012 {
20013 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20014 return s == 1 || s == 2 || s == 4 || s == 8;
20015 }
20016 return false;
20017 }
20018
20019 /* Return true for types that currently are supported as SIMD return
20020 or argument types. */
20021
20022 static bool
20023 currently_supported_simd_type (tree t, tree b)
20024 {
20025 if (COMPLEX_FLOAT_TYPE_P (t))
20026 return false;
20027
20028 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20029 return false;
20030
20031 return supported_simd_type (t);
20032 }
20033
20034 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20035
20036 static int
20037 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20038 struct cgraph_simd_clone *clonei,
20039 tree base_type, int num)
20040 {
20041 tree t, ret_type, arg_type;
20042 unsigned int elt_bits, vec_bits, count;
20043
20044 if (!TARGET_SIMD)
20045 return 0;
20046
20047 if (clonei->simdlen
20048 && (clonei->simdlen < 2
20049 || clonei->simdlen > 1024
20050 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20051 {
20052 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20053 "unsupported simdlen %d", clonei->simdlen);
20054 return 0;
20055 }
20056
20057 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20058 if (TREE_CODE (ret_type) != VOID_TYPE
20059 && !currently_supported_simd_type (ret_type, base_type))
20060 {
20061 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20062 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20063 "GCC does not currently support mixed size types "
20064 "for %<simd%> functions");
20065 else if (supported_simd_type (ret_type))
20066 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20067 "GCC does not currently support return type %qT "
20068 "for %<simd%> functions", ret_type);
20069 else
20070 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20071 "unsupported return type %qT for %<simd%> functions",
20072 ret_type);
20073 return 0;
20074 }
20075
20076 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20077 {
20078 arg_type = TREE_TYPE (t);
20079
20080 if (!currently_supported_simd_type (arg_type, base_type))
20081 {
20082 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20083 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20084 "GCC does not currently support mixed size types "
20085 "for %<simd%> functions");
20086 else
20087 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20088 "GCC does not currently support argument type %qT "
20089 "for %<simd%> functions", arg_type);
20090 return 0;
20091 }
20092 }
20093
20094 clonei->vecsize_mangle = 'n';
20095 clonei->mask_mode = VOIDmode;
20096 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20097 if (clonei->simdlen == 0)
20098 {
20099 count = 2;
20100 vec_bits = (num == 0 ? 64 : 128);
20101 clonei->simdlen = vec_bits / elt_bits;
20102 }
20103 else
20104 {
20105 count = 1;
20106 vec_bits = clonei->simdlen * elt_bits;
20107 if (vec_bits != 64 && vec_bits != 128)
20108 {
20109 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20110 "GCC does not currently support simdlen %d for type %qT",
20111 clonei->simdlen, base_type);
20112 return 0;
20113 }
20114 }
20115 clonei->vecsize_int = vec_bits;
20116 clonei->vecsize_float = vec_bits;
20117 return count;
20118 }
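
/* Source-level illustration (not compiled here): for

     #pragma omp declare simd
     float scale (float x) { return 0.5f * x; }

   the base type is float (32 bits) and no simdlen is given, so the
   hook returns two clones with the Advanced SIMD 'n' mangling: one
   with simdlen 2 (64-bit vectors) and one with simdlen 4 (128-bit
   vectors).  An explicit simdlen whose vector size is neither 64 nor
   128 bits is warned about and rejected.  */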
20119
20120 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20121
20122 static void
20123 aarch64_simd_clone_adjust (struct cgraph_node *node)
20124 {
20125 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20126 use the correct ABI. */
20127
20128 tree t = TREE_TYPE (node->decl);
20129 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20130 TYPE_ATTRIBUTES (t));
20131 }
20132
20133 /* Implement TARGET_SIMD_CLONE_USABLE. */
20134
20135 static int
20136 aarch64_simd_clone_usable (struct cgraph_node *node)
20137 {
20138 switch (node->simdclone->vecsize_mangle)
20139 {
20140 case 'n':
20141 if (!TARGET_SIMD)
20142 return -1;
20143 return 0;
20144 default:
20145 gcc_unreachable ();
20146 }
20147 }
20148
20149 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
20150
20151 static int
20152 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20153 {
20154 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20155 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20156 return 0;
20157 return 1;
20158 }
20159
20160 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
20161
20162 static const char *
20163 aarch64_get_multilib_abi_name (void)
20164 {
20165 if (TARGET_BIG_END)
20166 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20167 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20168 }
20169
20170 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20171 global variable based guard, use the default; otherwise
20172 return a null tree. */
20173 static tree
20174 aarch64_stack_protect_guard (void)
20175 {
20176 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20177 return default_stack_protect_guard ();
20178
20179 return NULL_TREE;
20180 }
20181
20182 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20183 section at the end if needed. */
20184 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20185 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20186 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20187 void
20188 aarch64_file_end_indicate_exec_stack ()
20189 {
20190 file_end_indicate_exec_stack ();
20191
20192 unsigned feature_1_and = 0;
20193 if (aarch64_bti_enabled ())
20194 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20195
20196 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20197 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20198
20199 if (feature_1_and)
20200 {
20201 /* Generate .note.gnu.property section. */
20202 switch_to_section (get_section (".note.gnu.property",
20203 SECTION_NOTYPE, NULL));
20204
20205 /* PT_NOTE header: namesz, descsz, type.
20206 namesz = 4 ("GNU\0")
20207 descsz = 16 (Size of the program property array)
20208 [(12 + padding) * Number of array elements]
20209 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20210 assemble_align (POINTER_SIZE);
20211 assemble_integer (GEN_INT (4), 4, 32, 1);
20212 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20213 assemble_integer (GEN_INT (5), 4, 32, 1);
20214
20215 /* PT_NOTE name. */
20216 assemble_string ("GNU", 4);
20217
20218 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20219 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20220 datasz = 4
20221 data = feature_1_and. */
20222 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20223 assemble_integer (GEN_INT (4), 4, 32, 1);
20224 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20225
20226 /* Pad the size of the note to the required alignment. */
20227 assemble_align (POINTER_SIZE);
20228 }
20229 }
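
/* Illustrative layout of the note emitted above when both BTI and
   PAC-RET are enabled, for the LP64 ABI (a sketch of the data, not
   verbatim assembler output):

     namesz    = 4            "GNU" plus terminating NUL
     descsz    = 16           one 12-byte property padded to 8 bytes
     type      = 5            NT_GNU_PROPERTY_TYPE_0
     name      = "GNU\0"
     pr_type   = 0xc0000000   GNU_PROPERTY_AARCH64_FEATURE_1_AND
     pr_datasz = 4
     pr_data   = 0x3          BTI (bit 0) | PAC (bit 1)  */
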
20230 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20231 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20232 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20233
20234 /* Target-specific selftests. */
20235
20236 #if CHECKING_P
20237
20238 namespace selftest {
20239
20240 /* Selftest for the RTL loader.
20241 Verify that the RTL loader copes with a dump from
20242 print_rtx_function. This is essentially just a test that class
20243 function_reader can handle a real dump, but it also verifies
20244 that lookup_reg_by_dump_name correctly handles hard regs.
20245 The presence of hard reg names in the dump means that the test is
20246 target-specific, hence it is in this file. */
20247
20248 static void
20249 aarch64_test_loading_full_dump ()
20250 {
20251 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20252
20253 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20254
20255 rtx_insn *insn_1 = get_insn_by_uid (1);
20256 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20257
20258 rtx_insn *insn_15 = get_insn_by_uid (15);
20259 ASSERT_EQ (INSN, GET_CODE (insn_15));
20260 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20261
20262 /* Verify crtl->return_rtx. */
20263 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20264 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20265 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20266 }
20267
20268 /* Run all target-specific selftests. */
20269
20270 static void
20271 aarch64_run_selftests (void)
20272 {
20273 aarch64_test_loading_full_dump ();
20274 }
20275
20276 } // namespace selftest
20277
20278 #endif /* #if CHECKING_P */
20279
20280 #undef TARGET_STACK_PROTECT_GUARD
20281 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20282
20283 #undef TARGET_ADDRESS_COST
20284 #define TARGET_ADDRESS_COST aarch64_address_cost
20285
20286 /* This hook determines whether unnamed bitfields affect the alignment
20287 of the containing structure. The hook returns true if the structure
20288 should inherit the alignment requirements of an unnamed bitfield's
20289 type. */
20290 #undef TARGET_ALIGN_ANON_BITFIELD
20291 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20292
20293 #undef TARGET_ASM_ALIGNED_DI_OP
20294 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20295
20296 #undef TARGET_ASM_ALIGNED_HI_OP
20297 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20298
20299 #undef TARGET_ASM_ALIGNED_SI_OP
20300 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20301
20302 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20303 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20304 hook_bool_const_tree_hwi_hwi_const_tree_true
20305
20306 #undef TARGET_ASM_FILE_START
20307 #define TARGET_ASM_FILE_START aarch64_start_file
20308
20309 #undef TARGET_ASM_OUTPUT_MI_THUNK
20310 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20311
20312 #undef TARGET_ASM_SELECT_RTX_SECTION
20313 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20314
20315 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20316 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20317
20318 #undef TARGET_BUILD_BUILTIN_VA_LIST
20319 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20320
20321 #undef TARGET_CALLEE_COPIES
20322 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20323
20324 #undef TARGET_CAN_ELIMINATE
20325 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20326
20327 #undef TARGET_CAN_INLINE_P
20328 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20329
20330 #undef TARGET_CANNOT_FORCE_CONST_MEM
20331 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20332
20333 #undef TARGET_CASE_VALUES_THRESHOLD
20334 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20335
20336 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20337 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20338
20339 /* Only the least significant bit is used for initialization guard
20340 variables. */
20341 #undef TARGET_CXX_GUARD_MASK_BIT
20342 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20343
20344 #undef TARGET_C_MODE_FOR_SUFFIX
20345 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20346
20347 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20348 #undef TARGET_DEFAULT_TARGET_FLAGS
20349 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20350 #endif
20351
20352 #undef TARGET_CLASS_MAX_NREGS
20353 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20354
20355 #undef TARGET_BUILTIN_DECL
20356 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20357
20358 #undef TARGET_BUILTIN_RECIPROCAL
20359 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20360
20361 #undef TARGET_C_EXCESS_PRECISION
20362 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20363
20364 #undef TARGET_EXPAND_BUILTIN
20365 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20366
20367 #undef TARGET_EXPAND_BUILTIN_VA_START
20368 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20369
20370 #undef TARGET_FOLD_BUILTIN
20371 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20372
20373 #undef TARGET_FUNCTION_ARG
20374 #define TARGET_FUNCTION_ARG aarch64_function_arg
20375
20376 #undef TARGET_FUNCTION_ARG_ADVANCE
20377 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20378
20379 #undef TARGET_FUNCTION_ARG_BOUNDARY
20380 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20381
20382 #undef TARGET_FUNCTION_ARG_PADDING
20383 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20384
20385 #undef TARGET_GET_RAW_RESULT_MODE
20386 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20387 #undef TARGET_GET_RAW_ARG_MODE
20388 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20389
20390 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20391 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20392
20393 #undef TARGET_FUNCTION_VALUE
20394 #define TARGET_FUNCTION_VALUE aarch64_function_value
20395
20396 #undef TARGET_FUNCTION_VALUE_REGNO_P
20397 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20398
20399 #undef TARGET_GIMPLE_FOLD_BUILTIN
20400 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20401
20402 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20403 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20404
20405 #undef TARGET_INIT_BUILTINS
20406 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20407
20408 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20409 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20410 aarch64_ira_change_pseudo_allocno_class
20411
20412 #undef TARGET_LEGITIMATE_ADDRESS_P
20413 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20414
20415 #undef TARGET_LEGITIMATE_CONSTANT_P
20416 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20417
20418 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20419 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20420 aarch64_legitimize_address_displacement
20421
20422 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20423 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20424
20425 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20426 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20427 aarch64_libgcc_floating_mode_supported_p
20428
20429 #undef TARGET_MANGLE_TYPE
20430 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20431
20432 #undef TARGET_MEMORY_MOVE_COST
20433 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20434
20435 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20436 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20437
20438 #undef TARGET_MUST_PASS_IN_STACK
20439 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20440
20441 /* This target hook should return true if accesses to volatile bitfields
20442 should use the narrowest mode possible. It should return false if these
20443 accesses should use the bitfield container type. */
20444 #undef TARGET_NARROW_VOLATILE_BITFIELD
20445 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20446
20447 #undef TARGET_OPTION_OVERRIDE
20448 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20449
20450 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20451 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20452 aarch64_override_options_after_change
20453
20454 #undef TARGET_OPTION_SAVE
20455 #define TARGET_OPTION_SAVE aarch64_option_save
20456
20457 #undef TARGET_OPTION_RESTORE
20458 #define TARGET_OPTION_RESTORE aarch64_option_restore
20459
20460 #undef TARGET_OPTION_PRINT
20461 #define TARGET_OPTION_PRINT aarch64_option_print
20462
20463 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20464 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20465
20466 #undef TARGET_SET_CURRENT_FUNCTION
20467 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20468
20469 #undef TARGET_PASS_BY_REFERENCE
20470 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20471
20472 #undef TARGET_PREFERRED_RELOAD_CLASS
20473 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20474
20475 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20476 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20477
20478 #undef TARGET_PROMOTED_TYPE
20479 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20480
20481 #undef TARGET_SECONDARY_RELOAD
20482 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20483
20484 #undef TARGET_SHIFT_TRUNCATION_MASK
20485 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20486
20487 #undef TARGET_SETUP_INCOMING_VARARGS
20488 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20489
20490 #undef TARGET_STRUCT_VALUE_RTX
20491 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20492
20493 #undef TARGET_REGISTER_MOVE_COST
20494 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20495
20496 #undef TARGET_RETURN_IN_MEMORY
20497 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20498
20499 #undef TARGET_RETURN_IN_MSB
20500 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20501
20502 #undef TARGET_RTX_COSTS
20503 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20504
20505 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20506 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20507
20508 #undef TARGET_SCHED_ISSUE_RATE
20509 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20510
20511 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20512 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20513 aarch64_sched_first_cycle_multipass_dfa_lookahead
20514
20515 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20516 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20517 aarch64_first_cycle_multipass_dfa_lookahead_guard
20518
20519 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20520 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20521 aarch64_get_separate_components
20522
20523 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20524 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20525 aarch64_components_for_bb
20526
20527 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20528 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20529 aarch64_disqualify_components
20530
20531 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20532 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20533 aarch64_emit_prologue_components
20534
20535 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20536 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20537 aarch64_emit_epilogue_components
20538
20539 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20540 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20541 aarch64_set_handled_components
20542
20543 #undef TARGET_TRAMPOLINE_INIT
20544 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20545
20546 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20547 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20548
20549 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20550 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20551
20552 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20553 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20554 aarch64_builtin_support_vector_misalignment
20555
20556 #undef TARGET_ARRAY_MODE
20557 #define TARGET_ARRAY_MODE aarch64_array_mode
20558
20559 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20560 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20561
20562 #undef TARGET_VECTORIZE_ADD_STMT_COST
20563 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20564
20565 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20566 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20567 aarch64_builtin_vectorization_cost
20568
20569 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20570 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20571
20572 #undef TARGET_VECTORIZE_BUILTINS
20573 #define TARGET_VECTORIZE_BUILTINS
20574
20575 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20576 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20577 aarch64_builtin_vectorized_function
20578
20579 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20580 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20581 aarch64_autovectorize_vector_sizes
20582
20583 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20584 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20585 aarch64_atomic_assign_expand_fenv
20586
20587 /* Section anchor support. */
20588
20589 #undef TARGET_MIN_ANCHOR_OFFSET
20590 #define TARGET_MIN_ANCHOR_OFFSET -256
20591
20592 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20593 byte offset; we can do much more for larger data types, but have no way
20594 to determine the size of the access. We assume accesses are aligned. */
20595 #undef TARGET_MAX_ANCHOR_OFFSET
20596 #define TARGET_MAX_ANCHOR_OFFSET 4095
20597
20598 #undef TARGET_VECTOR_ALIGNMENT
20599 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20600
20601 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20602 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20603 aarch64_vectorize_preferred_vector_alignment
20604 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20605 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20606 aarch64_simd_vector_alignment_reachable
20607
20608 /* vec_perm support. */
20609
20610 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20611 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20612 aarch64_vectorize_vec_perm_const
20613
20614 #undef TARGET_VECTORIZE_GET_MASK_MODE
20615 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20616 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20617 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20618 aarch64_empty_mask_is_expensive
20619 #undef TARGET_PREFERRED_ELSE_VALUE
20620 #define TARGET_PREFERRED_ELSE_VALUE \
20621 aarch64_preferred_else_value
20622
20623 #undef TARGET_INIT_LIBFUNCS
20624 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20625
20626 #undef TARGET_FIXED_CONDITION_CODE_REGS
20627 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20628
20629 #undef TARGET_FLAGS_REGNUM
20630 #define TARGET_FLAGS_REGNUM CC_REGNUM
20631
20632 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20633 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20634
20635 #undef TARGET_ASAN_SHADOW_OFFSET
20636 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20637
20638 #undef TARGET_LEGITIMIZE_ADDRESS
20639 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20640
20641 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20642 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20643
20644 #undef TARGET_CAN_USE_DOLOOP_P
20645 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20646
20647 #undef TARGET_SCHED_ADJUST_PRIORITY
20648 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20649
20650 #undef TARGET_SCHED_MACRO_FUSION_P
20651 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20652
20653 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20654 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20655
20656 #undef TARGET_SCHED_FUSION_PRIORITY
20657 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20658
20659 #undef TARGET_UNSPEC_MAY_TRAP_P
20660 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20661
20662 #undef TARGET_USE_PSEUDO_PIC_REG
20663 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20664
20665 #undef TARGET_PRINT_OPERAND
20666 #define TARGET_PRINT_OPERAND aarch64_print_operand
20667
20668 #undef TARGET_PRINT_OPERAND_ADDRESS
20669 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20670
20671 #undef TARGET_OPTAB_SUPPORTED_P
20672 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20673
20674 #undef TARGET_OMIT_STRUCT_RETURN_REG
20675 #define TARGET_OMIT_STRUCT_RETURN_REG true
20676
20677 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20678 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20679 aarch64_dwarf_poly_indeterminate_value
20680
20681 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20682 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20683 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20684
20685 #undef TARGET_HARD_REGNO_NREGS
20686 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20687 #undef TARGET_HARD_REGNO_MODE_OK
20688 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20689
20690 #undef TARGET_MODES_TIEABLE_P
20691 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20692
20693 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20694 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20695 aarch64_hard_regno_call_part_clobbered
20696
20697 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20698 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20699 aarch64_remove_extra_call_preserved_regs
20700
20701 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20702 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20703 aarch64_return_call_with_max_clobbers
20704
20705 #undef TARGET_CONSTANT_ALIGNMENT
20706 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20707
20708 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20709 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20710 aarch64_stack_clash_protection_alloca_probe_range
20711
20712 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20713 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20714
20715 #undef TARGET_CAN_CHANGE_MODE_CLASS
20716 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20717
20718 #undef TARGET_SELECT_EARLY_REMAT_MODES
20719 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20720
20721 #undef TARGET_SPECULATION_SAFE_VALUE
20722 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20723
20724 #undef TARGET_ESTIMATED_POLY_VALUE
20725 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20726
20727 #undef TARGET_ATTRIBUTE_TABLE
20728 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20729
20730 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20731 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20732 aarch64_simd_clone_compute_vecsize_and_simdlen
20733
20734 #undef TARGET_SIMD_CLONE_ADJUST
20735 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20736
20737 #undef TARGET_SIMD_CLONE_USABLE
20738 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20739
20740 #undef TARGET_COMP_TYPE_ATTRIBUTES
20741 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20742
20743 #undef TARGET_GET_MULTILIB_ABI_NAME
20744 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20745
20746 #if CHECKING_P
20747 #undef TARGET_RUN_TARGET_SELFTESTS
20748 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20749 #endif /* #if CHECKING_P */
20750
20751 #undef TARGET_ASM_POST_CFI_STARTPROC
20752 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20753
20754 struct gcc_target targetm = TARGET_INITIALIZER;
20755
20756 #include "gt-aarch64.h"