gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
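/* For example, POINTER_BYTES is 8 under LP64 and 4 under ILP32.  */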
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
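/* For example, base_in = 0 and step_in = 1 describe the vector
   { 0, 1, 2, ... }, which maps onto the SVE INDEX instruction.  */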
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387          their cost higher than memmov_cost (the actual costs are 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
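/* Note that the condition codes above are ordered so that each code and its
   inverse differ only in the low bit (AARCH64_EQ/AARCH64_NE,
   AARCH64_GE/AARCH64_LT, and so on), which is what makes the XOR with 1
   sufficient here.  */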
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251      * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
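/* For example, if both the allocno class and the best class are
   POINTER_AND_FP_REGS, a DFmode or vector pseudo ends up in FP_REGS while
   a DImode pseudo ends up in GENERAL_REGS.  */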
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
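/* For example, a DFmode PLUS_EXPR chain gets a width of 1, keeping the
   additions in a serial chain that FMA formation can consume, whereas a
   DFmode MULT_EXPR chain uses fp_reassoc_width.  */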
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
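/* For example, under the AAPCS64 DWARF numbering (R0 at 0, V0 at 64),
   x5 maps to DWARF register 5 and v3 to DWARF register 67.  */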
1484
1485 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1486 integer, otherwise return X unmodified. */
1487 static rtx
1488 aarch64_bit_representation (rtx x)
1489 {
1490 if (CONST_DOUBLE_P (x))
1491 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1492 return x;
1493 }
1494
1495 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1496 static bool
1497 aarch64_advsimd_struct_mode_p (machine_mode mode)
1498 {
1499 return (TARGET_SIMD
1500 && (mode == OImode || mode == CImode || mode == XImode));
1501 }
1502
1503 /* Return true if MODE is an SVE predicate mode. */
1504 static bool
1505 aarch64_sve_pred_mode_p (machine_mode mode)
1506 {
1507 return (TARGET_SVE
1508 && (mode == VNx16BImode
1509 || mode == VNx8BImode
1510 || mode == VNx4BImode
1511 || mode == VNx2BImode));
1512 }
1513
1514 /* Three mutually-exclusive flags describing a vector or predicate type. */
1515 const unsigned int VEC_ADVSIMD = 1;
1516 const unsigned int VEC_SVE_DATA = 2;
1517 const unsigned int VEC_SVE_PRED = 4;
1518 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1519 a structure of 2, 3 or 4 vectors. */
1520 const unsigned int VEC_STRUCT = 8;
1521 /* Useful combinations of the above. */
1522 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1523 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1524
1525 /* Return a set of flags describing the vector properties of mode MODE.
1526 Ignore modes that are not supported by the current target. */
1527 static unsigned int
1528 aarch64_classify_vector_mode (machine_mode mode)
1529 {
1530 if (aarch64_advsimd_struct_mode_p (mode))
1531 return VEC_ADVSIMD | VEC_STRUCT;
1532
1533 if (aarch64_sve_pred_mode_p (mode))
1534 return VEC_SVE_PRED;
1535
1536 /* Make the decision based on the mode's enum value rather than its
1537 properties, so that we keep the correct classification regardless
1538 of -msve-vector-bits. */
1539 switch (mode)
1540 {
1541 /* Single SVE vectors. */
1542 case E_VNx16QImode:
1543 case E_VNx8HImode:
1544 case E_VNx4SImode:
1545 case E_VNx2DImode:
1546 case E_VNx8HFmode:
1547 case E_VNx4SFmode:
1548 case E_VNx2DFmode:
1549 return TARGET_SVE ? VEC_SVE_DATA : 0;
1550
1551 /* x2 SVE vectors. */
1552 case E_VNx32QImode:
1553 case E_VNx16HImode:
1554 case E_VNx8SImode:
1555 case E_VNx4DImode:
1556 case E_VNx16HFmode:
1557 case E_VNx8SFmode:
1558 case E_VNx4DFmode:
1559 /* x3 SVE vectors. */
1560 case E_VNx48QImode:
1561 case E_VNx24HImode:
1562 case E_VNx12SImode:
1563 case E_VNx6DImode:
1564 case E_VNx24HFmode:
1565 case E_VNx12SFmode:
1566 case E_VNx6DFmode:
1567 /* x4 SVE vectors. */
1568 case E_VNx64QImode:
1569 case E_VNx32HImode:
1570 case E_VNx16SImode:
1571 case E_VNx8DImode:
1572 case E_VNx32HFmode:
1573 case E_VNx16SFmode:
1574 case E_VNx8DFmode:
1575 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1576
1577 /* 64-bit Advanced SIMD vectors. */
1578 case E_V8QImode:
1579 case E_V4HImode:
1580 case E_V2SImode:
1581 /* ...E_V1DImode doesn't exist. */
1582 case E_V4HFmode:
1583 case E_V2SFmode:
1584 case E_V1DFmode:
1585 /* 128-bit Advanced SIMD vectors. */
1586 case E_V16QImode:
1587 case E_V8HImode:
1588 case E_V4SImode:
1589 case E_V2DImode:
1590 case E_V8HFmode:
1591 case E_V4SFmode:
1592 case E_V2DFmode:
1593 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1594
1595 default:
1596 return 0;
1597 }
1598 }
1599
1600 /* Return true if MODE is any of the data vector modes, including
1601 structure modes. */
1602 static bool
1603 aarch64_vector_data_mode_p (machine_mode mode)
1604 {
1605 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1606 }
1607
1608 /* Return true if MODE is an SVE data vector mode; either a single vector
1609 or a structure of vectors. */
1610 static bool
1611 aarch64_sve_data_mode_p (machine_mode mode)
1612 {
1613 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1614 }
1615
1616 /* Implement target hook TARGET_ARRAY_MODE. */
1617 static opt_machine_mode
1618 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1619 {
1620 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1621 && IN_RANGE (nelems, 2, 4))
1622 return mode_for_vector (GET_MODE_INNER (mode),
1623 GET_MODE_NUNITS (mode) * nelems);
1624
1625 return opt_machine_mode ();
1626 }
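/* For example, asking for an array of three VNx4SImode vectors yields
   VNx12SImode, one of the x3 SVE structure modes handled in
   aarch64_classify_vector_mode above.  */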
1627
1628 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1629 static bool
1630 aarch64_array_mode_supported_p (machine_mode mode,
1631 unsigned HOST_WIDE_INT nelems)
1632 {
1633 if (TARGET_SIMD
1634 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1635 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1636 && (nelems >= 2 && nelems <= 4))
1637 return true;
1638
1639 return false;
1640 }
1641
1642 /* Return the SVE predicate mode to use for elements that have
1643 ELEM_NBYTES bytes, if such a mode exists. */
1644
1645 opt_machine_mode
1646 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1647 {
1648 if (TARGET_SVE)
1649 {
1650 if (elem_nbytes == 1)
1651 return VNx16BImode;
1652 if (elem_nbytes == 2)
1653 return VNx8BImode;
1654 if (elem_nbytes == 4)
1655 return VNx4BImode;
1656 if (elem_nbytes == 8)
1657 return VNx2BImode;
1658 }
1659 return opt_machine_mode ();
1660 }
1661
1662 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1663
1664 static opt_machine_mode
1665 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1666 {
1667 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1668 {
1669 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1670 machine_mode pred_mode;
1671 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1672 return pred_mode;
1673 }
1674
1675 return default_get_mask_mode (nunits, nbytes);
1676 }
1677
1678 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1679
1680 static opt_machine_mode
1681 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1682 {
1683 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1684 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1685 machine_mode mode;
1686 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1687 if (inner_mode == GET_MODE_INNER (mode)
1688 && known_eq (nunits, GET_MODE_NUNITS (mode))
1689 && aarch64_sve_data_mode_p (mode))
1690 return mode;
1691 return opt_machine_mode ();
1692 }
1693
1694 /* Return the integer element mode associated with SVE mode MODE. */
1695
1696 static scalar_int_mode
1697 aarch64_sve_element_int_mode (machine_mode mode)
1698 {
1699 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1700 GET_MODE_NUNITS (mode));
1701 return int_mode_for_size (elt_bits, 0).require ();
1702 }
1703
1704 /* Return the integer vector mode associated with SVE mode MODE.
1705 Unlike mode_for_int_vector, this can handle the case in which
1706 MODE is a predicate (and thus has a different total size). */
1707
1708 static machine_mode
1709 aarch64_sve_int_mode (machine_mode mode)
1710 {
1711 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1712 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1713 }
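/* For example, VNx4SFmode has 32-bit elements, so its integer element mode
   is SImode and aarch64_sve_int_mode returns VNx4SImode.  */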
1714
1715 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1716 prefer to use the first arithmetic operand as the else value if
1717 the else value doesn't matter, since that exactly matches the SVE
1718 destructive merging form. For ternary operations we could either
1719 pick the first operand and use FMAD-like instructions or the last
1720 operand and use FMLA-like instructions; the latter seems more
1721 natural. */
1722
1723 static tree
1724 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1725 {
1726 return nops == 3 ? ops[2] : ops[0];
1727 }
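/* For example, a predicated addition with operands (a, b) prefers a as the
   else value, matching the destructive form ADD Zd, Pg/M, Zd, Zm in which
   the first source register is also the destination.  */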
1728
1729 /* Implement TARGET_HARD_REGNO_NREGS. */
1730
1731 static unsigned int
1732 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1733 {
1734 /* ??? Logically we should only need to provide a value when
1735 HARD_REGNO_MODE_OK says that the combination is valid,
1736 but at the moment we need to handle all modes. Just ignore
1737 any runtime parts for registers that can't store them. */
1738 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1739 switch (aarch64_regno_regclass (regno))
1740 {
1741 case FP_REGS:
1742 case FP_LO_REGS:
1743 case FP_LO8_REGS:
1744 if (aarch64_sve_data_mode_p (mode))
1745 return exact_div (GET_MODE_SIZE (mode),
1746 BYTES_PER_SVE_VECTOR).to_constant ();
1747 return CEIL (lowest_size, UNITS_PER_VREG);
1748 case PR_REGS:
1749 case PR_LO_REGS:
1750 case PR_HI_REGS:
1751 return 1;
1752 default:
1753 return CEIL (lowest_size, UNITS_PER_WORD);
1754 }
1755 gcc_unreachable ();
1756 }
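/* For example, a V4SImode value occupies a single FP/SIMD register, whereas
   a TImode value needs two general registers; SVE data modes always occupy
   a whole number of Z registers.  */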
1757
1758 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1759
1760 static bool
1761 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1762 {
1763 if (GET_MODE_CLASS (mode) == MODE_CC)
1764 return regno == CC_REGNUM;
1765
1766 if (regno == VG_REGNUM)
1767 /* This must have the same size as _Unwind_Word. */
1768 return mode == DImode;
1769
1770 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1771 if (vec_flags & VEC_SVE_PRED)
1772 return PR_REGNUM_P (regno);
1773
1774 if (PR_REGNUM_P (regno))
1775 return 0;
1776
1777 if (regno == SP_REGNUM)
1778 /* The purpose of comparing with ptr_mode is to support the
1779 global register variable associated with the stack pointer
1780 register via the syntax of asm ("wsp") in ILP32. */
1781 return mode == Pmode || mode == ptr_mode;
1782
1783 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1784 return mode == Pmode;
1785
1786 if (GP_REGNUM_P (regno))
1787 {
1788 if (known_le (GET_MODE_SIZE (mode), 8))
1789 return true;
1790 else if (known_le (GET_MODE_SIZE (mode), 16))
1791 return (regno & 1) == 0;
1792 }
1793 else if (FP_REGNUM_P (regno))
1794 {
1795 if (vec_flags & VEC_STRUCT)
1796 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1797 else
1798 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1799 }
1800
1801 return false;
1802 }
1803
1804 /* Return true if this is a definition of a vectorized simd function. */
1805
1806 static bool
1807 aarch64_simd_decl_p (tree fndecl)
1808 {
1809 tree fntype;
1810
1811 if (fndecl == NULL)
1812 return false;
1813 fntype = TREE_TYPE (fndecl);
1814 if (fntype == NULL)
1815 return false;
1816
1817 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1818 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1819 return true;
1820
1821 return false;
1822 }
1823
1824 /* Return the mode a register save/restore should use. DImode for integer
1825 registers, DFmode for FP registers in non-SIMD functions (they only save
1826 the bottom half of a 128 bit register), or TFmode for FP registers in
1827 SIMD functions. */
1828
1829 static machine_mode
1830 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1831 {
1832 return GP_REGNUM_P (regno)
1833 ? E_DImode
1834 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1835 }
1836
1837 /* Return true if the instruction is a call to a SIMD function, false
1838 if it is not a SIMD function or if we do not know anything about
1839 the function. */
1840
1841 static bool
1842 aarch64_simd_call_p (rtx_insn *insn)
1843 {
1844 rtx symbol;
1845 rtx call;
1846 tree fndecl;
1847
1848 gcc_assert (CALL_P (insn));
1849 call = get_call_rtx_from (insn);
1850 symbol = XEXP (XEXP (call, 0), 0);
1851 if (GET_CODE (symbol) != SYMBOL_REF)
1852 return false;
1853 fndecl = SYMBOL_REF_DECL (symbol);
1854 if (!fndecl)
1855 return false;
1856
1857 return aarch64_simd_decl_p (fndecl);
1858 }
1859
1860 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1861 a function that uses the SIMD ABI, take advantage of the extra
1862 call-preserved registers that the ABI provides. */
1863
1864 void
1865 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1866 HARD_REG_SET *return_set)
1867 {
1868 if (aarch64_simd_call_p (insn))
1869 {
1870 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1871 if (FP_SIMD_SAVED_REGNUM_P (regno))
1872 CLEAR_HARD_REG_BIT (*return_set, regno);
1873 }
1874 }
1875
1876 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1877 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1878 clobbers the top 64 bits when restoring the bottom 64 bits. */
1879
1880 static bool
1881 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1882 machine_mode mode)
1883 {
1884 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1885 return FP_REGNUM_P (regno)
1886 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1887 }
1888
1889 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1890
1891 rtx_insn *
1892 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1893 {
1894 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1895
1896 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1897 return call_1;
1898 else
1899 return call_2;
1900 }
1901
1902 /* Implement REGMODE_NATURAL_SIZE. */
1903 poly_uint64
1904 aarch64_regmode_natural_size (machine_mode mode)
1905 {
1906 /* The natural size for SVE data modes is one SVE data vector,
1907 and similarly for predicates. We can't independently modify
1908 anything smaller than that. */
1909 /* ??? For now, only do this for variable-width SVE registers.
1910 Doing it for constant-sized registers breaks lower-subreg.c. */
1911 /* ??? And once that's fixed, we should probably have similar
1912 code for Advanced SIMD. */
1913 if (!aarch64_sve_vg.is_constant ())
1914 {
1915 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1916 if (vec_flags & VEC_SVE_PRED)
1917 return BYTES_PER_SVE_PRED;
1918 if (vec_flags & VEC_SVE_DATA)
1919 return BYTES_PER_SVE_VECTOR;
1920 }
1921 return UNITS_PER_WORD;
1922 }
1923
1924 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1925 machine_mode
1926 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1927 machine_mode mode)
1928 {
1929 /* The predicate mode determines which bits are significant and
1930 which are "don't care". Decreasing the number of lanes would
1931 lose data while increasing the number of lanes would make bits
1932 unnecessarily significant. */
1933 if (PR_REGNUM_P (regno))
1934 return mode;
1935 if (known_ge (GET_MODE_SIZE (mode), 4))
1936 return mode;
1937 else
1938 return SImode;
1939 }
1940
1941 /* Return true if I's bits are consecutive ones from the MSB. */
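/* For example (illustrative): I == 0xffffffffffff0000 gives -I == 0x10000,
   whose exact_log2 is 16 rather than -1, so the function returns true;
   I == 0xff00 gives a negation that is not a power of two, so the
   function returns false.  */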
1942 bool
1943 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1944 {
1945 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1946 }
1947
1948 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1949 that strcpy from constants will be faster. */
1950
1951 static HOST_WIDE_INT
1952 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1953 {
1954 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1955 return MAX (align, BITS_PER_WORD);
1956 return align;
1957 }
1958
1959 /* Return true if calls to DECL should be treated as
1960 long-calls (i.e. called via a register). */
1961 static bool
1962 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1963 {
1964 return false;
1965 }
1966
1967 /* Return true if calls to symbol-ref SYM should be treated as
1968 long-calls (i.e. called via a register). */
1969 bool
1970 aarch64_is_long_call_p (rtx sym)
1971 {
1972 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1973 }
1974
1975 /* Return true if calls to symbol-ref SYM should not go through
1976 plt stubs. */
1977
1978 bool
1979 aarch64_is_noplt_call_p (rtx sym)
1980 {
1981 const_tree decl = SYMBOL_REF_DECL (sym);
1982
1983 if (flag_pic
1984 && decl
1985 && (!flag_plt
1986 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1987 && !targetm.binds_local_p (decl))
1988 return true;
1989
1990 return false;
1991 }
1992
1993 /* Return true if the offsets to a zero/sign-extract operation
1994 represent an expression that matches an extend operation. The
1995 operands represent the parameters from
1996
1997 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
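/* For example, MULT_IMM == 4 with EXTRACT_IMM == 34 satisfies the checks
   below: extracting the low 34 bits of (reg * 4) is equivalent to
   extending the low 32 bits of the register and then shifting the
   result left by 2, which is the extend-plus-shift form used in
   addressing modes.  */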
1998 bool
1999 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2000 rtx extract_imm)
2001 {
2002 HOST_WIDE_INT mult_val, extract_val;
2003
2004 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2005 return false;
2006
2007 mult_val = INTVAL (mult_imm);
2008 extract_val = INTVAL (extract_imm);
2009
2010 if (extract_val > 8
2011 && extract_val < GET_MODE_BITSIZE (mode)
2012 && exact_log2 (extract_val & ~7) > 0
2013 && (extract_val & 7) <= 4
2014 && mult_val == (1 << (extract_val & 7)))
2015 return true;
2016
2017 return false;
2018 }
2019
2020 /* Emit an insn that's a simple single-set. Both the operands must be
2021 known to be valid. */
2022 inline static rtx_insn *
2023 emit_set_insn (rtx x, rtx y)
2024 {
2025 return emit_insn (gen_rtx_SET (x, y));
2026 }
2027
2028 /* X and Y are two things to compare using CODE. Emit the compare insn and
2029 return the rtx for the CC register in the proper mode. */
2030 rtx
2031 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2032 {
2033 machine_mode mode = SELECT_CC_MODE (code, x, y);
2034 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2035
2036 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2037 return cc_reg;
2038 }
2039
2040 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2041
2042 static rtx
2043 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2044 machine_mode y_mode)
2045 {
2046 if (y_mode == E_QImode || y_mode == E_HImode)
2047 {
2048 if (CONST_INT_P (y))
2049 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2050 else
2051 {
2052 rtx t, cc_reg;
2053 machine_mode cc_mode;
2054
2055 t = gen_rtx_ZERO_EXTEND (SImode, y);
2056 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2057 cc_mode = CC_SWPmode;
2058 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2059 emit_set_insn (cc_reg, t);
2060 return cc_reg;
2061 }
2062 }
2063
2064 return aarch64_gen_compare_reg (code, x, y);
2065 }
2066
2067 /* Build the SYMBOL_REF for __tls_get_addr. */
2068
2069 static GTY(()) rtx tls_get_addr_libfunc;
2070
2071 rtx
2072 aarch64_tls_get_addr (void)
2073 {
2074 if (!tls_get_addr_libfunc)
2075 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2076 return tls_get_addr_libfunc;
2077 }
2078
2079 /* Return the TLS model to use for ADDR. */
2080
2081 static enum tls_model
2082 tls_symbolic_operand_type (rtx addr)
2083 {
2084 enum tls_model tls_kind = TLS_MODEL_NONE;
2085 if (GET_CODE (addr) == CONST)
2086 {
2087 poly_int64 addend;
2088 rtx sym = strip_offset (addr, &addend);
2089 if (GET_CODE (sym) == SYMBOL_REF)
2090 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2091 }
2092 else if (GET_CODE (addr) == SYMBOL_REF)
2093 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2094
2095 return tls_kind;
2096 }
2097
2098 /* We'll allow lo_sums in addresses in our legitimate addresses,
2099 so that combine can take care of combining addresses where
2100 necessary, but for generation purposes we'll generate the address
2101 as:
2102 RTL Absolute
2103 tmp = hi (symbol_ref); adrp x1, foo
2104 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2105 nop
2106
2107 PIC TLS
2108 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2109 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2110 bl __tls_get_addr
2111 nop
2112
2113 Load TLS symbol, depending on TLS mechanism and TLS access model.
2114
2115 Global Dynamic - Traditional TLS:
2116 adrp tmp, :tlsgd:imm
2117 add dest, tmp, #:tlsgd_lo12:imm
2118 bl __tls_get_addr
2119
2120 Global Dynamic - TLS Descriptors:
2121 adrp dest, :tlsdesc:imm
2122 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2123 add dest, dest, #:tlsdesc_lo12:imm
2124 blr tmp
2125 mrs tp, tpidr_el0
2126 add dest, dest, tp
2127
2128 Initial Exec:
2129 mrs tp, tpidr_el0
2130 adrp tmp, :gottprel:imm
2131 ldr dest, [tmp, #:gottprel_lo12:imm]
2132 add dest, dest, tp
2133
2134 Local Exec:
2135 mrs tp, tpidr_el0
2136 add t0, tp, #:tprel_hi12:imm, lsl #12
2137 add t0, t0, #:tprel_lo12_nc:imm
2138 */
2139
2140 static void
2141 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2142 enum aarch64_symbol_type type)
2143 {
2144 switch (type)
2145 {
2146 case SYMBOL_SMALL_ABSOLUTE:
2147 {
2148 /* In ILP32, the mode of dest can be either SImode or DImode. */
2149 rtx tmp_reg = dest;
2150 machine_mode mode = GET_MODE (dest);
2151
2152 gcc_assert (mode == Pmode || mode == ptr_mode);
2153
2154 if (can_create_pseudo_p ())
2155 tmp_reg = gen_reg_rtx (mode);
2156
2157 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2158 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2159 return;
2160 }
2161
2162 case SYMBOL_TINY_ABSOLUTE:
2163 emit_insn (gen_rtx_SET (dest, imm));
2164 return;
2165
2166 case SYMBOL_SMALL_GOT_28K:
2167 {
2168 machine_mode mode = GET_MODE (dest);
2169 rtx gp_rtx = pic_offset_table_rtx;
2170 rtx insn;
2171 rtx mem;
2172
2173 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2174 here before RTL expansion. Tree IVOPTS will generate an RTL pattern
2175 to compute rtx costs, in which case pic_offset_table_rtx is not
2176 initialized. In that case there is no need to generate the first
2177 adrp instruction, as the final cost for global variable access is
2178 one instruction. */
2179 if (gp_rtx != NULL)
2180 {
2181 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2182 are using the page base as the GOT base, the first page may be wasted;
2183 in the worst case there is only 28K of space for the GOT).
2184
2185 The generated instruction sequence for accessing a global variable
2186 is:
2187
2188 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2189
2190 Only one instruction is needed. But we must initialize
2191 pic_offset_table_rtx properly. We generate an initialization insn for
2192 every global access, and allow CSE to remove all the redundant ones.
2193
2194 The final instruction sequence will look like the following
2195 when multiple global variables are accessed.
2196
2197 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2198
2199 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2201 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2202 ... */
2203
2204 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2205 crtl->uses_pic_offset_table = 1;
2206 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2207
2208 if (mode != GET_MODE (gp_rtx))
2209 gp_rtx = gen_lowpart (mode, gp_rtx);
2210
2211 }
2212
2213 if (mode == ptr_mode)
2214 {
2215 if (mode == DImode)
2216 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2217 else
2218 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2219
2220 mem = XVECEXP (SET_SRC (insn), 0, 0);
2221 }
2222 else
2223 {
2224 gcc_assert (mode == Pmode);
2225
2226 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2227 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2228 }
2229
2230 /* The operand is expected to be a MEM. Whenever the related insn
2231 pattern changes, the code above that calculates MEM should be
2232 updated. */
2233 gcc_assert (GET_CODE (mem) == MEM);
2234 MEM_READONLY_P (mem) = 1;
2235 MEM_NOTRAP_P (mem) = 1;
2236 emit_insn (insn);
2237 return;
2238 }
2239
2240 case SYMBOL_SMALL_GOT_4G:
2241 {
2242 /* In ILP32, the mode of dest can be either SImode or DImode,
2243 while the got entry is always of SImode size. The mode of
2244 dest depends on how dest is used: if dest is assigned to a
2245 pointer (e.g. in memory), it has SImode; it may have
2246 DImode if dest is dereferenced to access memory.
2247 This is why we have to handle three different ldr_got_small
2248 patterns here (two patterns for ILP32). */
2249
2250 rtx insn;
2251 rtx mem;
2252 rtx tmp_reg = dest;
2253 machine_mode mode = GET_MODE (dest);
2254
2255 if (can_create_pseudo_p ())
2256 tmp_reg = gen_reg_rtx (mode);
2257
2258 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2259 if (mode == ptr_mode)
2260 {
2261 if (mode == DImode)
2262 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2263 else
2264 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2265
2266 mem = XVECEXP (SET_SRC (insn), 0, 0);
2267 }
2268 else
2269 {
2270 gcc_assert (mode == Pmode);
2271
2272 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2273 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2274 }
2275
2276 gcc_assert (GET_CODE (mem) == MEM);
2277 MEM_READONLY_P (mem) = 1;
2278 MEM_NOTRAP_P (mem) = 1;
2279 emit_insn (insn);
2280 return;
2281 }
2282
2283 case SYMBOL_SMALL_TLSGD:
2284 {
2285 rtx_insn *insns;
2286 machine_mode mode = GET_MODE (dest);
2287 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2288
2289 start_sequence ();
2290 if (TARGET_ILP32)
2291 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2292 else
2293 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2294 insns = get_insns ();
2295 end_sequence ();
2296
2297 RTL_CONST_CALL_P (insns) = 1;
2298 emit_libcall_block (insns, dest, result, imm);
2299 return;
2300 }
2301
2302 case SYMBOL_SMALL_TLSDESC:
2303 {
2304 machine_mode mode = GET_MODE (dest);
2305 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2306 rtx tp;
2307
2308 gcc_assert (mode == Pmode || mode == ptr_mode);
2309
2310 /* In ILP32, the got entry is always of SImode size. Unlike
2311 small GOT, the dest is fixed at reg 0. */
2312 if (TARGET_ILP32)
2313 emit_insn (gen_tlsdesc_small_si (imm));
2314 else
2315 emit_insn (gen_tlsdesc_small_di (imm));
2316 tp = aarch64_load_tp (NULL);
2317
2318 if (mode != Pmode)
2319 tp = gen_lowpart (mode, tp);
2320
2321 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2322 if (REG_P (dest))
2323 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2324 return;
2325 }
2326
2327 case SYMBOL_SMALL_TLSIE:
2328 {
2329 /* In ILP32, the mode of dest can be either SImode or DImode,
2330 while the got entry is always of SImode size. The mode of
2331 dest depends on how dest is used: if dest is assigned to a
2332 pointer (e.g. in memory), it has SImode; it may have
2333 DImode if dest is dereferenced to access memory.
2334 This is why we have to handle three different tlsie_small
2335 patterns here (two patterns for ILP32). */
2336 machine_mode mode = GET_MODE (dest);
2337 rtx tmp_reg = gen_reg_rtx (mode);
2338 rtx tp = aarch64_load_tp (NULL);
2339
2340 if (mode == ptr_mode)
2341 {
2342 if (mode == DImode)
2343 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2344 else
2345 {
2346 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2347 tp = gen_lowpart (mode, tp);
2348 }
2349 }
2350 else
2351 {
2352 gcc_assert (mode == Pmode);
2353 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2354 }
2355
2356 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2357 if (REG_P (dest))
2358 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2359 return;
2360 }
2361
2362 case SYMBOL_TLSLE12:
2363 case SYMBOL_TLSLE24:
2364 case SYMBOL_TLSLE32:
2365 case SYMBOL_TLSLE48:
2366 {
2367 machine_mode mode = GET_MODE (dest);
2368 rtx tp = aarch64_load_tp (NULL);
2369
2370 if (mode != Pmode)
2371 tp = gen_lowpart (mode, tp);
2372
2373 switch (type)
2374 {
2375 case SYMBOL_TLSLE12:
2376 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2377 (dest, tp, imm));
2378 break;
2379 case SYMBOL_TLSLE24:
2380 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2381 (dest, tp, imm));
2382 break;
2383 case SYMBOL_TLSLE32:
2384 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2385 (dest, imm));
2386 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2387 (dest, dest, tp));
2388 break;
2389 case SYMBOL_TLSLE48:
2390 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2391 (dest, imm));
2392 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2393 (dest, dest, tp));
2394 break;
2395 default:
2396 gcc_unreachable ();
2397 }
2398
2399 if (REG_P (dest))
2400 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2401 return;
2402 }
2403
2404 case SYMBOL_TINY_GOT:
2405 emit_insn (gen_ldr_got_tiny (dest, imm));
2406 return;
2407
2408 case SYMBOL_TINY_TLSIE:
2409 {
2410 machine_mode mode = GET_MODE (dest);
2411 rtx tp = aarch64_load_tp (NULL);
2412
2413 if (mode == ptr_mode)
2414 {
2415 if (mode == DImode)
2416 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2417 else
2418 {
2419 tp = gen_lowpart (mode, tp);
2420 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2421 }
2422 }
2423 else
2424 {
2425 gcc_assert (mode == Pmode);
2426 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2427 }
2428
2429 if (REG_P (dest))
2430 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2431 return;
2432 }
2433
2434 default:
2435 gcc_unreachable ();
2436 }
2437 }
2438
2439 /* Emit a move from SRC to DEST. Assume that the move expanders can
2440 handle all moves if !can_create_pseudo_p (). The distinction is
2441 important because, unlike emit_move_insn, the move expanders know
2442 how to force Pmode objects into the constant pool even when the
2443 constant pool address is not itself legitimate. */
2444 static rtx
2445 aarch64_emit_move (rtx dest, rtx src)
2446 {
2447 return (can_create_pseudo_p ()
2448 ? emit_move_insn (dest, src)
2449 : emit_move_insn_1 (dest, src));
2450 }
2451
2452 /* Apply UNOPTAB to OP and store the result in DEST. */
2453
2454 static void
2455 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2456 {
2457 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2458 if (dest != tmp)
2459 emit_move_insn (dest, tmp);
2460 }
2461
2462 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2463
2464 static void
2465 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2466 {
2467 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2468 OPTAB_DIRECT);
2469 if (dest != tmp)
2470 emit_move_insn (dest, tmp);
2471 }
2472
2473 /* Split a 128-bit move operation into two 64-bit move operations,
2474 taking care to handle partial overlap of register to register
2475 copies. Special cases are needed when moving between GP regs and
2476 FP regs. SRC can be a register, constant or memory; DST a register
2477 or memory. If either operand is memory it must not have any side
2478 effects. */
2479 void
2480 aarch64_split_128bit_move (rtx dst, rtx src)
2481 {
2482 rtx dst_lo, dst_hi;
2483 rtx src_lo, src_hi;
2484
2485 machine_mode mode = GET_MODE (dst);
2486
2487 gcc_assert (mode == TImode || mode == TFmode);
2488 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2489 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2490
2491 if (REG_P (dst) && REG_P (src))
2492 {
2493 int src_regno = REGNO (src);
2494 int dst_regno = REGNO (dst);
2495
2496 /* Handle FP <-> GP regs. */
2497 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2498 {
2499 src_lo = gen_lowpart (word_mode, src);
2500 src_hi = gen_highpart (word_mode, src);
2501
2502 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2503 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2504 return;
2505 }
2506 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2507 {
2508 dst_lo = gen_lowpart (word_mode, dst);
2509 dst_hi = gen_highpart (word_mode, dst);
2510
2511 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2512 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2513 return;
2514 }
2515 }
2516
2517 dst_lo = gen_lowpart (word_mode, dst);
2518 dst_hi = gen_highpart (word_mode, dst);
2519 src_lo = gen_lowpart (word_mode, src);
2520 src_hi = gen_highpart_mode (word_mode, mode, src);
2521
2522 /* At most one pairing may overlap. */
2523 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2524 {
2525 aarch64_emit_move (dst_hi, src_hi);
2526 aarch64_emit_move (dst_lo, src_lo);
2527 }
2528 else
2529 {
2530 aarch64_emit_move (dst_lo, src_lo);
2531 aarch64_emit_move (dst_hi, src_hi);
2532 }
2533 }
2534
2535 bool
2536 aarch64_split_128bit_move_p (rtx dst, rtx src)
2537 {
2538 return (! REG_P (src)
2539 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2540 }
2541
2542 /* Split a complex SIMD combine. */
2543
2544 void
2545 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2546 {
2547 machine_mode src_mode = GET_MODE (src1);
2548 machine_mode dst_mode = GET_MODE (dst);
2549
2550 gcc_assert (VECTOR_MODE_P (dst_mode));
2551 gcc_assert (register_operand (dst, dst_mode)
2552 && register_operand (src1, src_mode)
2553 && register_operand (src2, src_mode));
2554
2555 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2556 return;
2557 }
2558
2559 /* Split a complex SIMD move. */
2560
2561 void
2562 aarch64_split_simd_move (rtx dst, rtx src)
2563 {
2564 machine_mode src_mode = GET_MODE (src);
2565 machine_mode dst_mode = GET_MODE (dst);
2566
2567 gcc_assert (VECTOR_MODE_P (dst_mode));
2568
2569 if (REG_P (dst) && REG_P (src))
2570 {
2571 gcc_assert (VECTOR_MODE_P (src_mode));
2572 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2573 }
2574 }
2575
2576 bool
2577 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2578 machine_mode ymode, rtx y)
2579 {
2580 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2581 gcc_assert (r != NULL);
2582 return rtx_equal_p (x, r);
2583 }
2584
2585
2586 /* Return TARGET if it is nonnull and a register of mode MODE.
2587 Otherwise, return a fresh register of mode MODE if we can,
2588 or TARGET reinterpreted as MODE if we can't. */
2589
2590 static rtx
2591 aarch64_target_reg (rtx target, machine_mode mode)
2592 {
2593 if (target && REG_P (target) && GET_MODE (target) == mode)
2594 return target;
2595 if (!can_create_pseudo_p ())
2596 {
2597 gcc_assert (target);
2598 return gen_lowpart (mode, target);
2599 }
2600 return gen_reg_rtx (mode);
2601 }
2602
2603 /* Return a register that contains the constant in BUILDER, given that
2604 the constant is a legitimate move operand. Use TARGET as the register
2605 if it is nonnull and convenient. */
2606
2607 static rtx
2608 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2609 {
2610 rtx src = builder.build ();
2611 target = aarch64_target_reg (target, GET_MODE (src));
2612 emit_insn (gen_rtx_SET (target, src));
2613 return target;
2614 }
2615
2616 static rtx
2617 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2618 {
2619 if (can_create_pseudo_p ())
2620 return force_reg (mode, value);
2621 else
2622 {
2623 gcc_assert (x);
2624 aarch64_emit_move (x, value);
2625 return x;
2626 }
2627 }
2628
2629 /* Return true if predicate value X is a constant in which every element
2630 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2631 value, i.e. as a predicate in which all bits are significant. */
2632
2633 static bool
2634 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2635 {
2636 if (GET_CODE (x) != CONST_VECTOR)
2637 return false;
2638
2639 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2640 GET_MODE_NUNITS (GET_MODE (x)));
2641 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2642 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2643 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2644
2645 unsigned int nelts = const_vector_encoded_nelts (x);
2646 for (unsigned int i = 0; i < nelts; ++i)
2647 {
2648 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2649 if (!CONST_INT_P (elt))
2650 return false;
2651
2652 builder.quick_push (elt);
2653 for (unsigned int j = 1; j < factor; ++j)
2654 builder.quick_push (const0_rtx);
2655 }
2656 builder.finalize ();
2657 return true;
2658 }
2659
2660 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2661 widest predicate element size it can have (that is, the largest size
2662 for which each element would still be 0 or 1). */
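/* For example, if the constant has four patterns and the only nonzero
   encoded elements sit at indices that are multiples of 4, each group
   of four bits behaves as a single 0-or-1 element, so the result is 4;
   a nonzero element at any odd index forces a result of 1.  */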
2663
2664 unsigned int
2665 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2666 {
2667 /* Start with the most optimistic assumption: that we only need
2668 one bit per pattern. This is what we will use if only the first
2669 bit in each pattern is ever set. */
2670 unsigned int mask = GET_MODE_SIZE (DImode);
2671 mask |= builder.npatterns ();
2672
2673 /* Look for set bits. */
2674 unsigned int nelts = builder.encoded_nelts ();
2675 for (unsigned int i = 1; i < nelts; ++i)
2676 if (INTVAL (builder.elt (i)) != 0)
2677 {
2678 if (i & 1)
2679 return 1;
2680 mask |= i;
2681 }
2682 return mask & -mask;
2683 }
2684
2685 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2686 that the constant would have with predicate element size ELT_SIZE
2687 (ignoring the upper bits in each element) and return:
2688
2689 * -1 if all bits are set
2690 * N if the predicate has N leading set bits followed by all clear bits
2691 * 0 if the predicate does not have any of these forms. */
2692
2693 int
2694 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2695 unsigned int elt_size)
2696 {
2697 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2698 followed by set bits. */
2699 if (builder.nelts_per_pattern () == 3)
2700 return 0;
2701
2702 /* Skip over leading set bits. */
2703 unsigned int nelts = builder.encoded_nelts ();
2704 unsigned int i = 0;
2705 for (; i < nelts; i += elt_size)
2706 if (INTVAL (builder.elt (i)) == 0)
2707 break;
2708 unsigned int vl = i / elt_size;
2709
2710 /* Check for the all-true case. */
2711 if (i == nelts)
2712 return -1;
2713
2714 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2715 repeating pattern of set bits followed by clear bits. */
2716 if (builder.nelts_per_pattern () != 2)
2717 return 0;
2718
2719 /* We have a "foreground" value and a duplicated "background" value.
2720 If the background might repeat and the last set bit belongs to it,
2721 we might have set bits followed by clear bits followed by set bits. */
2722 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2723 return 0;
2724
2725 /* Make sure that the rest are all clear. */
2726 for (; i < nelts; i += elt_size)
2727 if (INTVAL (builder.elt (i)) != 0)
2728 return 0;
2729
2730 return vl;
2731 }
2732
2733 /* See if there is an svpattern that encodes an SVE predicate of mode
2734 PRED_MODE in which the first VL bits are set and the rest are clear.
2735 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2736 A VL of -1 indicates an all-true vector. */
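/* For example, a VL of 6 maps to AARCH64_SV_VL6 and a VL of 64 maps to
   AARCH64_SV_VL64, whereas a VL such as 200 only matches if the mode has
   a constant number of elements and one of the MUL3, MUL4, POW2 or ALL
   patterns happens to describe it.  */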
2737
2738 aarch64_svpattern
2739 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2740 {
2741 if (vl < 0)
2742 return AARCH64_SV_ALL;
2743
2744 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2745 return AARCH64_NUM_SVPATTERNS;
2746
2747 if (vl >= 1 && vl <= 8)
2748 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2749
2750 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2751 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2752
2753 int max_vl;
2754 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2755 {
2756 if (vl == (max_vl / 3) * 3)
2757 return AARCH64_SV_MUL3;
2758 /* These would only trigger for non-power-of-2 lengths. */
2759 if (vl == (max_vl & -4))
2760 return AARCH64_SV_MUL4;
2761 if (vl == (1 << floor_log2 (max_vl)))
2762 return AARCH64_SV_POW2;
2763 if (vl == max_vl)
2764 return AARCH64_SV_ALL;
2765 }
2766 return AARCH64_NUM_SVPATTERNS;
2767 }
2768
2769 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2770 bits has the lowest bit set and the upper bits clear. This is the
2771 VNx16BImode equivalent of a PTRUE for controlling elements of
2772 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2773 all bits are significant, even the upper zeros. */
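/* For example, aarch64_ptrue_all (4) builds the repeating bit pattern
   1, 0, 0, 0, ..., which is the VNx16BI image of a PTRUE that controls
   32-bit elements.  */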
2774
2775 rtx
2776 aarch64_ptrue_all (unsigned int elt_size)
2777 {
2778 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2779 builder.quick_push (const1_rtx);
2780 for (unsigned int i = 1; i < elt_size; ++i)
2781 builder.quick_push (const0_rtx);
2782 return builder.build ();
2783 }
2784
2785 /* Return an all-true predicate register of mode MODE. */
2786
2787 rtx
2788 aarch64_ptrue_reg (machine_mode mode)
2789 {
2790 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2791 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2792 return gen_lowpart (mode, reg);
2793 }
2794
2795 /* Return an all-false predicate register of mode MODE. */
2796
2797 rtx
2798 aarch64_pfalse_reg (machine_mode mode)
2799 {
2800 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2801 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2802 return gen_lowpart (mode, reg);
2803 }
2804
2805 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2806 true, or alternatively if we know that the operation predicated by
2807 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2808 aarch64_sve_gp_strictness operand that describes the operation
2809 predicated by PRED1[0]. */
2810
2811 bool
2812 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2813 {
2814 machine_mode mode = GET_MODE (pred2);
2815 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2816 && mode == GET_MODE (pred1[0])
2817 && aarch64_sve_gp_strictness (pred1[1], SImode));
2818 return (pred1[0] == CONSTM1_RTX (mode)
2819 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2820 || rtx_equal_p (pred1[0], pred2));
2821 }
2822
2823 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2824 for it. PRED2[0] is the predicate for the instruction whose result
2825 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2826 for it. Return true if we can prove that the two predicates are
2827 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2828 with PRED1[0] without changing behavior. */
2829
2830 bool
2831 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2832 {
2833 machine_mode mode = GET_MODE (pred1[0]);
2834 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2835 && mode == GET_MODE (pred2[0])
2836 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2837 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2838
2839 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2840 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2841 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2842 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2843 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2844 }
2845
2846 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2847 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2848 Use TARGET as the target register if nonnull and convenient. */
2849
2850 static rtx
2851 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2852 machine_mode data_mode, rtx op1, rtx op2)
2853 {
2854 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2855 expand_operand ops[5];
2856 create_output_operand (&ops[0], target, pred_mode);
2857 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2858 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2859 create_input_operand (&ops[3], op1, data_mode);
2860 create_input_operand (&ops[4], op2, data_mode);
2861 expand_insn (icode, 5, ops);
2862 return ops[0].value;
2863 }
2864
2865 /* Use a comparison to convert integer vector SRC into MODE, which is
2866 the corresponding SVE predicate mode. Use TARGET for the result
2867 if it's nonnull and convenient. */
2868
2869 static rtx
2870 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2871 {
2872 machine_mode src_mode = GET_MODE (src);
2873 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2874 src, CONST0_RTX (src_mode));
2875 }
2876
2877 /* Return true if we can move VALUE into a register using a single
2878 CNT[BHWD] instruction. */
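/* For example, the poly_int64 (2, 2) -- two elements per 128-bit
   quadword -- can be loaded with a plain CNTD, and (6, 6) with a CNTD
   that uses MUL #3.  A coefficient of 34 would be rejected because it
   would need a multiplier greater than 16 for every element size.  */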
2879
2880 static bool
2881 aarch64_sve_cnt_immediate_p (poly_int64 value)
2882 {
2883 HOST_WIDE_INT factor = value.coeffs[0];
2884 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2885 return (value.coeffs[1] == factor
2886 && IN_RANGE (factor, 2, 16 * 16)
2887 && (factor & 1) == 0
2888 && factor <= 16 * (factor & -factor));
2889 }
2890
2891 /* Likewise for rtx X. */
2892
2893 bool
2894 aarch64_sve_cnt_immediate_p (rtx x)
2895 {
2896 poly_int64 value;
2897 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2898 }
2899
2900 /* Return the asm string for an instruction with a CNT-like vector size
2901 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2902 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2903 first part of the operands template (the part that comes before the
2904 vector size itself). FACTOR is the multiplier times the number of elements in each quadword.
2905 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2906 If it is zero, we can use any element size. */
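/* For instance, FACTOR == 2 with NELTS_PER_VQ == 0 is expected to give a
   plain "cntd" (multiplier 1), while FACTOR == 6 with NELTS_PER_VQ == 2
   is expected to give "cntd\t..., all, mul #3".  */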
2907
2908 static char *
2909 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2910 unsigned int factor,
2911 unsigned int nelts_per_vq)
2912 {
2913 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2914
2915 if (nelts_per_vq == 0)
2916 /* There is some overlap in the ranges of the four CNT instructions.
2917 Here we always use the smallest possible element size, so that the
2918 multiplier is 1 wherever possible. */
2919 nelts_per_vq = factor & -factor;
2920 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2921 gcc_assert (IN_RANGE (shift, 1, 4));
2922 char suffix = "dwhb"[shift - 1];
2923
2924 factor >>= shift;
2925 unsigned int written;
2926 if (factor == 1)
2927 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2928 prefix, suffix, operands);
2929 else
2930 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2931 prefix, suffix, operands, factor);
2932 gcc_assert (written < sizeof (buffer));
2933 return buffer;
2934 }
2935
2936 /* Return the asm string for an instruction with a CNT-like vector size
2937 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2938 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2939 first part of the operands template (the part that comes before the
2940 vector size itself). X is the value of the vector size operand,
2941 as a polynomial integer rtx. */
2942
2943 char *
2944 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2945 rtx x)
2946 {
2947 poly_int64 value = rtx_to_poly_int64 (x);
2948 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2949 return aarch64_output_sve_cnt_immediate (prefix, operands,
2950 value.coeffs[1], 0);
2951 }
2952
2953 /* Return true if we can add VALUE to a register using a single ADDVL
2954 or ADDPL instruction. */
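/* For example, one full vector of bytes is the poly_int64 (16, 16),
   which a single ADDVL can add, while one predicate width is (2, 2),
   which a single ADDPL can add.  */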
2955
2956 static bool
2957 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2958 {
2959 HOST_WIDE_INT factor = value.coeffs[0];
2960 if (factor == 0 || value.coeffs[1] != factor)
2961 return false;
2962 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2963 and a value of 16 is one vector width. */
2964 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2965 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2966 }
2967
2968 /* Likewise for rtx X. */
2969
2970 bool
2971 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2972 {
2973 poly_int64 value;
2974 return (poly_int_rtx_p (x, &value)
2975 && aarch64_sve_addvl_addpl_immediate_p (value));
2976 }
2977
2978 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2979 and storing the result in operand 0. */
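/* For example, an OFFSET of (32, 32) is expected to produce
   "addvl\t%x0, %x1, #2" and an OFFSET of (-2, -2) to produce
   "addpl\t%x0, %x1, #-1", unless the INC/DEC form below applies.  */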
2980
2981 char *
2982 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2983 {
2984 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2985 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2986 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2987
2988 /* Use INC or DEC if possible. */
2989 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2990 {
2991 if (aarch64_sve_cnt_immediate_p (offset_value))
2992 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2993 offset_value.coeffs[1], 0);
2994 if (aarch64_sve_cnt_immediate_p (-offset_value))
2995 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2996 -offset_value.coeffs[1], 0);
2997 }
2998
2999 int factor = offset_value.coeffs[1];
3000 if ((factor & 15) == 0)
3001 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3002 else
3003 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3004 return buffer;
3005 }
3006
3007 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3008 instruction. If it is, store the number of elements in each vector
3009 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3010 factor in *FACTOR_OUT (if nonnull). */
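/* For example, a VNx4SI constant in which every element is the
   poly_int64 (4, 4) has NELTS_PER_VQ == 4 and a factor of 4, which a
   single INCW can add; a factor of 68 would be rejected because it
   would need a multiplier greater than 16.  */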
3011
3012 bool
3013 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
3014 unsigned int *nelts_per_vq_out)
3015 {
3016 rtx elt;
3017 poly_int64 value;
3018
3019 if (!const_vec_duplicate_p (x, &elt)
3020 || !poly_int_rtx_p (elt, &value))
3021 return false;
3022
3023 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3024 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3025 /* There's no vector INCB. */
3026 return false;
3027
3028 HOST_WIDE_INT factor = value.coeffs[0];
3029 if (value.coeffs[1] != factor)
3030 return false;
3031
3032 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3033 if ((factor % nelts_per_vq) != 0
3034 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3035 return false;
3036
3037 if (factor_out)
3038 *factor_out = factor;
3039 if (nelts_per_vq_out)
3040 *nelts_per_vq_out = nelts_per_vq;
3041 return true;
3042 }
3043
3044 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3045 instruction. */
3046
3047 bool
3048 aarch64_sve_inc_dec_immediate_p (rtx x)
3049 {
3050 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3051 }
3052
3053 /* Return the asm template for an SVE vector INC or DEC instruction.
3054 OPERANDS gives the operands before the vector count and X is the
3055 value of the vector count operand itself. */
3056
3057 char *
3058 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3059 {
3060 int factor;
3061 unsigned int nelts_per_vq;
3062 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3063 gcc_unreachable ();
3064 if (factor < 0)
3065 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3066 nelts_per_vq);
3067 else
3068 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3069 nelts_per_vq);
3070 }
3071
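/* Return the number of instructions needed to move immediate IMM of mode
   MODE into DEST, emitting the instructions if GENERATE is true.  As an
   illustration, 0x1234000056780000 (assuming it is not a valid bitmask
   immediate) can be built as a MOV of 0x56780000 followed by a MOVK of
   0x1234 into bits [63:48], i.e. two instructions.  */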
3072 static int
3073 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3074 scalar_int_mode mode)
3075 {
3076 int i;
3077 unsigned HOST_WIDE_INT val, val2, mask;
3078 int one_match, zero_match;
3079 int num_insns;
3080
3081 val = INTVAL (imm);
3082
3083 if (aarch64_move_imm (val, mode))
3084 {
3085 if (generate)
3086 emit_insn (gen_rtx_SET (dest, imm));
3087 return 1;
3088 }
3089
3090 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3091 (with XXXX non-zero). In that case check to see if the move can be done in
3092 a smaller mode. */
3093 val2 = val & 0xffffffff;
3094 if (mode == DImode
3095 && aarch64_move_imm (val2, SImode)
3096 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3097 {
3098 if (generate)
3099 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3100
3101 /* Check if we have to emit a second instruction by checking to see
3102 if any of the upper 32 bits of the original DI mode value is set. */
3103 if (val == val2)
3104 return 1;
3105
3106 i = (val >> 48) ? 48 : 32;
3107
3108 if (generate)
3109 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3110 GEN_INT ((val >> i) & 0xffff)));
3111
3112 return 2;
3113 }
3114
3115 if ((val >> 32) == 0 || mode == SImode)
3116 {
3117 if (generate)
3118 {
3119 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3120 if (mode == SImode)
3121 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3122 GEN_INT ((val >> 16) & 0xffff)));
3123 else
3124 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3125 GEN_INT ((val >> 16) & 0xffff)));
3126 }
3127 return 2;
3128 }
3129
3130 /* Remaining cases are all for DImode. */
3131
3132 mask = 0xffff;
3133 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3134 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3135 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3136 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3137
3138 if (zero_match != 2 && one_match != 2)
3139 {
3140 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3141 For a 64-bit bitmask try whether changing 16 bits to all ones or
3142 zeroes creates a valid bitmask. To check any repeated bitmask,
3143 try using 16 bits from the other 32-bit half of val. */
3144
3145 for (i = 0; i < 64; i += 16, mask <<= 16)
3146 {
3147 val2 = val & ~mask;
3148 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3149 break;
3150 val2 = val | mask;
3151 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3152 break;
3153 val2 = val2 & ~mask;
3154 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3155 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3156 break;
3157 }
3158 if (i != 64)
3159 {
3160 if (generate)
3161 {
3162 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3163 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3164 GEN_INT ((val >> i) & 0xffff)));
3165 }
3166 return 2;
3167 }
3168 }
3169
3170 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3171 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3172 otherwise skip zero bits. */
3173
3174 num_insns = 1;
3175 mask = 0xffff;
3176 val2 = one_match > zero_match ? ~val : val;
3177 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3178
3179 if (generate)
3180 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3181 ? (val | ~(mask << i))
3182 : (val & (mask << i)))));
3183 for (i += 16; i < 64; i += 16)
3184 {
3185 if ((val2 & (mask << i)) == 0)
3186 continue;
3187 if (generate)
3188 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3189 GEN_INT ((val >> i) & 0xffff)));
3190 num_insns ++;
3191 }
3192
3193 return num_insns;
3194 }
3195
3196 /* Return whether imm is a 128-bit immediate which is simple enough to
3197 expand inline. */
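/* For a CONST_WIDE_INT, this amounts to checking that the two 64-bit
   halves can together be built in at most four instructions.  */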
3198 bool
3199 aarch64_mov128_immediate (rtx imm)
3200 {
3201 if (GET_CODE (imm) == CONST_INT)
3202 return true;
3203
3204 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3205
3206 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3207 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3208
3209 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3210 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3211 }
3212
3213
3214 /* Return the number of temporary registers that aarch64_add_offset_1
3215 would need to add OFFSET to a register. */
3216
3217 static unsigned int
3218 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3219 {
3220 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3221 }
3222
3223 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3224 a non-polynomial OFFSET. MODE is the mode of the addition.
3225 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3226 be set and CFA adjustments added to the generated instructions.
3227
3228 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3229 temporary if register allocation is already complete. This temporary
3230 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3231 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3232 the immediate again.
3233
3234 Since this function may be used to adjust the stack pointer, we must
3235 ensure that it cannot cause transient stack deallocation (for example
3236 by first incrementing SP and then decrementing when adjusting by a
3237 large immediate). */
3238
3239 static void
3240 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3241 rtx src, HOST_WIDE_INT offset, rtx temp1,
3242 bool frame_related_p, bool emit_move_imm)
3243 {
3244 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3245 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3246
3247 HOST_WIDE_INT moffset = abs_hwi (offset);
3248 rtx_insn *insn;
3249
3250 if (!moffset)
3251 {
3252 if (!rtx_equal_p (dest, src))
3253 {
3254 insn = emit_insn (gen_rtx_SET (dest, src));
3255 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3256 }
3257 return;
3258 }
3259
3260 /* Single instruction adjustment. */
3261 if (aarch64_uimm12_shift (moffset))
3262 {
3263 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3264 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3265 return;
3266 }
3267
3268 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3269 and either:
3270
3271 a) the offset cannot be loaded by a 16-bit move or
3272 b) there is no spare register into which we can move it. */
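/* For example, an offset of 0x123456 (assuming it is not a valid
   bitmask immediate) can be added as an addition of 0x456 followed by
   an addition of 0x123000 (0x123 shifted left by 12), neither of which
   needs a scratch register.  */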
3273 if (moffset < 0x1000000
3274 && ((!temp1 && !can_create_pseudo_p ())
3275 || !aarch64_move_imm (moffset, mode)))
3276 {
3277 HOST_WIDE_INT low_off = moffset & 0xfff;
3278
3279 low_off = offset < 0 ? -low_off : low_off;
3280 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3281 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3282 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3283 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3284 return;
3285 }
3286
3287 /* Emit a move immediate if required and an addition/subtraction. */
3288 if (emit_move_imm)
3289 {
3290 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3291 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3292 }
3293 insn = emit_insn (offset < 0
3294 ? gen_sub3_insn (dest, src, temp1)
3295 : gen_add3_insn (dest, src, temp1));
3296 if (frame_related_p)
3297 {
3298 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3299 rtx adj = plus_constant (mode, src, offset);
3300 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3301 }
3302 }
3303
3304 /* Return the number of temporary registers that aarch64_add_offset
3305 would need to move OFFSET into a register or add OFFSET to a register;
3306 ADD_P is true if we want the latter rather than the former. */
3307
3308 static unsigned int
3309 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3310 {
3311 /* This follows the same structure as aarch64_add_offset. */
3312 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3313 return 0;
3314
3315 unsigned int count = 0;
3316 HOST_WIDE_INT factor = offset.coeffs[1];
3317 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3318 poly_int64 poly_offset (factor, factor);
3319 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3320 /* Need one register for the ADDVL/ADDPL result. */
3321 count += 1;
3322 else if (factor != 0)
3323 {
3324 factor = abs (factor);
3325 if (factor > 16 * (factor & -factor))
3326 /* Need one register for the CNT result and one for the multiplication
3327 factor. If necessary, the second temporary can be reused for the
3328 constant part of the offset. */
3329 return 2;
3330 /* Need one register for the CNT result (which might then
3331 be shifted). */
3332 count += 1;
3333 }
3334 return count + aarch64_add_offset_1_temporaries (constant);
3335 }
3336
3337 /* If X can be represented as a poly_int64, return the number
3338 of temporaries that are required to add it to a register.
3339 Return -1 otherwise. */
3340
3341 int
3342 aarch64_add_offset_temporaries (rtx x)
3343 {
3344 poly_int64 offset;
3345 if (!poly_int_rtx_p (x, &offset))
3346 return -1;
3347 return aarch64_offset_temporaries (true, offset);
3348 }
3349
3350 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3351 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3352 be set and CFA adjustments added to the generated instructions.
3353
3354 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3355 temporary if register allocation is already complete. This temporary
3356 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3357 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3358 false to avoid emitting the immediate again.
3359
3360 TEMP2, if nonnull, is a second temporary register that doesn't
3361 overlap either DEST or SRC.
3362
3363 Since this function may be used to adjust the stack pointer, we must
3364 ensure that it cannot cause transient stack deallocation (for example
3365 by first incrementing SP and then decrementing when adjusting by a
3366 large immediate). */
3367
3368 static void
3369 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3370 poly_int64 offset, rtx temp1, rtx temp2,
3371 bool frame_related_p, bool emit_move_imm = true)
3372 {
3373 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3374 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3375 gcc_assert (temp1 == NULL_RTX
3376 || !frame_related_p
3377 || !reg_overlap_mentioned_p (temp1, dest));
3378 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3379
3380 /* Try using ADDVL or ADDPL to add the whole value. */
3381 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3382 {
3383 rtx offset_rtx = gen_int_mode (offset, mode);
3384 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3385 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3386 return;
3387 }
3388
3389 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3390 SVE vector register, over and above the minimum size of 128 bits.
3391 This is equivalent to half the value returned by CNTD with a
3392 vector shape of ALL. */
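/* For example, an OFFSET of (20, 16) splits into a VG-based part of
   (16, 16) -- one full vector of bytes -- and a constant part of 4.  */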
3393 HOST_WIDE_INT factor = offset.coeffs[1];
3394 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3395
3396 /* Try using ADDVL or ADDPL to add the VG-based part. */
3397 poly_int64 poly_offset (factor, factor);
3398 if (src != const0_rtx
3399 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3400 {
3401 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3402 if (frame_related_p)
3403 {
3404 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3405 RTX_FRAME_RELATED_P (insn) = true;
3406 src = dest;
3407 }
3408 else
3409 {
3410 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3411 src = aarch64_force_temporary (mode, temp1, addr);
3412 temp1 = temp2;
3413 temp2 = NULL_RTX;
3414 }
3415 }
3416 /* Otherwise use a CNT-based sequence. */
3417 else if (factor != 0)
3418 {
3419 /* Use a subtraction if we have a negative factor. */
3420 rtx_code code = PLUS;
3421 if (factor < 0)
3422 {
3423 factor = -factor;
3424 code = MINUS;
3425 }
3426
3427 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3428 into the multiplication. */
3429 rtx val;
3430 int shift = 0;
3431 if (factor & 1)
3432 /* Use a right shift by 1. */
3433 shift = -1;
3434 else
3435 factor /= 2;
3436 HOST_WIDE_INT low_bit = factor & -factor;
3437 if (factor <= 16 * low_bit)
3438 {
3439 if (factor > 16 * 8)
3440 {
3441 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3442 the value with the minimum multiplier and shift it into
3443 position. */
3444 int extra_shift = exact_log2 (low_bit);
3445 shift += extra_shift;
3446 factor >>= extra_shift;
3447 }
3448 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3449 }
3450 else
3451 {
3452 /* Use CNTD, then multiply it by FACTOR. */
3453 val = gen_int_mode (poly_int64 (2, 2), mode);
3454 val = aarch64_force_temporary (mode, temp1, val);
3455
3456 /* Go back to using a negative multiplication factor if we have
3457 no register from which to subtract. */
3458 if (code == MINUS && src == const0_rtx)
3459 {
3460 factor = -factor;
3461 code = PLUS;
3462 }
3463 rtx coeff1 = gen_int_mode (factor, mode);
3464 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3465 val = gen_rtx_MULT (mode, val, coeff1);
3466 }
3467
3468 if (shift > 0)
3469 {
3470 /* Multiply by 1 << SHIFT. */
3471 val = aarch64_force_temporary (mode, temp1, val);
3472 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3473 }
3474 else if (shift == -1)
3475 {
3476 /* Divide by 2. */
3477 val = aarch64_force_temporary (mode, temp1, val);
3478 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3479 }
3480
3481 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3482 if (src != const0_rtx)
3483 {
3484 val = aarch64_force_temporary (mode, temp1, val);
3485 val = gen_rtx_fmt_ee (code, mode, src, val);
3486 }
3487 else if (code == MINUS)
3488 {
3489 val = aarch64_force_temporary (mode, temp1, val);
3490 val = gen_rtx_NEG (mode, val);
3491 }
3492
3493 if (constant == 0 || frame_related_p)
3494 {
3495 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3496 if (frame_related_p)
3497 {
3498 RTX_FRAME_RELATED_P (insn) = true;
3499 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3500 gen_rtx_SET (dest, plus_constant (Pmode, src,
3501 poly_offset)));
3502 }
3503 src = dest;
3504 if (constant == 0)
3505 return;
3506 }
3507 else
3508 {
3509 src = aarch64_force_temporary (mode, temp1, val);
3510 temp1 = temp2;
3511 temp2 = NULL_RTX;
3512 }
3513
3514 emit_move_imm = true;
3515 }
3516
3517 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3518 frame_related_p, emit_move_imm);
3519 }
3520
3521 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3522 than a poly_int64. */
3523
3524 void
3525 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3526 rtx offset_rtx, rtx temp1, rtx temp2)
3527 {
3528 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3529 temp1, temp2, false);
3530 }
3531
3532 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3533 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3534 if TEMP1 already contains abs (DELTA). */
3535
3536 static inline void
3537 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3538 {
3539 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3540 temp1, temp2, true, emit_move_imm);
3541 }
3542
3543 /* Subtract DELTA from the stack pointer, marking the instructions
3544 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3545 if nonnull. */
3546
3547 static inline void
3548 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3549 bool emit_move_imm = true)
3550 {
3551 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3552 temp1, temp2, frame_related_p, emit_move_imm);
3553 }
3554
3555 /* Set DEST to (vec_series BASE STEP). */
3556
3557 static void
3558 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3559 {
3560 machine_mode mode = GET_MODE (dest);
3561 scalar_mode inner = GET_MODE_INNER (mode);
3562
3563 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3564 if (!aarch64_sve_index_immediate_p (base))
3565 base = force_reg (inner, base);
3566 if (!aarch64_sve_index_immediate_p (step))
3567 step = force_reg (inner, step);
3568
3569 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3570 }
3571
3572 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3573 register of mode MODE. Use TARGET for the result if it's nonnull
3574 and convenient.
3575
3576 The two vector modes must have the same element mode. The behavior
3577 is to duplicate architectural lane N of SRC into architectural lanes
3578 N + I * STEP of the result. On big-endian targets, architectural
3579 lane 0 of an Advanced SIMD vector is the last element of the vector
3580 in memory layout, so for big-endian targets this operation has the
3581 effect of reversing SRC before duplicating it. Callers need to
3582 account for this. */
3583
3584 rtx
3585 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3586 {
3587 machine_mode src_mode = GET_MODE (src);
3588 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3589 insn_code icode = (BYTES_BIG_ENDIAN
3590 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3591 : code_for_aarch64_vec_duplicate_vq_le (mode));
3592
3593 unsigned int i = 0;
3594 expand_operand ops[3];
3595 create_output_operand (&ops[i++], target, mode);
3596 create_input_operand (&ops[i++], src, src_mode);
3597 if (BYTES_BIG_ENDIAN)
3598 {
3599 /* Create a PARALLEL describing the reversal of SRC. */
3600 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3601 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3602 nelts_per_vq - 1, -1);
3603 create_fixed_operand (&ops[i++], sel);
3604 }
3605 expand_insn (icode, i, ops);
3606 return ops[0].value;
3607 }
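/* For illustration only (operands hypothetical): duplicating a V4SI value
   that is already in a SIMD register across a VNx4SI SVE register
   typically assembles to a quadword duplicate, roughly:

     dup	z0.q, z0.q[0]	// lane N -> lanes N, N + 4, N + 8, ...

   matching the lane mapping in the comment above; the _be pattern folds
   in the extra lane reversal described by SEL.  */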
3608
3609 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3610 the memory image into DEST. Return true on success. */
3611
3612 static bool
3613 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3614 {
3615 src = force_const_mem (GET_MODE (src), src);
3616 if (!src)
3617 return false;
3618
3619 /* Make sure that the address is legitimate. */
3620 if (!aarch64_sve_ld1rq_operand_p (src))
3621 {
3622 rtx addr = force_reg (Pmode, XEXP (src, 0));
3623 src = replace_equiv_address (src, addr);
3624 }
3625
3626 machine_mode mode = GET_MODE (dest);
3627 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3628 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3629 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3630 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3631 return true;
3632 }
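/* A hedged example (registers and addresses hypothetical): for a VNx4SI
   destination this loads the 16-byte constant-pool image and replicates
   it into every quadword of the SVE register, roughly:

     ld1rqw	z0.s, p0/z, [x0]	// x0 = address of the 128-bit image

   so memory lane N of the image ends up in architectural lanes
   N, N + 4, N + 8, ... as the caller expects.  */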
3633
3634 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3635 SVE data mode and isn't a legitimate constant. Use TARGET for the
3636 result if convenient.
3637
3638 The returned register can have whatever mode seems most natural
3639 given the contents of SRC. */
3640
3641 static rtx
3642 aarch64_expand_sve_const_vector (rtx target, rtx src)
3643 {
3644 machine_mode mode = GET_MODE (src);
3645 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3646 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3647 scalar_mode elt_mode = GET_MODE_INNER (mode);
3648 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3649 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3650
3651 if (nelts_per_pattern == 1 && encoded_bits == 128)
3652 {
3653 /* The constant is a duplicated quadword but can't be narrowed
3654 beyond a quadword. Get the memory image of the first quadword
3655 as a 128-bit vector and try using LD1RQ to load it from memory.
3656
3657 The effect for both endiannesses is to load memory lane N into
3658 architectural lanes N + I * STEP of the result. On big-endian
3659 targets, the layout of the 128-bit vector in an Advanced SIMD
3660 register would be different from its layout in an SVE register,
3661 but this 128-bit vector is a memory value only. */
3662 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3663 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3664 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3665 return target;
3666 }
3667
3668 if (nelts_per_pattern == 1 && encoded_bits < 128)
3669 {
3670 /* The vector is a repeating sequence of 64 bits or fewer.
3671 See if we can load them using an Advanced SIMD move and then
3672 duplicate it to fill a vector. This is better than using a GPR
3673 move because it keeps everything in the same register file. */
3674 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3675 rtx_vector_builder builder (vq_mode, npatterns, 1);
3676 for (unsigned int i = 0; i < npatterns; ++i)
3677 {
3678 /* We want memory lane N to go into architectural lane N,
3679 so reverse for big-endian targets. The DUP .Q pattern
3680 has a compensating reverse built-in. */
3681 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3682 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3683 }
3684 rtx vq_src = builder.build ();
3685 if (aarch64_simd_valid_immediate (vq_src, NULL))
3686 {
3687 vq_src = force_reg (vq_mode, vq_src);
3688 return aarch64_expand_sve_dupq (target, mode, vq_src);
3689 }
3690
3691 /* Get an integer representation of the repeating part of Advanced
3692 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3693 which for big-endian targets is lane-swapped wrt a normal
3694 Advanced SIMD vector. This means that for both endiannesses,
3695 memory lane N of SVE vector SRC corresponds to architectural
3696 lane N of a register holding VQ_SRC. This in turn means that
3697 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3698 as a single 128-bit value) and thus that memory lane 0 of SRC is
3699 in the lsb of the integer. Duplicating the integer therefore
3700 ensures that memory lane N of SRC goes into architectural lane
3701 N + I * STEP of the SVE register. */
3702 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3703 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3704 if (elt_value)
3705 {
3706 /* Pretend that we had a vector of INT_MODE to start with. */
3707 elt_mode = int_mode;
3708 mode = aarch64_full_sve_mode (int_mode).require ();
3709
3710 /* If the integer can be moved into a general register by a
3711 single instruction, do that and duplicate the result. */
3712 if (CONST_INT_P (elt_value)
3713 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3714 {
3715 elt_value = force_reg (elt_mode, elt_value);
3716 return expand_vector_broadcast (mode, elt_value);
3717 }
3718 }
3719 else if (npatterns == 1)
3720 /* We're duplicating a single value, but can't do better than
3721 force it to memory and load from there. This handles things
3722 like symbolic constants. */
3723 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3724
3725 if (elt_value)
3726 {
3727 /* Load the element from memory if we can, otherwise move it into
3728 a register and use a DUP. */
3729 rtx op = force_const_mem (elt_mode, elt_value);
3730 if (!op)
3731 op = force_reg (elt_mode, elt_value);
3732 return expand_vector_broadcast (mode, op);
3733 }
3734 }
3735
3736 /* Try using INDEX. */
3737 rtx base, step;
3738 if (const_vec_series_p (src, &base, &step))
3739 {
3740 aarch64_expand_vec_series (target, base, step);
3741 return target;
3742 }
3743
3744 /* From here on, it's better to force the whole constant to memory
3745 if we can. */
3746 if (GET_MODE_NUNITS (mode).is_constant ())
3747 return NULL_RTX;
3748
3749 /* Expand each pattern individually. */
3750 gcc_assert (npatterns > 1);
3751 rtx_vector_builder builder;
3752 auto_vec<rtx, 16> vectors (npatterns);
3753 for (unsigned int i = 0; i < npatterns; ++i)
3754 {
3755 builder.new_vector (mode, 1, nelts_per_pattern);
3756 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3757 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3758 vectors.quick_push (force_reg (mode, builder.build ()));
3759 }
3760
3761 /* Use permutes to interleave the separate vectors. */
3762 while (npatterns > 1)
3763 {
3764 npatterns /= 2;
3765 for (unsigned int i = 0; i < npatterns; ++i)
3766 {
3767 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3768 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3769 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3770 vectors[i] = tmp;
3771 }
3772 }
3773 gcc_assert (vectors[0] == target);
3774 return target;
3775 }
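/* A worked example of the final interleaving step (values hypothetical):
   with NPATTERNS == 2 and a variable-length constant { a, b, a, b, ... },
   the loop above builds the single-pattern vectors { a, a, ... } and
   { b, b, ... }, forces each into a register, and merges them with one
   ZIP1, whose result alternates lanes from its two inputs and so
   reproduces { a, b, a, b, ... }.  With four patterns, two ZIP1s at the
   narrower spacing are followed by one more at the wider spacing.  */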
3776
3777 /* Use WHILE to set a predicate register of mode MODE in which the first
3778 VL bits are set and the rest are clear. Use TARGET for the register
3779 if it's nonnull and convenient. */
3780
3781 static rtx
3782 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3783 unsigned int vl)
3784 {
3785 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3786 target = aarch64_target_reg (target, mode);
3787 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3788 return target;
3789 }
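/* An illustrative expansion (register numbers arbitrary): with
   MODE == VNx8BImode and VL == 3 this emits roughly:

     mov	x0, 3
     whilelo	p0.h, xzr, x0	// first three .H lanes active

   since WHILELO sets element I of the result for every I in [0, VL).  */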
3790
3791 static rtx
3792 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3793
3794 /* BUILDER is a constant predicate in which the index of every set bit
3795 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3796 by inverting every element at a multiple of ELT_SIZE and EORing the
3797 result with an ELT_SIZE PTRUE.
3798
3799 Return a register that contains the constant on success, otherwise
3800 return null. Use TARGET as the register if it is nonnull and
3801 convenient. */
3802
3803 static rtx
3804 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3805 unsigned int elt_size)
3806 {
3807 /* Invert every element at a multiple of ELT_SIZE, keeping the
3808 other bits zero. */
3809 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3810 builder.nelts_per_pattern ());
3811 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3812 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3813 inv_builder.quick_push (const1_rtx);
3814 else
3815 inv_builder.quick_push (const0_rtx);
3816 inv_builder.finalize ();
3817
3818 /* See if we can load the constant cheaply. */
3819 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3820 if (!inv)
3821 return NULL_RTX;
3822
3823 /* EOR the result with an ELT_SIZE PTRUE. */
3824 rtx mask = aarch64_ptrue_all (elt_size);
3825 mask = force_reg (VNx16BImode, mask);
3826 target = aarch64_target_reg (target, VNx16BImode);
3827 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3828 return target;
3829 }
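/* A worked example (purely illustrative): with ELT_SIZE == 2 and the .H
   constant { 0, 1, 1, 1, ... } (every lane active except lane 0), the
   inverted constant is { 1, 0, 0, 0, ... }, which is just a VL1 PTRUE
   and therefore cheap to load.  EORing that with an all-true .H PTRUE,
   with the odd-numbered bits zeroed by the governing predicate,
   recreates the original constant.  */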
3830
3831 /* BUILDER is a constant predicate in which the index of every set bit
3832 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3833 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3834 register on success, otherwise return null. Use TARGET as the register
3835 if nonnull and convenient. */
3836
3837 static rtx
3838 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3839 unsigned int elt_size,
3840 unsigned int permute_size)
3841 {
3842 /* We're going to split the constant into two new constants A and B,
3843 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3844 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3845
3846 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3847 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3848
3849 where _ indicates elements that will be discarded by the permute.
3850
3851 First calculate the ELT_SIZEs for A and B. */
3852 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3853 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3854 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3855 if (INTVAL (builder.elt (i)) != 0)
3856 {
3857 if (i & permute_size)
3858 b_elt_size |= i - permute_size;
3859 else
3860 a_elt_size |= i;
3861 }
3862 a_elt_size &= -a_elt_size;
3863 b_elt_size &= -b_elt_size;
3864
3865 /* Now construct the vectors themselves. */
3866 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3867 builder.nelts_per_pattern ());
3868 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3869 builder.nelts_per_pattern ());
3870 unsigned int nelts = builder.encoded_nelts ();
3871 for (unsigned int i = 0; i < nelts; ++i)
3872 if (i & (elt_size - 1))
3873 {
3874 a_builder.quick_push (const0_rtx);
3875 b_builder.quick_push (const0_rtx);
3876 }
3877 else if ((i & permute_size) == 0)
3878 {
3879 /* The A and B elements are significant. */
3880 a_builder.quick_push (builder.elt (i));
3881 b_builder.quick_push (builder.elt (i + permute_size));
3882 }
3883 else
3884 {
3885 /* The A and B elements are going to be discarded, so pick whatever
3886 is likely to give a nice constant. We are targeting element
3887 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3888 with the aim of each being a sequence of ones followed by
3889 a sequence of zeros. So:
3890
3891 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3892 duplicate the last X_ELT_SIZE element, to extend the
3893 current sequence of ones or zeros.
3894
3895 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3896 zero, so that the constant really does have X_ELT_SIZE and
3897 not a smaller size. */
3898 if (a_elt_size > permute_size)
3899 a_builder.quick_push (const0_rtx);
3900 else
3901 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3902 if (b_elt_size > permute_size)
3903 b_builder.quick_push (const0_rtx);
3904 else
3905 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3906 }
3907 a_builder.finalize ();
3908 b_builder.finalize ();
3909
3910 /* Try loading A into a register. */
3911 rtx_insn *last = get_last_insn ();
3912 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3913 if (!a)
3914 return NULL_RTX;
3915
3916 /* Try loading B into a register. */
3917 rtx b = a;
3918 if (a_builder != b_builder)
3919 {
3920 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3921 if (!b)
3922 {
3923 delete_insns_since (last);
3924 return NULL_RTX;
3925 }
3926 }
3927
3928 /* Emit the TRN1 itself. */
3929 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3930 target = aarch64_target_reg (target, mode);
3931 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3932 gen_lowpart (mode, a),
3933 gen_lowpart (mode, b)));
3934 return target;
3935 }
3936
3937 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3938 constant in BUILDER into an SVE predicate register. Return the register
3939 on success, otherwise return null. Use TARGET for the register if
3940 nonnull and convenient.
3941
3942 ALLOW_RECURSE_P is true if we can use methods that would call this
3943 function recursively. */
3944
3945 static rtx
3946 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3947 bool allow_recurse_p)
3948 {
3949 if (builder.encoded_nelts () == 1)
3950 /* A PFALSE or a PTRUE .B ALL. */
3951 return aarch64_emit_set_immediate (target, builder);
3952
3953 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3954 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3955 {
3956 /* If we can load the constant using PTRUE, use it as-is. */
3957 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3958 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3959 return aarch64_emit_set_immediate (target, builder);
3960
3961 /* Otherwise use WHILE to set the first VL bits. */
3962 return aarch64_sve_move_pred_via_while (target, mode, vl);
3963 }
3964
3965 if (!allow_recurse_p)
3966 return NULL_RTX;
3967
3968 /* Try inverting the vector in element size ELT_SIZE and then EORing
3969 the result with an ELT_SIZE PTRUE. */
3970 if (INTVAL (builder.elt (0)) == 0)
3971 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3972 elt_size))
3973 return res;
3974
3975 /* Try using TRN1 to permute two simpler constants. */
3976 for (unsigned int i = elt_size; i <= 8; i *= 2)
3977 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3978 elt_size, i))
3979 return res;
3980
3981 return NULL_RTX;
3982 }
3983
3984 /* Return an SVE predicate register that contains the VNx16BImode
3985 constant in BUILDER, without going through the move expanders.
3986
3987 The returned register can have whatever mode seems most natural
3988 given the contents of BUILDER. Use TARGET for the result if
3989 convenient. */
3990
3991 static rtx
3992 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3993 {
3994 /* Try loading the constant using pure predicate operations. */
3995 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3996 return res;
3997
3998 /* Try forcing the constant to memory. */
3999 if (builder.full_nelts ().is_constant ())
4000 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4001 {
4002 target = aarch64_target_reg (target, VNx16BImode);
4003 emit_move_insn (target, mem);
4004 return target;
4005 }
4006
4007 /* The last resort is to load the constant as an integer and then
4008 compare it against zero. Use -1 for set bits in order to increase
4009 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4010 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4011 builder.nelts_per_pattern ());
4012 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4013 int_builder.quick_push (INTVAL (builder.elt (i))
4014 ? constm1_rtx : const0_rtx);
4015 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4016 int_builder.build ());
4017 }
4018
4019 /* Set DEST to immediate IMM. */
4020
4021 void
4022 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4023 {
4024 machine_mode mode = GET_MODE (dest);
4025
4026 /* Check on what type of symbol it is. */
4027 scalar_int_mode int_mode;
4028 if ((GET_CODE (imm) == SYMBOL_REF
4029 || GET_CODE (imm) == LABEL_REF
4030 || GET_CODE (imm) == CONST
4031 || GET_CODE (imm) == CONST_POLY_INT)
4032 && is_a <scalar_int_mode> (mode, &int_mode))
4033 {
4034 rtx mem;
4035 poly_int64 offset;
4036 HOST_WIDE_INT const_offset;
4037 enum aarch64_symbol_type sty;
4038
4039 /* If we have (const (plus symbol offset)), separate out the offset
4040 before we start classifying the symbol. */
4041 rtx base = strip_offset (imm, &offset);
4042
4043 /* We must always add an offset involving VL separately, rather than
4044 folding it into the relocation. */
4045 if (!offset.is_constant (&const_offset))
4046 {
4047 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4048 emit_insn (gen_rtx_SET (dest, imm));
4049 else
4050 {
4051 /* Do arithmetic on 32-bit values if the result is smaller
4052 than that. */
4053 if (partial_subreg_p (int_mode, SImode))
4054 {
4055 /* It is invalid to do symbol calculations in modes
4056 narrower than SImode. */
4057 gcc_assert (base == const0_rtx);
4058 dest = gen_lowpart (SImode, dest);
4059 int_mode = SImode;
4060 }
4061 if (base != const0_rtx)
4062 {
4063 base = aarch64_force_temporary (int_mode, dest, base);
4064 aarch64_add_offset (int_mode, dest, base, offset,
4065 NULL_RTX, NULL_RTX, false);
4066 }
4067 else
4068 aarch64_add_offset (int_mode, dest, base, offset,
4069 dest, NULL_RTX, false);
4070 }
4071 return;
4072 }
4073
4074 sty = aarch64_classify_symbol (base, const_offset);
4075 switch (sty)
4076 {
4077 case SYMBOL_FORCE_TO_MEM:
4078 if (const_offset != 0
4079 && targetm.cannot_force_const_mem (int_mode, imm))
4080 {
4081 gcc_assert (can_create_pseudo_p ());
4082 base = aarch64_force_temporary (int_mode, dest, base);
4083 aarch64_add_offset (int_mode, dest, base, const_offset,
4084 NULL_RTX, NULL_RTX, false);
4085 return;
4086 }
4087
4088 mem = force_const_mem (ptr_mode, imm);
4089 gcc_assert (mem);
4090
4091 /* If we aren't generating PC relative literals, then
4092 we need to expand the literal pool access carefully.
4093 This is something that needs to be done in a number
4094 of places, so could well live as a separate function. */
4095 if (!aarch64_pcrelative_literal_loads)
4096 {
4097 gcc_assert (can_create_pseudo_p ());
4098 base = gen_reg_rtx (ptr_mode);
4099 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4100 if (ptr_mode != Pmode)
4101 base = convert_memory_address (Pmode, base);
4102 mem = gen_rtx_MEM (ptr_mode, base);
4103 }
4104
4105 if (int_mode != ptr_mode)
4106 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4107
4108 emit_insn (gen_rtx_SET (dest, mem));
4109
4110 return;
4111
4112 case SYMBOL_SMALL_TLSGD:
4113 case SYMBOL_SMALL_TLSDESC:
4114 case SYMBOL_SMALL_TLSIE:
4115 case SYMBOL_SMALL_GOT_28K:
4116 case SYMBOL_SMALL_GOT_4G:
4117 case SYMBOL_TINY_GOT:
4118 case SYMBOL_TINY_TLSIE:
4119 if (const_offset != 0)
4120 {
4121 gcc_assert (can_create_pseudo_p ());
4122 base = aarch64_force_temporary (int_mode, dest, base);
4123 aarch64_add_offset (int_mode, dest, base, const_offset,
4124 NULL_RTX, NULL_RTX, false);
4125 return;
4126 }
4127 /* FALLTHRU */
4128
4129 case SYMBOL_SMALL_ABSOLUTE:
4130 case SYMBOL_TINY_ABSOLUTE:
4131 case SYMBOL_TLSLE12:
4132 case SYMBOL_TLSLE24:
4133 case SYMBOL_TLSLE32:
4134 case SYMBOL_TLSLE48:
4135 aarch64_load_symref_appropriately (dest, imm, sty);
4136 return;
4137
4138 default:
4139 gcc_unreachable ();
4140 }
4141 }
4142
4143 if (!CONST_INT_P (imm))
4144 {
4145 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4146 {
4147 /* Only the low bit of each .H, .S and .D element is defined,
4148 so we can set the upper bits to whatever we like. If the
4149 predicate is all-true in MODE, prefer to set all the undefined
4150 bits as well, so that we can share a single .B predicate for
4151 all modes. */
4152 if (imm == CONSTM1_RTX (mode))
4153 imm = CONSTM1_RTX (VNx16BImode);
4154
4155 /* All methods for constructing predicate modes wider than VNx16BI
4156 will set the upper bits of each element to zero. Expose this
4157 by moving such constants as a VNx16BI, so that all bits are
4158 significant and so that constants for different modes can be
4159 shared. The wider constant will still be available as a
4160 REG_EQUAL note. */
4161 rtx_vector_builder builder;
4162 if (aarch64_get_sve_pred_bits (builder, imm))
4163 {
4164 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4165 if (dest != res)
4166 emit_move_insn (dest, gen_lowpart (mode, res));
4167 return;
4168 }
4169 }
4170
4171 if (GET_CODE (imm) == HIGH
4172 || aarch64_simd_valid_immediate (imm, NULL))
4173 {
4174 emit_insn (gen_rtx_SET (dest, imm));
4175 return;
4176 }
4177
4178 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4179 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4180 {
4181 if (dest != res)
4182 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4183 return;
4184 }
4185
4186 rtx mem = force_const_mem (mode, imm);
4187 gcc_assert (mem);
4188 emit_move_insn (dest, mem);
4189 return;
4190 }
4191
4192 aarch64_internal_mov_immediate (dest, imm, true,
4193 as_a <scalar_int_mode> (mode));
4194 }
4195
4196 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4197 that is known to contain PTRUE. */
4198
4199 void
4200 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4201 {
4202 expand_operand ops[3];
4203 machine_mode mode = GET_MODE (dest);
4204 create_output_operand (&ops[0], dest, mode);
4205 create_input_operand (&ops[1], pred, GET_MODE (pred));
4206 create_input_operand (&ops[2], src, mode);
4207 temporary_volatile_ok v (true);
4208 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4209 }
4210
4211 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4212 operand is in memory. In this case we need to use the predicated LD1
4213 and ST1 instead of LDR and STR, both for correctness on big-endian
4214 targets and because LD1 and ST1 support a wider range of addressing modes.
4215 PRED_MODE is the mode of the predicate.
4216
4217 See the comment at the head of aarch64-sve.md for details about the
4218 big-endian handling. */
4219
4220 void
4221 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4222 {
4223 machine_mode mode = GET_MODE (dest);
4224 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4225 if (!register_operand (src, mode)
4226 && !register_operand (dest, mode))
4227 {
4228 rtx tmp = gen_reg_rtx (mode);
4229 if (MEM_P (src))
4230 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4231 else
4232 emit_move_insn (tmp, src);
4233 src = tmp;
4234 }
4235 aarch64_emit_sve_pred_move (dest, ptrue, src);
4236 }
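/* A hedged illustration (registers hypothetical): copying one VNx4SI
   memory object to another therefore becomes a predicated load into a
   fresh pseudo followed by a predicated store, roughly:

     ptrue	p0.b, all
     ld1w	z0.s, p0/z, [x1]
     st1w	z0.s, p0, [x0]

   rather than an LDR/STR pair, for the reasons given above.  */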
4237
4238 /* Called only on big-endian targets. See whether an SVE vector move
4239 from SRC to DEST is effectively a REV[BHW] instruction, because at
4240 least one operand is a subreg of an SVE vector that has wider or
4241 narrower elements. Return true and emit the instruction if so.
4242
4243 For example:
4244
4245 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4246
4247 represents a VIEW_CONVERT between the following vectors, viewed
4248 in memory order:
4249
4250 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4251 R1: { [0], [1], [2], [3], ... }
4252
4253 The high part of lane X in R2 should therefore correspond to lane X*2
4254 of R1, but the register representations are:
4255
4256 msb lsb
4257 R2: ...... [1].high [1].low [0].high [0].low
4258 R1: ...... [3] [2] [1] [0]
4259
4260 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4261 We therefore need a reverse operation to swap the high and low values
4262 around.
4263
4264 This is purely an optimization. Without it we would spill the
4265 subreg operand to the stack in one mode and reload it in the
4266 other mode, which has the same effect as the REV. */
4267
4268 bool
4269 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4270 {
4271 gcc_assert (BYTES_BIG_ENDIAN);
4272 if (GET_CODE (dest) == SUBREG)
4273 dest = SUBREG_REG (dest);
4274 if (GET_CODE (src) == SUBREG)
4275 src = SUBREG_REG (src);
4276
4277 /* The optimization handles two single SVE REGs with different element
4278 sizes. */
4279 if (!REG_P (dest)
4280 || !REG_P (src)
4281 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4282 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4283 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4284 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4285 return false;
4286
4287 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4288 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4289 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4290 UNSPEC_REV_SUBREG);
4291 emit_insn (gen_rtx_SET (dest, unspec));
4292 return true;
4293 }
4294
4295 /* Return a copy of X with mode MODE, without changing its other
4296 attributes. Unlike gen_lowpart, this doesn't care whether the
4297 mode change is valid. */
4298
4299 static rtx
4300 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4301 {
4302 if (GET_MODE (x) == mode)
4303 return x;
4304
4305 x = shallow_copy_rtx (x);
4306 set_mode_and_regno (x, mode, REGNO (x));
4307 return x;
4308 }
4309
4310 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4311 stored in wider integer containers. */
4312
4313 static unsigned int
4314 aarch64_sve_rev_unspec (machine_mode mode)
4315 {
4316 switch (GET_MODE_UNIT_SIZE (mode))
4317 {
4318 case 1: return UNSPEC_REVB;
4319 case 2: return UNSPEC_REVH;
4320 case 4: return UNSPEC_REVW;
4321 }
4322 gcc_unreachable ();
4323 }
4324
4325 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4326 operands. */
4327
4328 void
4329 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4330 {
4331 /* Decide which REV operation we need. The mode with wider elements
4332 determines the mode of the operands and the mode with the narrower
4333 elements determines the reverse width. */
4334 machine_mode mode_with_wider_elts = GET_MODE (dest);
4335 machine_mode mode_with_narrower_elts = GET_MODE (src);
4336 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4337 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4338 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4339
4340 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4341 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4342 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4343
4344 /* Get the operands in the appropriate modes and emit the instruction. */
4345 ptrue = gen_lowpart (pred_mode, ptrue);
4346 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4347 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4348 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4349 dest, ptrue, src));
4350 }
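/* An illustrative split (register numbers arbitrary): for the
   VNx8HI-over-VNx16QI example before aarch64_maybe_expand_sve_subreg_move,
   the wider-element mode is VNx8HI and the narrower unit size of 1 byte
   selects UNSPEC_REVB, so the emitted instruction is roughly:

     revb	z0.h, p0/m, z1.h	// swap the bytes within each halfword

   which performs exactly the high/low swap described there.  */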
4351
4352 static bool
4353 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4354 tree exp ATTRIBUTE_UNUSED)
4355 {
4356 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4357 return false;
4358
4359 return true;
4360 }
4361
4362 /* Implement TARGET_PASS_BY_REFERENCE. */
4363
4364 static bool
4365 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4366 machine_mode mode,
4367 const_tree type,
4368 bool named ATTRIBUTE_UNUSED)
4369 {
4370 HOST_WIDE_INT size;
4371 machine_mode dummymode;
4372 int nregs;
4373
4374 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4375 if (mode == BLKmode && type)
4376 size = int_size_in_bytes (type);
4377 else
4378 /* No frontends can create types with variable-sized modes, so we
4379 shouldn't be asked to pass or return them. */
4380 size = GET_MODE_SIZE (mode).to_constant ();
4381
4382 /* Aggregates are passed by reference based on their size. */
4383 if (type && AGGREGATE_TYPE_P (type))
4384 {
4385 size = int_size_in_bytes (type);
4386 }
4387
4388 /* Variable sized arguments are always returned by reference. */
4389 if (size < 0)
4390 return true;
4391
4392 /* Can this be a candidate to be passed in fp/simd register(s)? */
4393 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4394 &dummymode, &nregs,
4395 NULL))
4396 return false;
4397
4398 /* Arguments which are variable sized or larger than 2 registers are
4399 passed by reference unless they are a homogeneous floating-point
4400 aggregate. */
4401 return size > 2 * UNITS_PER_WORD;
4402 }
4403
4404 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4405 static bool
4406 aarch64_return_in_msb (const_tree valtype)
4407 {
4408 machine_mode dummy_mode;
4409 int dummy_int;
4410
4411 /* Never happens in little-endian mode. */
4412 if (!BYTES_BIG_ENDIAN)
4413 return false;
4414
4415 /* Only composite types smaller than or equal to 16 bytes can
4416 be potentially returned in registers. */
4417 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4418 || int_size_in_bytes (valtype) <= 0
4419 || int_size_in_bytes (valtype) > 16)
4420 return false;
4421
4422 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4423 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4424 is always passed/returned in the least significant bits of fp/simd
4425 register(s). */
4426 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4427 &dummy_mode, &dummy_int, NULL))
4428 return false;
4429
4430 return true;
4431 }
4432
4433 /* Implement TARGET_FUNCTION_VALUE.
4434 Define how to find the value returned by a function. */
4435
4436 static rtx
4437 aarch64_function_value (const_tree type, const_tree func,
4438 bool outgoing ATTRIBUTE_UNUSED)
4439 {
4440 machine_mode mode;
4441 int unsignedp;
4442 int count;
4443 machine_mode ag_mode;
4444
4445 mode = TYPE_MODE (type);
4446 if (INTEGRAL_TYPE_P (type))
4447 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4448
4449 if (aarch64_return_in_msb (type))
4450 {
4451 HOST_WIDE_INT size = int_size_in_bytes (type);
4452
4453 if (size % UNITS_PER_WORD != 0)
4454 {
4455 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4456 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4457 }
4458 }
4459
4460 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4461 &ag_mode, &count, NULL))
4462 {
4463 if (!aarch64_composite_type_p (type, mode))
4464 {
4465 gcc_assert (count == 1 && mode == ag_mode);
4466 return gen_rtx_REG (mode, V0_REGNUM);
4467 }
4468 else
4469 {
4470 int i;
4471 rtx par;
4472
4473 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4474 for (i = 0; i < count; i++)
4475 {
4476 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4477 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4478 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4479 XVECEXP (par, 0, i) = tmp;
4480 }
4481 return par;
4482 }
4483 }
4484 else
4485 return gen_rtx_REG (mode, R0_REGNUM);
4486 }
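/* A concrete illustration (hypothetical return types): struct { float x, y; }
   is a homogeneous floating-point aggregate with COUNT == 2 and
   AG_MODE == SFmode, so the code above builds a PARALLEL of (reg:SF v0)
   at byte offset 0 and (reg:SF v1) at byte offset 4, whereas a plain
   64-bit integer return simply uses (reg:DI x0).  */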
4487
4488 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4489 Return true if REGNO is the number of a hard register in which the values
4490 of called function may come back. */
4491
4492 static bool
4493 aarch64_function_value_regno_p (const unsigned int regno)
4494 {
4495 /* Maximum of 16 bytes can be returned in the general registers. Examples
4496 of 16-byte return values are: 128-bit integers and 16-byte small
4497 structures (excluding homogeneous floating-point aggregates). */
4498 if (regno == R0_REGNUM || regno == R1_REGNUM)
4499 return true;
4500
4501 /* Up to four fp/simd registers can return a function value, e.g. a
4502 homogeneous floating-point aggregate having four members. */
4503 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4504 return TARGET_FLOAT;
4505
4506 return false;
4507 }
4508
4509 /* Implement TARGET_RETURN_IN_MEMORY.
4510
4511 If the type T of the result of a function is such that
4512 void func (T arg)
4513 would require that arg be passed as a value in a register (or set of
4514 registers) according to the parameter passing rules, then the result
4515 is returned in the same registers as would be used for such an
4516 argument. */
4517
4518 static bool
4519 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4520 {
4521 HOST_WIDE_INT size;
4522 machine_mode ag_mode;
4523 int count;
4524
4525 if (!AGGREGATE_TYPE_P (type)
4526 && TREE_CODE (type) != COMPLEX_TYPE
4527 && TREE_CODE (type) != VECTOR_TYPE)
4528 /* Simple scalar types are always returned in registers. */
4529 return false;
4530
4531 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4532 type,
4533 &ag_mode,
4534 &count,
4535 NULL))
4536 return false;
4537
4538 /* Types larger than 2 registers are returned in memory. */
4539 size = int_size_in_bytes (type);
4540 return (size < 0 || size > 2 * UNITS_PER_WORD);
4541 }
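/* Illustrative cases: struct { double a, b, c, d; } is a homogeneous
   floating-point aggregate, so the candidate check above succeeds and it
   is returned in registers (v0-v3) despite being 32 bytes.  A struct of
   four 64-bit integers fails that check and, at 32 bytes, exceeds the
   two-register limit, so it is returned in memory.  */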
4542
4543 static bool
4544 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4545 const_tree type, int *nregs)
4546 {
4547 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4548 return aarch64_vfp_is_call_or_return_candidate (mode,
4549 type,
4550 &pcum->aapcs_vfp_rmode,
4551 nregs,
4552 NULL);
4553 }
4554
4555 /* Given MODE and TYPE of a function argument, return the alignment in
4556 bits. The idea is to suppress any stronger alignment requested by
4557 the user and opt for the natural alignment (specified in AAPCS64 \S
4558 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4559 calculated in versions of GCC prior to GCC-9. This is a helper
4560 function for local use only. */
4561
4562 static unsigned int
4563 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4564 bool *abi_break)
4565 {
4566 *abi_break = false;
4567 if (!type)
4568 return GET_MODE_ALIGNMENT (mode);
4569
4570 if (integer_zerop (TYPE_SIZE (type)))
4571 return 0;
4572
4573 gcc_assert (TYPE_MODE (type) == mode);
4574
4575 if (!AGGREGATE_TYPE_P (type))
4576 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4577
4578 if (TREE_CODE (type) == ARRAY_TYPE)
4579 return TYPE_ALIGN (TREE_TYPE (type));
4580
4581 unsigned int alignment = 0;
4582 unsigned int bitfield_alignment = 0;
4583 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4584 if (TREE_CODE (field) == FIELD_DECL)
4585 {
4586 alignment = std::max (alignment, DECL_ALIGN (field));
4587 if (DECL_BIT_FIELD_TYPE (field))
4588 bitfield_alignment
4589 = std::max (bitfield_alignment,
4590 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4591 }
4592
4593 if (bitfield_alignment > alignment)
4594 {
4595 *abi_break = true;
4596 return bitfield_alignment;
4597 }
4598
4599 return alignment;
4600 }
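/* A hedged example of the "suppress stronger alignment" rule: given
   typedef int aligned_int __attribute__ ((aligned (16)));
   an argument of type aligned_int is not an aggregate, so the code above
   uses TYPE_ALIGN of the main variant (plain int) and returns 32-bit
   alignment, ignoring the user-requested 128 bits.  ABI_BREAK is only
   set in the separate case where a bit-field's declared type is more
   strongly aligned than every field of the aggregate.  */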
4601
4602 /* Layout a function argument according to the AAPCS64 rules. The rule
4603 numbers refer to the rule numbers in the AAPCS64. */
4604
4605 static void
4606 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4607 const_tree type,
4608 bool named ATTRIBUTE_UNUSED)
4609 {
4610 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4611 int ncrn, nvrn, nregs;
4612 bool allocate_ncrn, allocate_nvrn;
4613 HOST_WIDE_INT size;
4614 bool abi_break;
4615
4616 /* We need to do this once per argument. */
4617 if (pcum->aapcs_arg_processed)
4618 return;
4619
4620 pcum->aapcs_arg_processed = true;
4621
4622 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4623 if (type)
4624 size = int_size_in_bytes (type);
4625 else
4626 /* No frontends can create types with variable-sized modes, so we
4627 shouldn't be asked to pass or return them. */
4628 size = GET_MODE_SIZE (mode).to_constant ();
4629 size = ROUND_UP (size, UNITS_PER_WORD);
4630
4631 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4632 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4633 mode,
4634 type,
4635 &nregs);
4636
4637 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4638 The following code thus handles passing by SIMD/FP registers first. */
4639
4640 nvrn = pcum->aapcs_nvrn;
4641
4642 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4643 and homogeneous short-vector aggregates (HVA). */
4644 if (allocate_nvrn)
4645 {
4646 if (!TARGET_FLOAT)
4647 aarch64_err_no_fpadvsimd (mode);
4648
4649 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4650 {
4651 pcum->aapcs_nextnvrn = nvrn + nregs;
4652 if (!aarch64_composite_type_p (type, mode))
4653 {
4654 gcc_assert (nregs == 1);
4655 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4656 }
4657 else
4658 {
4659 rtx par;
4660 int i;
4661 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4662 for (i = 0; i < nregs; i++)
4663 {
4664 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4665 V0_REGNUM + nvrn + i);
4666 rtx offset = gen_int_mode
4667 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4668 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4669 XVECEXP (par, 0, i) = tmp;
4670 }
4671 pcum->aapcs_reg = par;
4672 }
4673 return;
4674 }
4675 else
4676 {
4677 /* C.3 NSRN is set to 8. */
4678 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4679 goto on_stack;
4680 }
4681 }
4682
4683 ncrn = pcum->aapcs_ncrn;
4684 nregs = size / UNITS_PER_WORD;
4685
4686 /* C6 - C9, though the sign and zero extension semantics are
4687 handled elsewhere. This is the case where the argument fits
4688 entirely in general registers. */
4689 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4690 {
4691 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4692
4693 /* C.8 if the argument has an alignment of 16 then the NGRN is
4694 rounded up to the next even number. */
4695 if (nregs == 2
4696 && ncrn % 2
4697 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4698 comparison is there because for > 16 * BITS_PER_UNIT
4699 alignment nregs should be > 2 and therefore it should be
4700 passed by reference rather than value. */
4701 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4702 == 16 * BITS_PER_UNIT))
4703 {
4704 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4705 inform (input_location, "parameter passing for argument of type "
4706 "%qT changed in GCC 9.1", type);
4707 ++ncrn;
4708 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4709 }
4710
4711 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4712 A reg is still generated for it, but the caller should be smart
4713 enough not to use it. */
4714 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4715 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4716 else
4717 {
4718 rtx par;
4719 int i;
4720
4721 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4722 for (i = 0; i < nregs; i++)
4723 {
4724 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4725 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4726 GEN_INT (i * UNITS_PER_WORD));
4727 XVECEXP (par, 0, i) = tmp;
4728 }
4729 pcum->aapcs_reg = par;
4730 }
4731
4732 pcum->aapcs_nextncrn = ncrn + nregs;
4733 return;
4734 }
4735
4736 /* C.11 */
4737 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4738
4739 /* The argument is passed on stack; record the needed number of words for
4740 this argument and align the total size if necessary. */
4741 on_stack:
4742 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4743
4744 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4745 == 16 * BITS_PER_UNIT)
4746 {
4747 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4748 if (pcum->aapcs_stack_size != new_size)
4749 {
4750 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4751 inform (input_location, "parameter passing for argument of type "
4752 "%qT changed in GCC 9.1", type);
4753 pcum->aapcs_stack_size = new_size;
4754 }
4755 }
4756 return;
4757 }
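/* A worked example of rule C.8 (illustrative only): for
   void f (int a, __int128 b), A occupies w0, leaving NCRN == 1.  B needs
   two registers and has 16-byte alignment, so NCRN is rounded up to 2
   and B is passed in x2/x3, with x1 left unused; had B required only
   8-byte alignment it would have gone in x1/x2.  */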
4758
4759 /* Implement TARGET_FUNCTION_ARG. */
4760
4761 static rtx
4762 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4763 const_tree type, bool named)
4764 {
4765 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4766 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4767
4768 if (mode == VOIDmode)
4769 return NULL_RTX;
4770
4771 aarch64_layout_arg (pcum_v, mode, type, named);
4772 return pcum->aapcs_reg;
4773 }
4774
4775 void
4776 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4777 const_tree fntype ATTRIBUTE_UNUSED,
4778 rtx libname ATTRIBUTE_UNUSED,
4779 const_tree fndecl ATTRIBUTE_UNUSED,
4780 unsigned n_named ATTRIBUTE_UNUSED)
4781 {
4782 pcum->aapcs_ncrn = 0;
4783 pcum->aapcs_nvrn = 0;
4784 pcum->aapcs_nextncrn = 0;
4785 pcum->aapcs_nextnvrn = 0;
4786 pcum->pcs_variant = ARM_PCS_AAPCS64;
4787 pcum->aapcs_reg = NULL_RTX;
4788 pcum->aapcs_arg_processed = false;
4789 pcum->aapcs_stack_words = 0;
4790 pcum->aapcs_stack_size = 0;
4791
4792 if (!TARGET_FLOAT
4793 && fndecl && TREE_PUBLIC (fndecl)
4794 && fntype && fntype != error_mark_node)
4795 {
4796 const_tree type = TREE_TYPE (fntype);
4797 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4798 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4799 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4800 &mode, &nregs, NULL))
4801 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4802 }
4803 return;
4804 }
4805
4806 static void
4807 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4808 machine_mode mode,
4809 const_tree type,
4810 bool named)
4811 {
4812 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4813 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4814 {
4815 aarch64_layout_arg (pcum_v, mode, type, named);
4816 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4817 != (pcum->aapcs_stack_words != 0));
4818 pcum->aapcs_arg_processed = false;
4819 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4820 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4821 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4822 pcum->aapcs_stack_words = 0;
4823 pcum->aapcs_reg = NULL_RTX;
4824 }
4825 }
4826
4827 bool
4828 aarch64_function_arg_regno_p (unsigned regno)
4829 {
4830 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4831 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4832 }
4833
4834 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4835 PARM_BOUNDARY bits of alignment, but will be given anything up
4836 to STACK_BOUNDARY bits if the type requires it. This makes sure
4837 that both before and after the layout of each argument, the Next
4838 Stacked Argument Address (NSAA) will have a minimum alignment of
4839 8 bytes. */
4840
4841 static unsigned int
4842 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4843 {
4844 bool abi_break;
4845 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4846 &abi_break);
4847 if (abi_break && warn_psabi)
4848 inform (input_location, "parameter passing for argument of type "
4849 "%qT changed in GCC 9.1", type);
4850
4851 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4852 }
4853
4854 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4855
4856 static fixed_size_mode
4857 aarch64_get_reg_raw_mode (int regno)
4858 {
4859 if (TARGET_SVE && FP_REGNUM_P (regno))
4860 /* Don't use the SVE part of the register for __builtin_apply and
4861 __builtin_return. The SVE registers aren't used by the normal PCS,
4862 so using them there would be a waste of time. The PCS extensions
4863 for SVE types are fundamentally incompatible with the
4864 __builtin_return/__builtin_apply interface. */
4865 return as_a <fixed_size_mode> (V16QImode);
4866 return default_get_reg_raw_mode (regno);
4867 }
4868
4869 /* Implement TARGET_FUNCTION_ARG_PADDING.
4870
4871 Small aggregate types are placed in the lowest memory address.
4872
4873 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4874
4875 static pad_direction
4876 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4877 {
4878 /* On little-endian targets, the least significant byte of every stack
4879 argument is passed at the lowest byte address of the stack slot. */
4880 if (!BYTES_BIG_ENDIAN)
4881 return PAD_UPWARD;
4882
4883 /* Otherwise, integral, floating-point and pointer types are padded downward:
4884 the least significant byte of a stack argument is passed at the highest
4885 byte address of the stack slot. */
4886 if (type
4887 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4888 || POINTER_TYPE_P (type))
4889 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4890 return PAD_DOWNWARD;
4891
4892 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4893 return PAD_UPWARD;
4894 }
4895
4896 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4897
4898 It specifies padding for the last (and possibly the only)
4899 element of a block move between registers and memory. Assuming
4900 the block is in memory, padding upward means that the last
4901 element is padded after its most significant byte, while in
4902 downward padding the last element is padded at its least
4903 significant byte side.
4904
4905 Small aggregates and small complex types are always padded
4906 upwards.
4907
4908 We don't need to worry about homogeneous floating-point or
4909 short-vector aggregates; their move is not affected by the
4910 padding direction determined here. Regardless of endianness,
4911 each element of such an aggregate is put in the least
4912 significant bits of a fp/simd register.
4913
4914 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4915 register has useful data, and return the opposite if the most
4916 significant byte does. */
4917
4918 bool
4919 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4920 bool first ATTRIBUTE_UNUSED)
4921 {
4922
4923 /* Small composite types are always padded upward. */
4924 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4925 {
4926 HOST_WIDE_INT size;
4927 if (type)
4928 size = int_size_in_bytes (type);
4929 else
4930 /* No frontends can create types with variable-sized modes, so we
4931 shouldn't be asked to pass or return them. */
4932 size = GET_MODE_SIZE (mode).to_constant ();
4933 if (size < 2 * UNITS_PER_WORD)
4934 return true;
4935 }
4936
4937 /* Otherwise, use the default padding. */
4938 return !BYTES_BIG_ENDIAN;
4939 }
4940
4941 static scalar_int_mode
4942 aarch64_libgcc_cmp_return_mode (void)
4943 {
4944 return SImode;
4945 }
4946
4947 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4948
4949 /* We use the 12-bit shifted immediate arithmetic instructions so values
4950 must be multiple of (1 << 12), i.e. 4096. */
4951 #define ARITH_FACTOR 4096
4952
4953 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4954 #error Cannot use simple address calculation for stack probing
4955 #endif
4956
4957 /* The pair of scratch registers used for stack probing. */
4958 #define PROBE_STACK_FIRST_REG R9_REGNUM
4959 #define PROBE_STACK_SECOND_REG R10_REGNUM
4960
4961 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4962 inclusive. These are offsets from the current stack pointer. */
4963
4964 static void
4965 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4966 {
4967 HOST_WIDE_INT size;
4968 if (!poly_size.is_constant (&size))
4969 {
4970 sorry ("stack probes for SVE frames");
4971 return;
4972 }
4973
4974 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4975
4976 /* See the same assertion on PROBE_INTERVAL above. */
4977 gcc_assert ((first % ARITH_FACTOR) == 0);
4978
4979 /* See if we have a constant small number of probes to generate. If so,
4980 that's the easy case. */
4981 if (size <= PROBE_INTERVAL)
4982 {
4983 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4984
4985 emit_set_insn (reg1,
4986 plus_constant (Pmode,
4987 stack_pointer_rtx, -(first + base)));
4988 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4989 }
4990
4991 /* The run-time loop is made up of 8 insns in the generic case while the
4992 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4993 else if (size <= 4 * PROBE_INTERVAL)
4994 {
4995 HOST_WIDE_INT i, rem;
4996
4997 emit_set_insn (reg1,
4998 plus_constant (Pmode,
4999 stack_pointer_rtx,
5000 -(first + PROBE_INTERVAL)));
5001 emit_stack_probe (reg1);
5002
5003 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5004 it exceeds SIZE. If only two probes are needed, this will not
5005 generate any code. Then probe at FIRST + SIZE. */
5006 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5007 {
5008 emit_set_insn (reg1,
5009 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5010 emit_stack_probe (reg1);
5011 }
5012
5013 rem = size - (i - PROBE_INTERVAL);
5014 if (rem > 256)
5015 {
5016 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5017
5018 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5019 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5020 }
5021 else
5022 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5023 }
5024
5025 /* Otherwise, do the same as above, but in a loop. Note that we must be
5026 extra careful with variables wrapping around because we might be at
5027 the very top (or the very bottom) of the address space and we have
5028 to be able to handle this case properly; in particular, we use an
5029 equality test for the loop condition. */
5030 else
5031 {
5032 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5033
5034 /* Step 1: round SIZE to the previous multiple of the interval. */
5035
5036 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5037
5038
5039 /* Step 2: compute initial and final value of the loop counter. */
5040
5041 /* TEST_ADDR = SP + FIRST. */
5042 emit_set_insn (reg1,
5043 plus_constant (Pmode, stack_pointer_rtx, -first));
5044
5045 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5046 HOST_WIDE_INT adjustment = - (first + rounded_size);
5047 if (! aarch64_uimm12_shift (adjustment))
5048 {
5049 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5050 true, Pmode);
5051 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5052 }
5053 else
5054 emit_set_insn (reg2,
5055 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5056
5057 /* Step 3: the loop
5058
5059 do
5060 {
5061 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5062 probe at TEST_ADDR
5063 }
5064 while (TEST_ADDR != LAST_ADDR)
5065
5066 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5067 until it is equal to ROUNDED_SIZE. */
5068
5069 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5070
5071
5072 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5073 that SIZE is equal to ROUNDED_SIZE. */
5074
5075 if (size != rounded_size)
5076 {
5077 HOST_WIDE_INT rem = size - rounded_size;
5078
5079 if (rem > 256)
5080 {
5081 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5082
5083 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5084 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5085 }
5086 else
5087 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5088 }
5089 }
5090
5091 /* Make sure nothing is scheduled before we are done. */
5092 emit_insn (gen_blockage ());
5093 }
5094
5095 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5096 absolute addresses. */
5097
5098 const char *
5099 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5100 {
5101 static int labelno = 0;
5102 char loop_lab[32];
5103 rtx xops[2];
5104
5105 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5106
5107 /* Loop. */
5108 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5109
5110 HOST_WIDE_INT stack_clash_probe_interval
5111 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5112
5113 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5114 xops[0] = reg1;
5115 HOST_WIDE_INT interval;
5116 if (flag_stack_clash_protection)
5117 interval = stack_clash_probe_interval;
5118 else
5119 interval = PROBE_INTERVAL;
5120
5121 gcc_assert (aarch64_uimm12_shift (interval));
5122 xops[1] = GEN_INT (interval);
5123
5124 output_asm_insn ("sub\t%0, %0, %1", xops);
5125
5126 /* If doing stack clash protection then we probe up by the ABI specified
5127 amount. We do this because we're dropping full pages at a time in the
5128 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5129 if (flag_stack_clash_protection)
5130 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5131 else
5132 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5133
5134 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5135 by this amount for each iteration. */
5136 output_asm_insn ("str\txzr, [%0, %1]", xops);
5137
5138 /* Test if TEST_ADDR == LAST_ADDR. */
5139 xops[1] = reg2;
5140 output_asm_insn ("cmp\t%0, %1", xops);
5141
5142 /* Branch. */
5143 fputs ("\tb.ne\t", asm_out_file);
5144 assemble_name_raw (asm_out_file, loop_lab);
5145 fputc ('\n', asm_out_file);
5146
5147 return "";
5148 }
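/* Assuming the default 4 KiB probe interval, the loop emitted above looks
   roughly like this with the x9/x10 scratch registers chosen by
   aarch64_emit_probe_stack_range:

     .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   With -fstack-clash-protection the interval comes from the guard-size
   parameter and the probe is written STACK_CLASH_CALLER_GUARD bytes above
   the new test address instead of at offset 0.  */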
5149
5150 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5151 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5152 of GUARD_SIZE. When a probe is emitted it is done at most
5153 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5154 at most MIN_PROBE_THRESHOLD. By the end of this function
5155 BASE = BASE - ADJUSTMENT. */
5156
5157 const char *
5158 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5159 rtx min_probe_threshold, rtx guard_size)
5160 {
5161 /* This function is not allowed to use any instruction generation function
5162 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5163 so instead emit the code you want using output_asm_insn. */
5164 gcc_assert (flag_stack_clash_protection);
5165 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5166 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5167
5168 /* The minimum required allocation before the residual requires probing. */
5169 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5170
5171 /* Clamp the value down to the nearest value that can be used with a cmp. */
5172 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5173 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5174
5175 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5176 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5177
5178 static int labelno = 0;
5179 char loop_start_lab[32];
5180 char loop_end_lab[32];
5181 rtx xops[2];
5182
5183 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5184 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5185
5186 /* Emit loop start label. */
5187 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5188
5189 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5190 xops[0] = adjustment;
5191 xops[1] = probe_offset_value_rtx;
5192 output_asm_insn ("cmp\t%0, %1", xops);
5193
5194 /* Branch to end if not enough adjustment to probe. */
5195 fputs ("\tb.lt\t", asm_out_file);
5196 assemble_name_raw (asm_out_file, loop_end_lab);
5197 fputc ('\n', asm_out_file);
5198
5199 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5200 xops[0] = base;
5201 xops[1] = probe_offset_value_rtx;
5202 output_asm_insn ("sub\t%0, %0, %1", xops);
5203
5204 /* Probe at BASE. */
5205 xops[1] = const0_rtx;
5206 output_asm_insn ("str\txzr, [%0, %1]", xops);
5207
5208 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5209 xops[0] = adjustment;
5210 xops[1] = probe_offset_value_rtx;
5211 output_asm_insn ("sub\t%0, %0, %1", xops);
5212
5213 /* Branch to start if still more bytes to allocate. */
5214 fputs ("\tb\t", asm_out_file);
5215 assemble_name_raw (asm_out_file, loop_start_lab);
5216 fputc ('\n', asm_out_file);
5217
5218 /* No probe needed; leave the loop. */
5219 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5220
5221 /* BASE = BASE - ADJUSTMENT. */
5222 xops[0] = base;
5223 xops[1] = adjustment;
5224 output_asm_insn ("sub\t%0, %0, %1", xops);
5225 return "";
5226 }
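
/* As a rough illustration only (register choices are just examples and the
   clamped residual probe guard depends on the configured guard size; the
   value below assumes the default 64KB guard and 1KB caller buffer, giving
   a clamped guard of 61440), with ADJUSTMENT in x12 and BASE being the
   stack pointer the printed sequence looks like:

	.SVLPSPL0:
	cmp	x12, 61440
	b.lt	.SVLPEND0
	sub	sp, sp, 61440
	str	xzr, [sp, 0]
	sub	x12, x12, 61440
	b	.SVLPSPL0
	.SVLPEND0:
	sub	sp, sp, x12  */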
5227
5228 /* Determine whether a frame chain needs to be generated. */
5229 static bool
5230 aarch64_needs_frame_chain (void)
5231 {
5232 /* Force a frame chain for EH returns so the return address is at FP+8. */
5233 if (frame_pointer_needed || crtl->calls_eh_return)
5234 return true;
5235
5236 /* A leaf function cannot have calls or write LR. */
5237 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5238
5239 /* Don't use a frame chain in leaf functions if leaf frame pointers
5240 are disabled. */
5241 if (flag_omit_leaf_frame_pointer && is_leaf)
5242 return false;
5243
5244 return aarch64_use_frame_pointer;
5245 }
5246
5247 /* Mark the registers that need to be saved by the callee and calculate
5248 the size of the callee-saved registers area and frame record (both FP
5249 and LR may be omitted). */
5250 static void
5251 aarch64_layout_frame (void)
5252 {
5253 HOST_WIDE_INT offset = 0;
5254 int regno, last_fp_reg = INVALID_REGNUM;
5255 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5256
5257 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5258
5259 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5260 the mid-end is doing. */
5261 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5262
5263 #define SLOT_NOT_REQUIRED (-2)
5264 #define SLOT_REQUIRED (-1)
5265
5266 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5267 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5268
5269 /* If this is a non-leaf simd function with calls we assume that
5270 at least one of those calls is to a non-simd function and thus
5271 we must save V8 to V23 in the prologue. */
5272
5273 if (simd_function && !crtl->is_leaf)
5274 {
5275 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5276 if (FP_SIMD_SAVED_REGNUM_P (regno))
5277 df_set_regs_ever_live (regno, true);
5278 }
5279
5280 /* First mark all the registers that really need to be saved... */
5281 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5282 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5283
5284 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5285 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5286
5287 /* ... that includes the eh data registers (if needed)... */
5288 if (crtl->calls_eh_return)
5289 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5290 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5291 = SLOT_REQUIRED;
5292
5293 /* ... and any callee saved register that dataflow says is live. */
5294 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5295 if (df_regs_ever_live_p (regno)
5296 && (regno == R30_REGNUM
5297 || !call_used_regs[regno]))
5298 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5299
5300 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5301 if (df_regs_ever_live_p (regno)
5302 && (!call_used_regs[regno]
5303 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5304 {
5305 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5306 last_fp_reg = regno;
5307 }
5308
5309 if (cfun->machine->frame.emit_frame_chain)
5310 {
5311 /* FP and LR are placed in the linkage record. */
5312 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5313 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5314 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5315 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5316 offset = 2 * UNITS_PER_WORD;
5317 }
5318
5319 /* With stack-clash, LR must be saved in non-leaf functions. */
5320 gcc_assert (crtl->is_leaf
5321 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5322 != SLOT_NOT_REQUIRED));
5323
5324 /* Now assign stack slots for them. */
5325 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5326 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5327 {
5328 cfun->machine->frame.reg_offset[regno] = offset;
5329 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5330 cfun->machine->frame.wb_candidate1 = regno;
5331 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5332 cfun->machine->frame.wb_candidate2 = regno;
5333 offset += UNITS_PER_WORD;
5334 }
5335
5336 HOST_WIDE_INT max_int_offset = offset;
5337 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5338 bool has_align_gap = offset != max_int_offset;
5339
5340 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5341 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5342 {
5343 /* If there is an alignment gap between integer and fp callee-saves,
5344 allocate the last fp register to it if possible. */
5345 if (regno == last_fp_reg
5346 && has_align_gap
5347 && !simd_function
5348 && (offset & 8) == 0)
5349 {
5350 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5351 break;
5352 }
5353
5354 cfun->machine->frame.reg_offset[regno] = offset;
5355 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5356 cfun->machine->frame.wb_candidate1 = regno;
5357 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5358 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5359 cfun->machine->frame.wb_candidate2 = regno;
5360 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5361 }
5362
5363 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5364
5365 cfun->machine->frame.saved_regs_size = offset;
5366
5367 HOST_WIDE_INT varargs_and_saved_regs_size
5368 = offset + cfun->machine->frame.saved_varargs_size;
5369
5370 cfun->machine->frame.hard_fp_offset
5371 = aligned_upper_bound (varargs_and_saved_regs_size
5372 + get_frame_size (),
5373 STACK_BOUNDARY / BITS_PER_UNIT);
5374
5375 /* Both these values are already aligned. */
5376 gcc_assert (multiple_p (crtl->outgoing_args_size,
5377 STACK_BOUNDARY / BITS_PER_UNIT));
5378 cfun->machine->frame.frame_size
5379 = (cfun->machine->frame.hard_fp_offset
5380 + crtl->outgoing_args_size);
5381
5382 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5383
5384 cfun->machine->frame.initial_adjust = 0;
5385 cfun->machine->frame.final_adjust = 0;
5386 cfun->machine->frame.callee_adjust = 0;
5387 cfun->machine->frame.callee_offset = 0;
5388
5389 HOST_WIDE_INT max_push_offset = 0;
5390 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5391 max_push_offset = 512;
5392 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5393 max_push_offset = 256;
5394
5395 HOST_WIDE_INT const_size, const_fp_offset;
5396 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5397 && const_size < max_push_offset
5398 && known_eq (crtl->outgoing_args_size, 0))
5399 {
5400 /* Simple, small frame with no outgoing arguments:
5401 stp reg1, reg2, [sp, -frame_size]!
5402 stp reg3, reg4, [sp, 16] */
5403 cfun->machine->frame.callee_adjust = const_size;
5404 }
5405 else if (known_lt (crtl->outgoing_args_size
5406 + cfun->machine->frame.saved_regs_size, 512)
5407 && !(cfun->calls_alloca
5408 && known_lt (cfun->machine->frame.hard_fp_offset,
5409 max_push_offset)))
5410 {
5411 /* Frame with small outgoing arguments:
5412 sub sp, sp, frame_size
5413 stp reg1, reg2, [sp, outgoing_args_size]
5414 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5415 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5416 cfun->machine->frame.callee_offset
5417 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5418 }
5419 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5420 && const_fp_offset < max_push_offset)
5421 {
5422 /* Frame with large outgoing arguments but a small local area:
5423 stp reg1, reg2, [sp, -hard_fp_offset]!
5424 stp reg3, reg4, [sp, 16]
5425 sub sp, sp, outgoing_args_size */
5426 cfun->machine->frame.callee_adjust = const_fp_offset;
5427 cfun->machine->frame.final_adjust
5428 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5429 }
5430 else
5431 {
5432 /* Frame with large local area and outgoing arguments using frame pointer:
5433 sub sp, sp, hard_fp_offset
5434 stp x29, x30, [sp, 0]
5435 add x29, sp, 0
5436 stp reg3, reg4, [sp, 16]
5437 sub sp, sp, outgoing_args_size */
5438 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5439 cfun->machine->frame.final_adjust
5440 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5441 }
5442
5443 cfun->machine->frame.laid_out = true;
5444 }
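
/* A small worked example (illustrative only): a function that needs a frame
   chain, additionally saves x19, has 16 bytes of locals and no outgoing
   arguments ends up with reg_offset[x29] = 0, reg_offset[x30] = 8,
   reg_offset[x19] = 16, saved_regs_size = 32 and
   hard_fp_offset = frame_size = 48; since 48 < max_push_offset and there
   are no outgoing arguments, callee_adjust = 48 (the first of the cases
   above).  */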
5445
5446 /* Return true if the register REGNO is saved on entry to
5447 the current function. */
5448
5449 static bool
5450 aarch64_register_saved_on_entry (int regno)
5451 {
5452 return cfun->machine->frame.reg_offset[regno] >= 0;
5453 }
5454
5455 /* Return the next register up from REGNO up to LIMIT for the callee
5456 to save. */
5457
5458 static unsigned
5459 aarch64_next_callee_save (unsigned regno, unsigned limit)
5460 {
5461 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5462 regno++;
5463 return regno;
5464 }
5465
5466 /* Push the register number REGNO of mode MODE to the stack with write-back
5467 adjusting the stack by ADJUSTMENT. */
5468
5469 static void
5470 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5471 HOST_WIDE_INT adjustment)
5472 {
5473 rtx base_rtx = stack_pointer_rtx;
5474 rtx insn, reg, mem;
5475
5476 reg = gen_rtx_REG (mode, regno);
5477 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5478 plus_constant (Pmode, base_rtx, -adjustment));
5479 mem = gen_frame_mem (mode, mem);
5480
5481 insn = emit_move_insn (mem, reg);
5482 RTX_FRAME_RELATED_P (insn) = 1;
5483 }
5484
5485 /* Generate and return an instruction to store the pair of registers
5486 REG and REG2 of mode MODE to location BASE with write-back adjusting
5487 the stack location BASE by ADJUSTMENT. */
5488
5489 static rtx
5490 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5491 HOST_WIDE_INT adjustment)
5492 {
5493 switch (mode)
5494 {
5495 case E_DImode:
5496 return gen_storewb_pairdi_di (base, base, reg, reg2,
5497 GEN_INT (-adjustment),
5498 GEN_INT (UNITS_PER_WORD - adjustment));
5499 case E_DFmode:
5500 return gen_storewb_pairdf_di (base, base, reg, reg2,
5501 GEN_INT (-adjustment),
5502 GEN_INT (UNITS_PER_WORD - adjustment));
5503 case E_TFmode:
5504 return gen_storewb_pairtf_di (base, base, reg, reg2,
5505 GEN_INT (-adjustment),
5506 GEN_INT (UNITS_PER_VREG - adjustment));
5507 default:
5508 gcc_unreachable ();
5509 }
5510 }
5511
5512 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5513 stack pointer by ADJUSTMENT. */
5514
5515 static void
5516 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5517 {
5518 rtx_insn *insn;
5519 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5520
5521 if (regno2 == INVALID_REGNUM)
5522 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5523
5524 rtx reg1 = gen_rtx_REG (mode, regno1);
5525 rtx reg2 = gen_rtx_REG (mode, regno2);
5526
5527 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5528 reg2, adjustment));
5529 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5530 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5531 RTX_FRAME_RELATED_P (insn) = 1;
5532 }
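
/* For example (illustrative values), pushing x29 and x30 with a 48-byte
   adjustment produces a single pre-indexed writeback store pair,
   "stp x29, x30, [sp, -48]!", which both allocates the 48 bytes and
   performs the two saves.  */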
5533
5534 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5535 adjusting it by ADJUSTMENT afterwards. */
5536
5537 static rtx
5538 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5539 HOST_WIDE_INT adjustment)
5540 {
5541 switch (mode)
5542 {
5543 case E_DImode:
5544 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5545 GEN_INT (UNITS_PER_WORD));
5546 case E_DFmode:
5547 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5548 GEN_INT (UNITS_PER_WORD));
5549 case E_TFmode:
5550 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5551 GEN_INT (UNITS_PER_VREG));
5552 default:
5553 gcc_unreachable ();
5554 }
5555 }
5556
5557 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5558 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5559 into CFI_OPS. */
5560
5561 static void
5562 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5563 rtx *cfi_ops)
5564 {
5565 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5566 rtx reg1 = gen_rtx_REG (mode, regno1);
5567
5568 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5569
5570 if (regno2 == INVALID_REGNUM)
5571 {
5572 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5573 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5574 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5575 }
5576 else
5577 {
5578 rtx reg2 = gen_rtx_REG (mode, regno2);
5579 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5580 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5581 reg2, adjustment));
5582 }
5583 }
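
/* The epilogue counterpart of the push above: popping x29 and x30 with a
   48-byte adjustment produces the post-indexed load pair
   "ldp x29, x30, [sp], 48" (illustrative values).  */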
5584
5585 /* Generate and return a store pair instruction of mode MODE to store
5586 register REG1 to MEM1 and register REG2 to MEM2. */
5587
5588 static rtx
5589 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5590 rtx reg2)
5591 {
5592 switch (mode)
5593 {
5594 case E_DImode:
5595 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5596
5597 case E_DFmode:
5598 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5599
5600 case E_TFmode:
5601 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5602
5603 default:
5604 gcc_unreachable ();
5605 }
5606 }
5607
5608 /* Generate and return a load pair instruction of mode MODE to load register
5609 REG1 from MEM1 and register REG2 from MEM2. */
5610
5611 static rtx
5612 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5613 rtx mem2)
5614 {
5615 switch (mode)
5616 {
5617 case E_DImode:
5618 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5619
5620 case E_DFmode:
5621 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5622
5623 case E_TFmode:
5624 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5625
5626 default:
5627 gcc_unreachable ();
5628 }
5629 }
5630
5631 /* Return TRUE if return address signing should be enabled for the current
5632 function, otherwise return FALSE. */
5633
5634 bool
5635 aarch64_return_address_signing_enabled (void)
5636 {
5637 /* This function should only be called after the frame has been laid out. */
5638 gcc_assert (cfun->machine->frame.laid_out);
5639
5640 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we sign a leaf function
5641 only if its LR is pushed onto the stack. */
5642 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5643 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5644 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5645 }
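
/* For example (assuming the usual option mapping), with
   -mbranch-protection=pac-ret a leaf function that does not save LR is left
   unsigned, whereas -mbranch-protection=pac-ret+leaf (AARCH64_FUNCTION_ALL)
   signs every function.  */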
5646
5647 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5648 bool
5649 aarch64_bti_enabled (void)
5650 {
5651 return (aarch64_enable_bti == 1);
5652 }
5653
5654 /* Emit code to save the callee-saved registers from register number START
5655 to LIMIT to the stack at the location starting at offset START_OFFSET,
5656 skipping any write-back candidates if SKIP_WB is true. */
5657
5658 static void
5659 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5660 unsigned start, unsigned limit, bool skip_wb)
5661 {
5662 rtx_insn *insn;
5663 unsigned regno;
5664 unsigned regno2;
5665
5666 for (regno = aarch64_next_callee_save (start, limit);
5667 regno <= limit;
5668 regno = aarch64_next_callee_save (regno + 1, limit))
5669 {
5670 rtx reg, mem;
5671 poly_int64 offset;
5672 int offset_diff;
5673
5674 if (skip_wb
5675 && (regno == cfun->machine->frame.wb_candidate1
5676 || regno == cfun->machine->frame.wb_candidate2))
5677 continue;
5678
5679 if (cfun->machine->reg_is_wrapped_separately[regno])
5680 continue;
5681
5682 reg = gen_rtx_REG (mode, regno);
5683 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5684 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5685 offset));
5686
5687 regno2 = aarch64_next_callee_save (regno + 1, limit);
5688 offset_diff = cfun->machine->frame.reg_offset[regno2]
5689 - cfun->machine->frame.reg_offset[regno];
5690
5691 if (regno2 <= limit
5692 && !cfun->machine->reg_is_wrapped_separately[regno2]
5693 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5694 {
5695 rtx reg2 = gen_rtx_REG (mode, regno2);
5696 rtx mem2;
5697
5698 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5699 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5700 offset));
5701 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5702 reg2));
5703
5704 /* The first part of a frame-related parallel insn is
5705 always assumed to be relevant to the frame
5706 calculations; subsequent parts are only
5707 frame-related if explicitly marked. */
5708 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5709 regno = regno2;
5710 }
5711 else
5712 insn = emit_move_insn (mem, reg);
5713
5714 RTX_FRAME_RELATED_P (insn) = 1;
5715 }
5716 }
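
/* For instance (illustrative offsets), if x19 and x20 are both to be saved,
   at offsets 16 and 24, the offset difference equals
   GET_MODE_SIZE (DImode), so the two saves are merged into a single
   "stp x19, x20, [sp, 16]" rather than two separate str instructions.  */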
5717
5718 /* Emit code to restore the callee registers of mode MODE from register
5719 number START up to and including LIMIT. Restore from the stack offset
5720 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5721 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5722
5723 static void
5724 aarch64_restore_callee_saves (machine_mode mode,
5725 poly_int64 start_offset, unsigned start,
5726 unsigned limit, bool skip_wb, rtx *cfi_ops)
5727 {
5728 rtx base_rtx = stack_pointer_rtx;
5729 unsigned regno;
5730 unsigned regno2;
5731 poly_int64 offset;
5732
5733 for (regno = aarch64_next_callee_save (start, limit);
5734 regno <= limit;
5735 regno = aarch64_next_callee_save (regno + 1, limit))
5736 {
5737 if (cfun->machine->reg_is_wrapped_separately[regno])
5738 continue;
5739
5740 rtx reg, mem;
5741 int offset_diff;
5742
5743 if (skip_wb
5744 && (regno == cfun->machine->frame.wb_candidate1
5745 || regno == cfun->machine->frame.wb_candidate2))
5746 continue;
5747
5748 reg = gen_rtx_REG (mode, regno);
5749 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5750 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5751
5752 regno2 = aarch64_next_callee_save (regno + 1, limit);
5753 offset_diff = cfun->machine->frame.reg_offset[regno2]
5754 - cfun->machine->frame.reg_offset[regno];
5755
5756 if (regno2 <= limit
5757 && !cfun->machine->reg_is_wrapped_separately[regno2]
5758 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5759 {
5760 rtx reg2 = gen_rtx_REG (mode, regno2);
5761 rtx mem2;
5762
5763 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5764 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5765 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5766
5767 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5768 regno = regno2;
5769 }
5770 else
5771 emit_move_insn (reg, mem);
5772 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5773 }
5774 }
5775
5776 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5777 of MODE. */
5778
5779 static inline bool
5780 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5781 {
5782 HOST_WIDE_INT multiple;
5783 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5784 && IN_RANGE (multiple, -8, 7));
5785 }
5786
5787 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5788 of MODE. */
5789
5790 static inline bool
5791 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5792 {
5793 HOST_WIDE_INT multiple;
5794 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5795 && IN_RANGE (multiple, 0, 63));
5796 }
5797
5798 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5799 of MODE. */
5800
5801 bool
5802 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5803 {
5804 HOST_WIDE_INT multiple;
5805 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5806 && IN_RANGE (multiple, -64, 63));
5807 }
5808
5809 /* Return true if OFFSET is a signed 9-bit value. */
5810
5811 bool
5812 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5813 poly_int64 offset)
5814 {
5815 HOST_WIDE_INT const_offset;
5816 return (offset.is_constant (&const_offset)
5817 && IN_RANGE (const_offset, -256, 255));
5818 }
5819
5820 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5821 of MODE. */
5822
5823 static inline bool
5824 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5825 {
5826 HOST_WIDE_INT multiple;
5827 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5828 && IN_RANGE (multiple, -256, 255));
5829 }
5830
5831 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5832 of MODE. */
5833
5834 static inline bool
5835 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5836 {
5837 HOST_WIDE_INT multiple;
5838 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5839 && IN_RANGE (multiple, 0, 4095));
5840 }
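
/* As a concrete instance of the ranges above, for DImode (8-byte units):
   offset_4bit_signed_scaled_p accepts -64..56,
   offset_6bit_unsigned_scaled_p accepts 0..504,
   aarch64_offset_7bit_signed_scaled_p accepts -512..504,
   offset_9bit_signed_scaled_p accepts -2048..2040 and
   offset_12bit_unsigned_scaled_p accepts 0..32760 (all in steps of 8),
   while aarch64_offset_9bit_signed_unscaled_p accepts any constant offset
   in -256..255.  */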
5841
5842 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5843
5844 static sbitmap
5845 aarch64_get_separate_components (void)
5846 {
5847 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5848 bitmap_clear (components);
5849
5850 /* The registers we need saved to the frame. */
5851 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5852 if (aarch64_register_saved_on_entry (regno))
5853 {
5854 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5855 if (!frame_pointer_needed)
5856 offset += cfun->machine->frame.frame_size
5857 - cfun->machine->frame.hard_fp_offset;
5858 /* Check that we can access the stack slot of the register with one
5859 direct load with no adjustments needed. */
5860 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5861 bitmap_set_bit (components, regno);
5862 }
5863
5864 /* Don't mess with the hard frame pointer. */
5865 if (frame_pointer_needed)
5866 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5867
5868 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5869 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5870 /* If registers have been chosen to be stored/restored with
5871 writeback don't interfere with them to avoid having to output explicit
5872 stack adjustment instructions. */
5873 if (reg2 != INVALID_REGNUM)
5874 bitmap_clear_bit (components, reg2);
5875 if (reg1 != INVALID_REGNUM)
5876 bitmap_clear_bit (components, reg1);
5877
5878 bitmap_clear_bit (components, LR_REGNUM);
5879 bitmap_clear_bit (components, SP_REGNUM);
5880
5881 return components;
5882 }
5883
5884 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5885
5886 static sbitmap
5887 aarch64_components_for_bb (basic_block bb)
5888 {
5889 bitmap in = DF_LIVE_IN (bb);
5890 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5891 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5892 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5893
5894 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5895 bitmap_clear (components);
5896
5897 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5898 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5899 if ((!call_used_regs[regno]
5900 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5901 && (bitmap_bit_p (in, regno)
5902 || bitmap_bit_p (gen, regno)
5903 || bitmap_bit_p (kill, regno)))
5904 {
5905 unsigned regno2, offset, offset2;
5906 bitmap_set_bit (components, regno);
5907
5908 /* If there is a callee-save at an adjacent offset, add it too
5909 to increase the use of LDP/STP. */
5910 offset = cfun->machine->frame.reg_offset[regno];
5911 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5912
5913 if (regno2 <= LAST_SAVED_REGNUM)
5914 {
5915 offset2 = cfun->machine->frame.reg_offset[regno2];
5916 if ((offset & ~8) == (offset2 & ~8))
5917 bitmap_set_bit (components, regno2);
5918 }
5919 }
5920
5921 return components;
5922 }
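
/* An illustrative instance of the pairing above: if x21 is live in a block
   and saved at offset 24, then (offset & 8) != 0 gives regno2 = x20; with
   x20 saved at offset 16 we have (24 & ~8) == (16 & ~8) == 16, so x20 is
   added as well and the two can later be handled with a single ldp/stp.  */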
5923
5924 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5925 Nothing to do for aarch64. */
5926
5927 static void
5928 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5929 {
5930 }
5931
5932 /* Return the next set bit in BMP from START onwards. Return the total number
5933 of bits in BMP if no set bit is found at or after START. */
5934
5935 static unsigned int
5936 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5937 {
5938 unsigned int nbits = SBITMAP_SIZE (bmp);
5939 if (start == nbits)
5940 return start;
5941
5942 gcc_assert (start < nbits);
5943 for (unsigned int i = start; i < nbits; i++)
5944 if (bitmap_bit_p (bmp, i))
5945 return i;
5946
5947 return nbits;
5948 }
5949
5950 /* Do the work for aarch64_emit_prologue_components and
5951 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5952 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5953 for these components or the epilogue sequence. That is, it determines
5954 whether we should emit stores or loads and what kind of CFA notes to attach
5955 to the insns. Otherwise the logic for the two sequences is very
5956 similar. */
5957
5958 static void
5959 aarch64_process_components (sbitmap components, bool prologue_p)
5960 {
5961 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5962 ? HARD_FRAME_POINTER_REGNUM
5963 : STACK_POINTER_REGNUM);
5964
5965 unsigned last_regno = SBITMAP_SIZE (components);
5966 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5967 rtx_insn *insn = NULL;
5968
5969 while (regno != last_regno)
5970 {
5971 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5972 so DFmode for the vector registers is enough. For simd functions
5973 we want to save the low 128 bits. */
5974 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5975
5976 rtx reg = gen_rtx_REG (mode, regno);
5977 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5978 if (!frame_pointer_needed)
5979 offset += cfun->machine->frame.frame_size
5980 - cfun->machine->frame.hard_fp_offset;
5981 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5982 rtx mem = gen_frame_mem (mode, addr);
5983
5984 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5985 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5986 /* No more registers to handle after REGNO.
5987 Emit a single save/restore and exit. */
5988 if (regno2 == last_regno)
5989 {
5990 insn = emit_insn (set);
5991 RTX_FRAME_RELATED_P (insn) = 1;
5992 if (prologue_p)
5993 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5994 else
5995 add_reg_note (insn, REG_CFA_RESTORE, reg);
5996 break;
5997 }
5998
5999 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6000 /* The next register is not of the same class or its offset is not
6001 mergeable with the current one into a pair. */
6002 if (!satisfies_constraint_Ump (mem)
6003 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6004 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6005 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6006 GET_MODE_SIZE (mode)))
6007 {
6008 insn = emit_insn (set);
6009 RTX_FRAME_RELATED_P (insn) = 1;
6010 if (prologue_p)
6011 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6012 else
6013 add_reg_note (insn, REG_CFA_RESTORE, reg);
6014
6015 regno = regno2;
6016 continue;
6017 }
6018
6019 /* REGNO2 can be saved/restored in a pair with REGNO. */
6020 rtx reg2 = gen_rtx_REG (mode, regno2);
6021 if (!frame_pointer_needed)
6022 offset2 += cfun->machine->frame.frame_size
6023 - cfun->machine->frame.hard_fp_offset;
6024 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6025 rtx mem2 = gen_frame_mem (mode, addr2);
6026 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6027 : gen_rtx_SET (reg2, mem2);
6028
6029 if (prologue_p)
6030 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6031 else
6032 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6033
6034 RTX_FRAME_RELATED_P (insn) = 1;
6035 if (prologue_p)
6036 {
6037 add_reg_note (insn, REG_CFA_OFFSET, set);
6038 add_reg_note (insn, REG_CFA_OFFSET, set2);
6039 }
6040 else
6041 {
6042 add_reg_note (insn, REG_CFA_RESTORE, reg);
6043 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6044 }
6045
6046 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6047 }
6048 }
6049
6050 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6051
6052 static void
6053 aarch64_emit_prologue_components (sbitmap components)
6054 {
6055 aarch64_process_components (components, true);
6056 }
6057
6058 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6059
6060 static void
6061 aarch64_emit_epilogue_components (sbitmap components)
6062 {
6063 aarch64_process_components (components, false);
6064 }
6065
6066 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6067
6068 static void
6069 aarch64_set_handled_components (sbitmap components)
6070 {
6071 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6072 if (bitmap_bit_p (components, regno))
6073 cfun->machine->reg_is_wrapped_separately[regno] = true;
6074 }
6075
6076 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6077 determine the probe offset for alloca. */
6078
6079 static HOST_WIDE_INT
6080 aarch64_stack_clash_protection_alloca_probe_range (void)
6081 {
6082 return STACK_CLASH_CALLER_GUARD;
6083 }
6084
6085
6086 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6087 registers. If POLY_SIZE is not large enough to require a probe this function
6088 will only adjust the stack. When allocating the stack space
6089 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6090 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6091 arguments. If we are, then we ensure that any allocation larger than the ABI
6092 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6093 maintained.
6094
6095 We emit barriers after each stack adjustment to prevent optimizations from
6096 breaking the invariant that we never drop the stack by more than a page.
6097 This invariant is needed to make it easier to handle asynchronous events
6098 correctly: if we were to allow the stack to be dropped by more than a page
6099 and then emit several probes afterwards, and a signal arrived somewhere in
6100 between, the signal handler would not know the state of the stack and could
6101 make no assumptions about which pages have been probed. */
6102
6103 static void
6104 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6105 poly_int64 poly_size,
6106 bool frame_related_p,
6107 bool final_adjustment_p)
6108 {
6109 HOST_WIDE_INT guard_size
6110 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6111 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6112 /* When doing the final adjustment for the outgoing argument size we can't
6113 assume that LR was saved at position 0. So subtract its offset from the
6114 ABI safe buffer so that we don't accidentally allow an adjustment that
6115 would result in an allocation larger than the ABI buffer without
6116 probing. */
6117 HOST_WIDE_INT min_probe_threshold
6118 = final_adjustment_p
6119 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6120 : guard_size - guard_used_by_caller;
6121
6122 poly_int64 frame_size = cfun->machine->frame.frame_size;
6123
6124 /* We should always have a positive probe threshold. */
6125 gcc_assert (min_probe_threshold > 0);
6126
6127 if (flag_stack_clash_protection && !final_adjustment_p)
6128 {
6129 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6130 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6131
6132 if (known_eq (frame_size, 0))
6133 {
6134 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6135 }
6136 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6137 && known_lt (final_adjust, guard_used_by_caller))
6138 {
6139 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6140 }
6141 }
6142
6143 /* If SIZE is not large enough to require probing, just adjust the stack and
6144 exit. */
6145 if (known_lt (poly_size, min_probe_threshold)
6146 || !flag_stack_clash_protection)
6147 {
6148 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6149 return;
6150 }
6151
6152 HOST_WIDE_INT size;
6153 /* Handle the SVE non-constant case first. */
6154 if (!poly_size.is_constant (&size))
6155 {
6156 if (dump_file)
6157 {
6158 fprintf (dump_file, "Stack clash SVE prologue: ");
6159 print_dec (poly_size, dump_file);
6160 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6161 }
6162
6163 /* First calculate the amount of bytes we're actually spilling. */
6164 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6165 poly_size, temp1, temp2, false, true);
6166
6167 rtx_insn *insn = get_last_insn ();
6168
6169 if (frame_related_p)
6170 {
6171 /* This is done to provide unwinding information for the stack
6172 adjustments we're about to do; however, to prevent the optimizers
6173 from removing the R11 move and leaving the CFA note (which would be
6174 very wrong) we tie the old and new stack pointer together.
6175 The tie will expand to nothing but the optimizers will not touch
6176 the instruction. */
6177 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6178 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6179 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6180
6181 /* We want the CFA independent of the stack pointer for the
6182 duration of the loop. */
6183 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6184 RTX_FRAME_RELATED_P (insn) = 1;
6185 }
6186
6187 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6188 rtx guard_const = gen_int_mode (guard_size, Pmode);
6189
6190 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6191 stack_pointer_rtx, temp1,
6192 probe_const, guard_const));
6193
6194 /* Now reset the CFA register if needed. */
6195 if (frame_related_p)
6196 {
6197 add_reg_note (insn, REG_CFA_DEF_CFA,
6198 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6199 gen_int_mode (poly_size, Pmode)));
6200 RTX_FRAME_RELATED_P (insn) = 1;
6201 }
6202
6203 return;
6204 }
6205
6206 if (dump_file)
6207 fprintf (dump_file,
6208 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6209 " bytes, probing will be required.\n", size);
6210
6211 /* Round size to the nearest multiple of guard_size, and calculate the
6212 residual as the difference between the original size and the rounded
6213 size. */
6214 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6215 HOST_WIDE_INT residual = size - rounded_size;
6216
6217 /* We can handle a small number of allocations/probes inline. Otherwise
6218 punt to a loop. */
6219 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6220 {
6221 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6222 {
6223 aarch64_sub_sp (NULL, temp2, guard_size, true);
6224 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6225 guard_used_by_caller));
6226 emit_insn (gen_blockage ());
6227 }
6228 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6229 }
6230 else
6231 {
6232 /* Compute the ending address. */
6233 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6234 temp1, NULL, false, true);
6235 rtx_insn *insn = get_last_insn ();
6236
6237 /* For the initial allocation, we don't have a frame pointer
6238 set up, so we always need CFI notes. If we're doing the
6239 final allocation, then we may have a frame pointer, in which
6240 case it is the CFA, otherwise we need CFI notes.
6241
6242 We can determine which allocation we are doing by looking at
6243 the value of FRAME_RELATED_P since the final allocations are not
6244 frame related. */
6245 if (frame_related_p)
6246 {
6247 /* We want the CFA independent of the stack pointer for the
6248 duration of the loop. */
6249 add_reg_note (insn, REG_CFA_DEF_CFA,
6250 plus_constant (Pmode, temp1, rounded_size));
6251 RTX_FRAME_RELATED_P (insn) = 1;
6252 }
6253
6254 /* This allocates and probes the stack. Note that this re-uses some of
6255 the existing Ada stack protection code. However we are guaranteed not
6256 to enter the non-loop or residual branches of that code.
6257
6258 The non-loop part won't be entered because if our allocation amount
6259 doesn't require a loop, the case above would handle it.
6260
6261 The residual branch won't be entered because TEMP1 is a multiple of
6262 the allocation size, so the residual will always be 0. As such, the only
6263 part we are actually using from that code is the loop setup. The
6264 actual probing is done in aarch64_output_probe_stack_range. */
6265 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6266 stack_pointer_rtx, temp1));
6267
6268 /* Now reset the CFA register if needed. */
6269 if (frame_related_p)
6270 {
6271 add_reg_note (insn, REG_CFA_DEF_CFA,
6272 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6273 RTX_FRAME_RELATED_P (insn) = 1;
6274 }
6275
6276 emit_insn (gen_blockage ());
6277 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6278 }
6279
6280 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6281 be probed. This maintains the requirement that each page is probed at
6282 least once. For initial probing we probe only if the allocation is
6283 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6284 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6285 GUARD_SIZE. This ensures that any allocation large enough to trigger
6286 a probe here gets at least one, and that any allocation not large
6287 enough for this code to emit anything for it will have had its page
6288 probed by the saving of FP/LR, either by this function or any callees. If
6289 we don't have any callees then we won't have more stack adjustments and so
6290 are still safe. */
6291 if (residual)
6292 {
6293 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6294 /* If we're doing final adjustments, and we've done any full page
6295 allocations then any residual needs to be probed. */
6296 if (final_adjustment_p && rounded_size != 0)
6297 min_probe_threshold = 0;
6298 /* If doing a small final adjustment, we always probe at offset 0.
6299 This is done to avoid issues when LR is not at position 0 or when
6300 the final adjustment is smaller than the probing offset. */
6301 else if (final_adjustment_p && rounded_size == 0)
6302 residual_probe_offset = 0;
6303
6304 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6305 if (residual >= min_probe_threshold)
6306 {
6307 if (dump_file)
6308 fprintf (dump_file,
6309 "Stack clash AArch64 prologue residuals: "
6310 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6311 "\n", residual);
6312
6313 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6314 residual_probe_offset));
6315 emit_insn (gen_blockage ());
6316 }
6317 }
6318 }
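
/* A sketch of the inline (unrolled) case, assuming the default 64KB guard
   and 1KB caller buffer and an allocation small enough to stay under the
   unroll limit: a constant initial allocation of 131172 bytes (two
   guard-sized pages plus 100 bytes of residual) is emitted roughly as

	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 100

   with no residual probe, since 100 is below the probing threshold.  */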
6319
6320 /* Return 1 if the register is used by the epilogue. We need to say the
6321 return register is used, but only after epilogue generation is complete.
6322 Note that in the case of sibcalls, the values "used by the epilogue" are
6323 considered live at the start of the called function.
6324
6325 For SIMD functions we need to return 1 for FP registers that are saved and
6326 restored by a function but are not zero in call_used_regs. If we do not do
6327 this, optimizations may remove the restore of the register. */
6328
6329 int
6330 aarch64_epilogue_uses (int regno)
6331 {
6332 if (epilogue_completed)
6333 {
6334 if (regno == LR_REGNUM)
6335 return 1;
6336 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6337 return 1;
6338 }
6339 return 0;
6340 }
6341
6342 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6343 is saved at BASE + OFFSET. */
6344
6345 static void
6346 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6347 rtx base, poly_int64 offset)
6348 {
6349 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6350 add_reg_note (insn, REG_CFA_EXPRESSION,
6351 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6352 }
6353
6354 /* AArch64 stack frames generated by this compiler look like:
6355
6356 +-------------------------------+
6357 | |
6358 | incoming stack arguments |
6359 | |
6360 +-------------------------------+
6361 | | <-- incoming stack pointer (aligned)
6362 | callee-allocated save area |
6363 | for register varargs |
6364 | |
6365 +-------------------------------+
6366 | local variables | <-- frame_pointer_rtx
6367 | |
6368 +-------------------------------+
6369 | padding | \
6370 +-------------------------------+ |
6371 | callee-saved registers | | frame.saved_regs_size
6372 +-------------------------------+ |
6373 | LR' | |
6374 +-------------------------------+ |
6375 | FP' | / <- hard_frame_pointer_rtx (aligned)
6376 +-------------------------------+
6377 | dynamic allocation |
6378 +-------------------------------+
6379 | padding |
6380 +-------------------------------+
6381 | outgoing stack arguments | <-- arg_pointer
6382 | |
6383 +-------------------------------+
6384 | | <-- stack_pointer_rtx (aligned)
6385
6386 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6387 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6388 unchanged.
6389
6390 By default for stack-clash we assume the guard is at least 64KB, but this
6391 value is configurable to either 4KB or 64KB. We also force the guard size to
6392 be the same as the probing interval and both values are kept in sync.
6393
6394 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6395 on the guard size) of stack space without probing.
6396
6397 When probing is needed, we emit a probe at the start of the prologue
6398 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6399
6400 We have to track how much space has been allocated and the only stores
6401 to the stack we track as implicit probes are the FP/LR stores.
6402
6403 For outgoing arguments we probe if the size is larger than 1KB, such that
6404 the ABI specified buffer is maintained for the next callee.
6405
6406 The following registers are reserved during frame layout and should not be
6407 used for any other purpose:
6408
6409 - r11: Used by stack clash protection when SVE is enabled.
6410 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6411 - r14 and r15: Used for speculation tracking.
6412 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6413 - r30(LR), r29(FP): Used by standard frame layout.
6414
6415 These registers must be avoided in frame layout related code unless the
6416 explicit intention is to interact with one of the features listed above. */
6417
6418 /* Generate the prologue instructions for entry into a function.
6419 Establish the stack frame by decreasing the stack pointer with a
6420 properly calculated size and, if necessary, create a frame record
6421 filled with the values of LR and previous frame pointer. The
6422 current FP is also set up if it is in use. */
6423
6424 void
6425 aarch64_expand_prologue (void)
6426 {
6427 poly_int64 frame_size = cfun->machine->frame.frame_size;
6428 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6429 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6430 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6431 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6432 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6433 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6434 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6435 rtx_insn *insn;
6436
6437 /* Sign the return address if return address signing is enabled. */
6438 if (aarch64_return_address_signing_enabled ())
6439 {
6440 switch (aarch64_ra_sign_key)
6441 {
6442 case AARCH64_KEY_A:
6443 insn = emit_insn (gen_paciasp ());
6444 break;
6445 case AARCH64_KEY_B:
6446 insn = emit_insn (gen_pacibsp ());
6447 break;
6448 default:
6449 gcc_unreachable ();
6450 }
6451 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6452 RTX_FRAME_RELATED_P (insn) = 1;
6453 }
6454
6455 if (flag_stack_usage_info)
6456 current_function_static_stack_size = constant_lower_bound (frame_size);
6457
6458 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6459 {
6460 if (crtl->is_leaf && !cfun->calls_alloca)
6461 {
6462 if (maybe_gt (frame_size, PROBE_INTERVAL)
6463 && maybe_gt (frame_size, get_stack_check_protect ()))
6464 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6465 (frame_size
6466 - get_stack_check_protect ()));
6467 }
6468 else if (maybe_gt (frame_size, 0))
6469 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6470 }
6471
6472 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6473 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6474
6475 /* In theory we should never have both an initial adjustment
6476 and a callee save adjustment. Verify that is the case since the
6477 code below does not handle it for -fstack-clash-protection. */
6478 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6479
6480 /* Will only probe if the initial adjustment is larger than the guard
6481 less the amount of the guard reserved for use by the caller's
6482 outgoing args. */
6483 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6484 true, false);
6485
6486 if (callee_adjust != 0)
6487 aarch64_push_regs (reg1, reg2, callee_adjust);
6488
6489 if (emit_frame_chain)
6490 {
6491 poly_int64 reg_offset = callee_adjust;
6492 if (callee_adjust == 0)
6493 {
6494 reg1 = R29_REGNUM;
6495 reg2 = R30_REGNUM;
6496 reg_offset = callee_offset;
6497 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6498 }
6499 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6500 stack_pointer_rtx, callee_offset,
6501 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6502 if (frame_pointer_needed && !frame_size.is_constant ())
6503 {
6504 /* Variable-sized frames need to describe the save slot
6505 address using DW_CFA_expression rather than DW_CFA_offset.
6506 This means that, without taking further action, the
6507 locations of the registers that we've already saved would
6508 remain based on the stack pointer even after we redefine
6509 the CFA based on the frame pointer. We therefore need new
6510 DW_CFA_expressions to re-express the save slots with addresses
6511 based on the frame pointer. */
6512 rtx_insn *insn = get_last_insn ();
6513 gcc_assert (RTX_FRAME_RELATED_P (insn));
6514
6515 /* Add an explicit CFA definition if this was previously
6516 implicit. */
6517 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6518 {
6519 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6520 callee_offset);
6521 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6522 gen_rtx_SET (hard_frame_pointer_rtx, src));
6523 }
6524
6525 /* Change the save slot expressions for the registers that
6526 we've already saved. */
6527 reg_offset -= callee_offset;
6528 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6529 reg_offset + UNITS_PER_WORD);
6530 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6531 reg_offset);
6532 }
6533 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6534 }
6535
6536 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6537 callee_adjust != 0 || emit_frame_chain);
6538 if (aarch64_simd_decl_p (cfun->decl))
6539 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6540 callee_adjust != 0 || emit_frame_chain);
6541 else
6542 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6543 callee_adjust != 0 || emit_frame_chain);
6544
6545 /* We may need to probe the final adjustment if it is larger than the guard
6546 that is assumed by the callee. */
6547 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6548 !frame_pointer_needed, true);
6549 }
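
/* Tying this to the layout example given after aarch64_layout_frame
   (callee_adjust == 48, frame chain needed, x19 saved at offset 16), the
   emitted prologue is roughly:

	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	str	x19, [sp, 16]

   preceded by a "paciasp" (or "pacibsp") when return address signing is
   enabled.  */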
6550
6551 /* Return TRUE if we can use a simple_return insn.
6552
6553 This function checks whether the callee saved stack is empty, which
6554 means no restore actions are needed. The pro_and_epilogue pass will use
6555 this to check whether shrink-wrapping opt is feasible. */
6556
6557 bool
6558 aarch64_use_return_insn_p (void)
6559 {
6560 if (!reload_completed)
6561 return false;
6562
6563 if (crtl->profile)
6564 return false;
6565
6566 return known_eq (cfun->machine->frame.frame_size, 0);
6567 }
6568
6569 /* Return false for non-leaf SIMD functions in order to avoid
6570 shrink-wrapping them, since that would lose the necessary
6571 save/restore of FP registers. */
6572
6573 bool
6574 aarch64_use_simple_return_insn_p (void)
6575 {
6576 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6577 return false;
6578
6579 return true;
6580 }
6581
6582 /* Generate the epilogue instructions for returning from a function.
6583 This is almost exactly the reverse of the prolog sequence, except
6584 that we need to insert barriers to avoid scheduling loads that read
6585 from a deallocated stack, and we optimize the unwind records by
6586 emitting them all together if possible. */
6587 void
6588 aarch64_expand_epilogue (bool for_sibcall)
6589 {
6590 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6591 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6592 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6593 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6594 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6595 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6596 rtx cfi_ops = NULL;
6597 rtx_insn *insn;
6598 /* A stack clash protection prologue may not have left EP0_REGNUM or
6599 EP1_REGNUM in a usable state. The same is true for allocations
6600 with an SVE component, since we then need both temporary registers
6601 for each allocation. For stack clash we are in a usable state if
6602 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6603 HOST_WIDE_INT guard_size
6604 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6605 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6606
6607 /* We can re-use the registers when the allocation amount is smaller than
6608 guard_size - guard_used_by_caller because we won't be doing any probes
6609 then. In such situations the register should remain live with the correct
6610 value. */
6611 bool can_inherit_p = (initial_adjust.is_constant ()
6612 && final_adjust.is_constant ())
6613 && (!flag_stack_clash_protection
6614 || known_lt (initial_adjust,
6615 guard_size - guard_used_by_caller));
6616
6617 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6618 bool need_barrier_p
6619 = maybe_ne (get_frame_size ()
6620 + cfun->machine->frame.saved_varargs_size, 0);
6621
6622 /* Emit a barrier to prevent loads from a deallocated stack. */
6623 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6624 || cfun->calls_alloca
6625 || crtl->calls_eh_return)
6626 {
6627 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6628 need_barrier_p = false;
6629 }
6630
6631 /* Restore the stack pointer from the frame pointer if it may not
6632 be the same as the stack pointer. */
6633 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6634 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6635 if (frame_pointer_needed
6636 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6637 /* If writeback is used when restoring callee-saves, the CFA
6638 is restored on the instruction doing the writeback. */
6639 aarch64_add_offset (Pmode, stack_pointer_rtx,
6640 hard_frame_pointer_rtx, -callee_offset,
6641 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6642 else
6643 /* The case where we need to re-use the register here is very rare, so
6644 avoid the complicated condition and just always emit a move if the
6645 immediate doesn't fit. */
6646 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6647
6648 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6649 callee_adjust != 0, &cfi_ops);
6650 if (aarch64_simd_decl_p (cfun->decl))
6651 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6652 callee_adjust != 0, &cfi_ops);
6653 else
6654 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6655 callee_adjust != 0, &cfi_ops);
6656
6657 if (need_barrier_p)
6658 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6659
6660 if (callee_adjust != 0)
6661 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6662
6663 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6664 {
6665 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6666 insn = get_last_insn ();
6667 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6668 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6669 RTX_FRAME_RELATED_P (insn) = 1;
6670 cfi_ops = NULL;
6671 }
6672
6673 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6674 restrict the emit_move optimization to leaf functions. */
6675 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6676 (!can_inherit_p || !crtl->is_leaf
6677 || df_regs_ever_live_p (EP0_REGNUM)));
6678
6679 if (cfi_ops)
6680 {
6681 /* Emit delayed restores and reset the CFA to be SP. */
6682 insn = get_last_insn ();
6683 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6684 REG_NOTES (insn) = cfi_ops;
6685 RTX_FRAME_RELATED_P (insn) = 1;
6686 }
6687
6688 /* We prefer to emit the combined return/authenticate instruction RETAA,
6689 however there are three cases in which we must instead emit an explicit
6690 authentication instruction.
6691
6692 1) Sibcalls don't return in a normal way, so if we're about to call one
6693 we must authenticate.
6694
6695 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6696 generating code for !TARGET_ARMV8_3 we can't use it and must
6697 explicitly authenticate.
6698
6699 3) On an eh_return path we make extra stack adjustments to update the
6700 canonical frame address to be the exception handler's CFA. We want
6701 to authenticate using the CFA of the function which calls eh_return.
6702 */
6703 if (aarch64_return_address_signing_enabled ()
6704 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6705 {
6706 switch (aarch64_ra_sign_key)
6707 {
6708 case AARCH64_KEY_A:
6709 insn = emit_insn (gen_autiasp ());
6710 break;
6711 case AARCH64_KEY_B:
6712 insn = emit_insn (gen_autibsp ());
6713 break;
6714 default:
6715 gcc_unreachable ();
6716 }
6717 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6718 RTX_FRAME_RELATED_P (insn) = 1;
6719 }
6720
6721 /* Stack adjustment for exception handler. */
6722 if (crtl->calls_eh_return && !for_sibcall)
6723 {
6724 /* We need to unwind the stack by the offset computed by
6725 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6726 to be SP; letting the CFA move during this adjustment
6727 is just as correct as retaining the CFA from the body
6728 of the function. Therefore, do nothing special. */
6729 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6730 }
6731
6732 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6733 if (!for_sibcall)
6734 emit_jump_insn (ret_rtx);
6735 }
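
/* Continuing the same example, the matching epilogue is roughly:

	ldr	x19, [sp, 16]
	ldp	x29, x30, [sp], 48
	ret

   with an "autiasp"/"autibsp" (or a combined "retaa") when return address
   signing is enabled.  */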
6736
6737 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6738 normally or return to a previous frame after unwinding.
6739
6740 An EH return uses a single shared return sequence. The epilogue is
6741 exactly like a normal epilogue except that it has an extra input
6742 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6743 that must be applied after the frame has been destroyed. An extra label
6744 is inserted before the epilogue which initializes this register to zero,
6745 and this is the entry point for a normal return.
6746
6747 An actual EH return updates the return address, initializes the stack
6748 adjustment and jumps directly into the epilogue (bypassing the zeroing
6749 of the adjustment). Since the return address is typically saved on the
6750 stack when a function makes a call, the saved LR must be updated outside
6751 the epilogue.
6752
6753 This poses problems as the store is generated well before the epilogue,
6754 so the offset of LR is not known yet. Also optimizations will remove the
6755 store as it appears dead, even after the epilogue is generated (as the
6756 base or offset for loading LR is different in many cases).
6757
6758 To avoid these problems this implementation forces the frame pointer
6759 in eh_return functions so that the location of LR is fixed and known early.
6760 It also marks the store volatile, so no optimization is permitted to
6761 remove the store. */
6762 rtx
6763 aarch64_eh_return_handler_rtx (void)
6764 {
6765 rtx tmp = gen_frame_mem (Pmode,
6766 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6767
6768 /* Mark the store volatile, so no optimization is permitted to remove it. */
6769 MEM_VOLATILE_P (tmp) = true;
6770 return tmp;
6771 }
6772
6773 /* Output code to add DELTA to the first argument, and then jump
6774 to FUNCTION. Used for C++ multiple inheritance. */
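/* Illustrative sketch (informal, not the exact emitted sequence, which depends
   on DELTA and VCALL_OFFSET): for a small DELTA and no vcall offset the thunk
   is conceptually just "add x0, x0, #DELTA" followed by a tail-call branch
   to FUNCTION.  */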
6775 static void
6776 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6777 HOST_WIDE_INT delta,
6778 HOST_WIDE_INT vcall_offset,
6779 tree function)
6780 {
6781 /* The this pointer is always in x0. Note that this differs from
6782 Arm, where the this pointer may be bumped to r1 if r0 is required
6783 to return a pointer to an aggregate. On AArch64 a result value
6784 pointer will be in x8. */
6785 int this_regno = R0_REGNUM;
6786 rtx this_rtx, temp0, temp1, addr, funexp;
6787 rtx_insn *insn;
6788 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6789
6790 if (aarch64_bti_enabled ())
6791 emit_insn (gen_bti_c ());
6792
6793 reload_completed = 1;
6794 emit_note (NOTE_INSN_PROLOGUE_END);
6795
6796 this_rtx = gen_rtx_REG (Pmode, this_regno);
6797 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6798 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6799
6800 if (vcall_offset == 0)
6801 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6802 else
6803 {
6804 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6805
6806 addr = this_rtx;
6807 if (delta != 0)
6808 {
6809 if (delta >= -256 && delta < 256)
6810 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6811 plus_constant (Pmode, this_rtx, delta));
6812 else
6813 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6814 temp1, temp0, false);
6815 }
6816
6817 if (Pmode == ptr_mode)
6818 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6819 else
6820 aarch64_emit_move (temp0,
6821 gen_rtx_ZERO_EXTEND (Pmode,
6822 gen_rtx_MEM (ptr_mode, addr)));
6823
6824 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6825 addr = plus_constant (Pmode, temp0, vcall_offset);
6826 else
6827 {
6828 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6829 Pmode);
6830 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6831 }
6832
6833 if (Pmode == ptr_mode)
6834 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6835 else
6836 aarch64_emit_move (temp1,
6837 gen_rtx_SIGN_EXTEND (Pmode,
6838 gen_rtx_MEM (ptr_mode, addr)));
6839
6840 emit_insn (gen_add2_insn (this_rtx, temp1));
6841 }
6842
6843 /* Generate a tail call to the target function. */
6844 if (!TREE_USED (function))
6845 {
6846 assemble_external (function);
6847 TREE_USED (function) = 1;
6848 }
6849 funexp = XEXP (DECL_RTL (function), 0);
6850 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6851 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6852 SIBLING_CALL_P (insn) = 1;
6853
6854 insn = get_insns ();
6855 shorten_branches (insn);
6856
6857 assemble_start_function (thunk, fnname);
6858 final_start_function (insn, file, 1);
6859 final (insn, file, 1);
6860 final_end_function ();
6861 assemble_end_function (thunk, fnname);
6862
6863 /* Stop pretending to be a post-reload pass. */
6864 reload_completed = 0;
6865 }
6866
6867 static bool
6868 aarch64_tls_referenced_p (rtx x)
6869 {
6870 if (!TARGET_HAVE_TLS)
6871 return false;
6872 subrtx_iterator::array_type array;
6873 FOR_EACH_SUBRTX (iter, array, x, ALL)
6874 {
6875 const_rtx x = *iter;
6876 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6877 return true;
6878 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6879 TLS offsets, not real symbol references. */
6880 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6881 iter.skip_subrtxes ();
6882 }
6883 return false;
6884 }
6885
6886
6887 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6888 a left shift of 0 or 12 bits. */
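/* Illustrative examples (informal, not exhaustive): 0xabc and 0xabc000 are
   accepted (shifts of 0 and 12 respectively); 0xabc001 and 0x1000000 are
   rejected.  */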
6889 bool
6890 aarch64_uimm12_shift (HOST_WIDE_INT val)
6891 {
6892 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6893 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6894 );
6895 }
6896
6897 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6898 that can be created with a left shift of 0 or 12. */
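/* For example (illustrative): 0x456 is returned unchanged, while 0x123456
   is clamped to 0x123000.  */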
6899 static HOST_WIDE_INT
6900 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6901 {
6902 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6903 handle correctly. */
6904 gcc_assert ((val & 0xffffff) == val);
6905
6906 if (((val & 0xfff) << 0) == val)
6907 return val;
6908
6909 return val & (0xfff << 12);
6910 }
6911
6912 /* Return true if val is an immediate that can be loaded into a
6913 register by a MOVZ instruction. */
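/* For example (illustrative): 0x1234 and 0x12340000 are single-MOVZ
   immediates, whereas 0x12345678 is not.  */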
6914 static bool
6915 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6916 {
6917 if (GET_MODE_SIZE (mode) > 4)
6918 {
6919 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6920 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6921 return true;
6922 }
6923 else
6924 {
6925 /* Ignore sign extension. */
6926 val &= (HOST_WIDE_INT) 0xffffffff;
6927 }
6928 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6929 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6930 }
6931
6932 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6933 64-bit (DImode) integer. */
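/* For instance (illustrative), replicating the QImode value 0xab yields
   0xabababababababab.  */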
6934
6935 static unsigned HOST_WIDE_INT
6936 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6937 {
6938 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6939 while (size < 64)
6940 {
6941 val &= (HOST_WIDE_INT_1U << size) - 1;
6942 val |= val << size;
6943 size *= 2;
6944 }
6945 return val;
6946 }
6947
6948 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6949
6950 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6951 {
6952 0x0000000100000001ull,
6953 0x0001000100010001ull,
6954 0x0101010101010101ull,
6955 0x1111111111111111ull,
6956 0x5555555555555555ull,
6957 };
6958
6959
6960 /* Return true if val is a valid bitmask immediate. */
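/* A bitmask immediate is a (possibly rotated) run of consecutive ones,
   replicated at a power-of-two element size of 2, 4, 8, 16, 32 or 64 bits,
   excluding all-zeros and all-ones.  Illustrative examples:
   0x5555555555555555 and 0x00ff00ff00ff00ff are valid; 0, ~0 and
   0x0123456789abcdef are not.  */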
6961
6962 bool
6963 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6964 {
6965 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6966 int bits;
6967
6968 /* Check for a single sequence of one bits and return quickly if so.
6969 The special cases of all ones and all zeroes return false. */
6970 val = aarch64_replicate_bitmask_imm (val_in, mode);
6971 tmp = val + (val & -val);
6972
6973 if (tmp == (tmp & -tmp))
6974 return (val + 1) > 1;
6975
6976 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6977 if (mode == SImode)
6978 val = (val << 32) | (val & 0xffffffff);
6979
6980 /* Invert if the immediate doesn't start with a zero bit - this means we
6981 only need to search for sequences of one bits. */
6982 if (val & 1)
6983 val = ~val;
6984
6985 /* Find the first set bit and set tmp to val with the first sequence of one
6986 bits removed. Return success if there is a single sequence of ones. */
6987 first_one = val & -val;
6988 tmp = val & (val + first_one);
6989
6990 if (tmp == 0)
6991 return true;
6992
6993 /* Find the next set bit and compute the difference in bit position. */
6994 next_one = tmp & -tmp;
6995 bits = clz_hwi (first_one) - clz_hwi (next_one);
6996 mask = val ^ tmp;
6997
6998 /* Check the bit position difference is a power of 2, and that the first
6999 sequence of one bits fits within 'bits' bits. */
7000 if ((mask >> bits) != 0 || bits != (bits & -bits))
7001 return false;
7002
7003 /* Check the sequence of one bits is repeated 64/bits times. */
7004 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7005 }
7006
7007 /* Create a mask of ones covering the range from the lowest set bit to the
7008 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
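/* For example (illustrative): 0x0ff0 is returned unchanged, and 0x0110
   yields 0x01f0 (all bits from bit 4 to bit 8 inclusive).  */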
7009
7010 unsigned HOST_WIDE_INT
7011 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7012 {
7013 int lowest_bit_set = ctz_hwi (val_in);
7014 int highest_bit_set = floor_log2 (val_in);
7015 gcc_assert (val_in != 0);
7016
7017 return ((HOST_WIDE_INT_UC (2) << highest_bit_set)
7018 - (HOST_WIDE_INT_1U << lowest_bit_set));
7019 }
7020
7021 /* Create a constant in which all bits outside the range from the lowest set
7022 bit to the highest set bit of VAL_IN are set to 1. */
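/* Continuing the example above (illustrative): for 0x0110 the result is
   0xffffffffffffff1f, i.e. only the zero bits that lie strictly inside the
   run covered by aarch64_and_split_imm1 remain clear.  */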
7023
7024 unsigned HOST_WIDE_INT
7025 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7026 {
7027 return val_in | ~aarch64_and_split_imm1 (val_in);
7028 }
7029
7030 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7031
7032 bool
7033 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7034 {
7035 scalar_int_mode int_mode;
7036 if (!is_a <scalar_int_mode> (mode, &int_mode))
7037 return false;
7038
7039 if (aarch64_bitmask_imm (val_in, int_mode))
7040 return false;
7041
7042 if (aarch64_move_imm (val_in, int_mode))
7043 return false;
7044
7045 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7046
7047 return aarch64_bitmask_imm (imm2, int_mode);
7048 }
7049
7050 /* Return true if val is an immediate that can be loaded into a
7051 register in a single instruction. */
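/* For example (illustrative): 0x12340000 (MOVZ), 0xffffffffffffedcb (MOVN)
   and 0x5555555555555555 (bitmask ORR) are all single-instruction moves;
   0x123456789abcdef0 is not.  */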
7052 bool
7053 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7054 {
7055 scalar_int_mode int_mode;
7056 if (!is_a <scalar_int_mode> (mode, &int_mode))
7057 return false;
7058
7059 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7060 return true;
7061 return aarch64_bitmask_imm (val, int_mode);
7062 }
7063
7064 static bool
7065 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7066 {
7067 rtx base, offset;
7068
7069 if (GET_CODE (x) == HIGH)
7070 return true;
7071
7072 /* There's no way to calculate VL-based values using relocations. */
7073 subrtx_iterator::array_type array;
7074 FOR_EACH_SUBRTX (iter, array, x, ALL)
7075 if (GET_CODE (*iter) == CONST_POLY_INT)
7076 return true;
7077
7078 split_const (x, &base, &offset);
7079 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7080 {
7081 if (aarch64_classify_symbol (base, INTVAL (offset))
7082 != SYMBOL_FORCE_TO_MEM)
7083 return true;
7084 else
7085 /* Avoid generating a 64-bit relocation in ILP32; leave
7086 to aarch64_expand_mov_immediate to handle it properly. */
7087 return mode != ptr_mode;
7088 }
7089
7090 return aarch64_tls_referenced_p (x);
7091 }
7092
7093 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7094 The expansion of a table switch is quite expensive due to the number
7095 of instructions, the table lookup and the hard-to-predict indirect jump.
7096 When optimizing for speed at -O3 and above, use the per-core tuning if
7097 set; otherwise use tables for more than 16 cases as a trade-off between
7098 size and performance. When optimizing for size, use the default setting. */
7099
7100 static unsigned int
7101 aarch64_case_values_threshold (void)
7102 {
7103 /* Use the specified limit for the number of cases before using jump
7104 tables at higher optimization levels. */
7105 if (optimize > 2
7106 && selected_cpu->tune->max_case_values != 0)
7107 return selected_cpu->tune->max_case_values;
7108 else
7109 return optimize_size ? default_case_values_threshold () : 17;
7110 }
7111
7112 /* Return true if register REGNO is a valid index register.
7113 STRICT_P is true if REG_OK_STRICT is in effect. */
7114
7115 bool
7116 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7117 {
7118 if (!HARD_REGISTER_NUM_P (regno))
7119 {
7120 if (!strict_p)
7121 return true;
7122
7123 if (!reg_renumber)
7124 return false;
7125
7126 regno = reg_renumber[regno];
7127 }
7128 return GP_REGNUM_P (regno);
7129 }
7130
7131 /* Return true if register REGNO is a valid base register.
7132 STRICT_P is true if REG_OK_STRICT is in effect. */
7133
7134 bool
7135 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7136 {
7137 if (!HARD_REGISTER_NUM_P (regno))
7138 {
7139 if (!strict_p)
7140 return true;
7141
7142 if (!reg_renumber)
7143 return false;
7144
7145 regno = reg_renumber[regno];
7146 }
7147
7148 /* The fake registers will be eliminated to either the stack or
7149 hard frame pointer, both of which are usually valid base registers.
7150 Reload deals with the cases where the eliminated form isn't valid. */
7151 return (GP_REGNUM_P (regno)
7152 || regno == SP_REGNUM
7153 || regno == FRAME_POINTER_REGNUM
7154 || regno == ARG_POINTER_REGNUM);
7155 }
7156
7157 /* Return true if X is a valid base register.
7158 STRICT_P is true if REG_OK_STRICT is in effect. */
7159
7160 static bool
7161 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7162 {
7163 if (!strict_p
7164 && GET_CODE (x) == SUBREG
7165 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7166 x = SUBREG_REG (x);
7167
7168 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7169 }
7170
7171 /* Return true if address offset is a valid index. If it is, fill in INFO
7172 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
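/* For example (an informal illustration), an index of the form
   (ashift:DI (sign_extend:DI (reg:SI w1)) (const_int 2)) is classified as
   ADDRESS_REG_SXTW with shift 2, corresponding to an address like
   [x0, w1, sxtw #2] once combined with a base register.  */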
7173
7174 static bool
7175 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7176 machine_mode mode, bool strict_p)
7177 {
7178 enum aarch64_address_type type;
7179 rtx index;
7180 int shift;
7181
7182 /* (reg:P) */
7183 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7184 && GET_MODE (x) == Pmode)
7185 {
7186 type = ADDRESS_REG_REG;
7187 index = x;
7188 shift = 0;
7189 }
7190 /* (sign_extend:DI (reg:SI)) */
7191 else if ((GET_CODE (x) == SIGN_EXTEND
7192 || GET_CODE (x) == ZERO_EXTEND)
7193 && GET_MODE (x) == DImode
7194 && GET_MODE (XEXP (x, 0)) == SImode)
7195 {
7196 type = (GET_CODE (x) == SIGN_EXTEND)
7197 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7198 index = XEXP (x, 0);
7199 shift = 0;
7200 }
7201 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7202 else if (GET_CODE (x) == MULT
7203 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7204 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7205 && GET_MODE (XEXP (x, 0)) == DImode
7206 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7207 && CONST_INT_P (XEXP (x, 1)))
7208 {
7209 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7210 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7211 index = XEXP (XEXP (x, 0), 0);
7212 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7213 }
7214 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7215 else if (GET_CODE (x) == ASHIFT
7216 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7217 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7218 && GET_MODE (XEXP (x, 0)) == DImode
7219 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7220 && CONST_INT_P (XEXP (x, 1)))
7221 {
7222 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7223 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7224 index = XEXP (XEXP (x, 0), 0);
7225 shift = INTVAL (XEXP (x, 1));
7226 }
7227 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7228 else if ((GET_CODE (x) == SIGN_EXTRACT
7229 || GET_CODE (x) == ZERO_EXTRACT)
7230 && GET_MODE (x) == DImode
7231 && GET_CODE (XEXP (x, 0)) == MULT
7232 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7233 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7234 {
7235 type = (GET_CODE (x) == SIGN_EXTRACT)
7236 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7237 index = XEXP (XEXP (x, 0), 0);
7238 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7239 if (INTVAL (XEXP (x, 1)) != 32 + shift
7240 || INTVAL (XEXP (x, 2)) != 0)
7241 shift = -1;
7242 }
7243 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7244 (const_int 0xffffffff<<shift)) */
7245 else if (GET_CODE (x) == AND
7246 && GET_MODE (x) == DImode
7247 && GET_CODE (XEXP (x, 0)) == MULT
7248 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7249 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7250 && CONST_INT_P (XEXP (x, 1)))
7251 {
7252 type = ADDRESS_REG_UXTW;
7253 index = XEXP (XEXP (x, 0), 0);
7254 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7255 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7256 shift = -1;
7257 }
7258 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7259 else if ((GET_CODE (x) == SIGN_EXTRACT
7260 || GET_CODE (x) == ZERO_EXTRACT)
7261 && GET_MODE (x) == DImode
7262 && GET_CODE (XEXP (x, 0)) == ASHIFT
7263 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7264 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7265 {
7266 type = (GET_CODE (x) == SIGN_EXTRACT)
7267 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7268 index = XEXP (XEXP (x, 0), 0);
7269 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7270 if (INTVAL (XEXP (x, 1)) != 32 + shift
7271 || INTVAL (XEXP (x, 2)) != 0)
7272 shift = -1;
7273 }
7274 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7275 (const_int 0xffffffff<<shift)) */
7276 else if (GET_CODE (x) == AND
7277 && GET_MODE (x) == DImode
7278 && GET_CODE (XEXP (x, 0)) == ASHIFT
7279 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7280 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7281 && CONST_INT_P (XEXP (x, 1)))
7282 {
7283 type = ADDRESS_REG_UXTW;
7284 index = XEXP (XEXP (x, 0), 0);
7285 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7286 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7287 shift = -1;
7288 }
7289 /* (mult:P (reg:P) (const_int scale)) */
7290 else if (GET_CODE (x) == MULT
7291 && GET_MODE (x) == Pmode
7292 && GET_MODE (XEXP (x, 0)) == Pmode
7293 && CONST_INT_P (XEXP (x, 1)))
7294 {
7295 type = ADDRESS_REG_REG;
7296 index = XEXP (x, 0);
7297 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7298 }
7299 /* (ashift:P (reg:P) (const_int shift)) */
7300 else if (GET_CODE (x) == ASHIFT
7301 && GET_MODE (x) == Pmode
7302 && GET_MODE (XEXP (x, 0)) == Pmode
7303 && CONST_INT_P (XEXP (x, 1)))
7304 {
7305 type = ADDRESS_REG_REG;
7306 index = XEXP (x, 0);
7307 shift = INTVAL (XEXP (x, 1));
7308 }
7309 else
7310 return false;
7311
7312 if (!strict_p
7313 && GET_CODE (index) == SUBREG
7314 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7315 index = SUBREG_REG (index);
7316
7317 if (aarch64_sve_data_mode_p (mode))
7318 {
7319 if (type != ADDRESS_REG_REG
7320 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7321 return false;
7322 }
7323 else
7324 {
7325 if (shift != 0
7326 && !(IN_RANGE (shift, 1, 3)
7327 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7328 return false;
7329 }
7330
7331 if (REG_P (index)
7332 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7333 {
7334 info->type = type;
7335 info->offset = index;
7336 info->shift = shift;
7337 return true;
7338 }
7339
7340 return false;
7341 }
7342
7343 /* Return true if MODE is one of the modes for which we
7344 support LDP/STP operations. */
7345
7346 static bool
7347 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7348 {
7349 return mode == SImode || mode == DImode
7350 || mode == SFmode || mode == DFmode
7351 || (aarch64_vector_mode_supported_p (mode)
7352 && (known_eq (GET_MODE_SIZE (mode), 8)
7353 || (known_eq (GET_MODE_SIZE (mode), 16)
7354 && (aarch64_tune_params.extra_tuning_flags
7355 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7356 }
7357
7358 /* Return true if REGNO is a virtual pointer register, or an eliminable
7359 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7360 include stack_pointer or hard_frame_pointer. */
7361 static bool
7362 virt_or_elim_regno_p (unsigned regno)
7363 {
7364 return ((regno >= FIRST_VIRTUAL_REGISTER
7365 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7366 || regno == FRAME_POINTER_REGNUM
7367 || regno == ARG_POINTER_REGNUM);
7368 }
7369
7370 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7371 If it is, fill in INFO appropriately. STRICT_P is true if
7372 REG_OK_STRICT is in effect. */
7373
7374 bool
7375 aarch64_classify_address (struct aarch64_address_info *info,
7376 rtx x, machine_mode mode, bool strict_p,
7377 aarch64_addr_query_type type)
7378 {
7379 enum rtx_code code = GET_CODE (x);
7380 rtx op0, op1;
7381 poly_int64 offset;
7382
7383 HOST_WIDE_INT const_size;
7384
7385 /* On BE, we use load/store pair for all large int mode load/stores.
7386 TI/TFmode may also use a load/store pair. */
7387 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7388 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7389 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7390 || type == ADDR_QUERY_LDP_STP_N
7391 || mode == TImode
7392 || mode == TFmode
7393 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7394
7395 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the full size
7396 of the memory access, while the mode used for the address calculation is
7397 half of that. */
7398 if (type == ADDR_QUERY_LDP_STP_N
7399 && known_eq (GET_MODE_SIZE (mode), 16))
7400 mode = DFmode;
7401
7402 bool allow_reg_index_p = (!load_store_pair_p
7403 && (known_lt (GET_MODE_SIZE (mode), 16)
7404 || vec_flags == VEC_ADVSIMD
7405 || vec_flags & VEC_SVE_DATA));
7406
7407 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7408 [Rn, #offset, MUL VL]. */
7409 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7410 && (code != REG && code != PLUS))
7411 return false;
7412
7413 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7414 REG addressing. */
7415 if (advsimd_struct_p
7416 && !BYTES_BIG_ENDIAN
7417 && (code != POST_INC && code != REG))
7418 return false;
7419
7420 gcc_checking_assert (GET_MODE (x) == VOIDmode
7421 || SCALAR_INT_MODE_P (GET_MODE (x)));
7422
7423 switch (code)
7424 {
7425 case REG:
7426 case SUBREG:
7427 info->type = ADDRESS_REG_IMM;
7428 info->base = x;
7429 info->offset = const0_rtx;
7430 info->const_offset = 0;
7431 return aarch64_base_register_rtx_p (x, strict_p);
7432
7433 case PLUS:
7434 op0 = XEXP (x, 0);
7435 op1 = XEXP (x, 1);
7436
7437 if (! strict_p
7438 && REG_P (op0)
7439 && virt_or_elim_regno_p (REGNO (op0))
7440 && poly_int_rtx_p (op1, &offset))
7441 {
7442 info->type = ADDRESS_REG_IMM;
7443 info->base = op0;
7444 info->offset = op1;
7445 info->const_offset = offset;
7446
7447 return true;
7448 }
7449
7450 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7451 && aarch64_base_register_rtx_p (op0, strict_p)
7452 && poly_int_rtx_p (op1, &offset))
7453 {
7454 info->type = ADDRESS_REG_IMM;
7455 info->base = op0;
7456 info->offset = op1;
7457 info->const_offset = offset;
7458
7459 /* TImode and TFmode values are allowed in both pairs of X
7460 registers and individual Q registers. The available
7461 address modes are:
7462 X,X: 7-bit signed scaled offset
7463 Q: 9-bit signed offset
7464 We conservatively require an offset representable in either mode.
7465 When performing the check for pairs of X registers i.e. LDP/STP
7466 pass down DImode since that is the natural size of the LDP/STP
7467 instruction memory accesses. */
7468 if (mode == TImode || mode == TFmode)
7469 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7470 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7471 || offset_12bit_unsigned_scaled_p (mode, offset)));
7472
7473 /* A 7-bit offset check because OImode will emit an ldp/stp
7474 instruction (only big-endian will get here).
7475 For ldp/stp instructions, the offset is scaled by the size of a
7476 single element of the pair. */
7477 if (mode == OImode)
7478 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7479
7480 /* Three 9/12-bit offset checks because CImode will emit three
7481 ldr/str instructions (only big-endian will get here). */
7482 if (mode == CImode)
7483 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7484 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7485 offset + 32)
7486 || offset_12bit_unsigned_scaled_p (V16QImode,
7487 offset + 32)));
7488
7489 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7490 instructions (only big-endian will get here). */
7491 if (mode == XImode)
7492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7493 && aarch64_offset_7bit_signed_scaled_p (TImode,
7494 offset + 32));
7495
7496 /* Make "m" use the LD1 offset range for SVE data modes, so
7497 that pre-RTL optimizers like ivopts will work to that
7498 instead of the wider LDR/STR range. */
7499 if (vec_flags == VEC_SVE_DATA)
7500 return (type == ADDR_QUERY_M
7501 ? offset_4bit_signed_scaled_p (mode, offset)
7502 : offset_9bit_signed_scaled_p (mode, offset));
7503
7504 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7505 {
7506 poly_int64 end_offset = (offset
7507 + GET_MODE_SIZE (mode)
7508 - BYTES_PER_SVE_VECTOR);
7509 return (type == ADDR_QUERY_M
7510 ? offset_4bit_signed_scaled_p (mode, offset)
7511 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7512 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7513 end_offset)));
7514 }
7515
7516 if (vec_flags == VEC_SVE_PRED)
7517 return offset_9bit_signed_scaled_p (mode, offset);
7518
7519 if (load_store_pair_p)
7520 return ((known_eq (GET_MODE_SIZE (mode), 4)
7521 || known_eq (GET_MODE_SIZE (mode), 8)
7522 || known_eq (GET_MODE_SIZE (mode), 16))
7523 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7524 else
7525 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7526 || offset_12bit_unsigned_scaled_p (mode, offset));
7527 }
7528
7529 if (allow_reg_index_p)
7530 {
7531 /* Look for base + (scaled/extended) index register. */
7532 if (aarch64_base_register_rtx_p (op0, strict_p)
7533 && aarch64_classify_index (info, op1, mode, strict_p))
7534 {
7535 info->base = op0;
7536 return true;
7537 }
7538 if (aarch64_base_register_rtx_p (op1, strict_p)
7539 && aarch64_classify_index (info, op0, mode, strict_p))
7540 {
7541 info->base = op1;
7542 return true;
7543 }
7544 }
7545
7546 return false;
7547
7548 case POST_INC:
7549 case POST_DEC:
7550 case PRE_INC:
7551 case PRE_DEC:
7552 info->type = ADDRESS_REG_WB;
7553 info->base = XEXP (x, 0);
7554 info->offset = NULL_RTX;
7555 return aarch64_base_register_rtx_p (info->base, strict_p);
7556
7557 case POST_MODIFY:
7558 case PRE_MODIFY:
7559 info->type = ADDRESS_REG_WB;
7560 info->base = XEXP (x, 0);
7561 if (GET_CODE (XEXP (x, 1)) == PLUS
7562 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7563 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7564 && aarch64_base_register_rtx_p (info->base, strict_p))
7565 {
7566 info->offset = XEXP (XEXP (x, 1), 1);
7567 info->const_offset = offset;
7568
7569 /* TImode and TFmode values are allowed in both pairs of X
7570 registers and individual Q registers. The available
7571 address modes are:
7572 X,X: 7-bit signed scaled offset
7573 Q: 9-bit signed offset
7574 We conservatively require an offset representable in either mode.
7575 */
7576 if (mode == TImode || mode == TFmode)
7577 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7578 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7579
7580 if (load_store_pair_p)
7581 return ((known_eq (GET_MODE_SIZE (mode), 4)
7582 || known_eq (GET_MODE_SIZE (mode), 8)
7583 || known_eq (GET_MODE_SIZE (mode), 16))
7584 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7585 else
7586 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7587 }
7588 return false;
7589
7590 case CONST:
7591 case SYMBOL_REF:
7592 case LABEL_REF:
7593 /* Load literal: PC-relative constant pool entry. Only supported
7594 for SImode or larger. */
7595 info->type = ADDRESS_SYMBOLIC;
7596
7597 if (!load_store_pair_p
7598 && GET_MODE_SIZE (mode).is_constant (&const_size)
7599 && const_size >= 4)
7600 {
7601 rtx sym, addend;
7602
7603 split_const (x, &sym, &addend);
7604 return ((GET_CODE (sym) == LABEL_REF
7605 || (GET_CODE (sym) == SYMBOL_REF
7606 && CONSTANT_POOL_ADDRESS_P (sym)
7607 && aarch64_pcrelative_literal_loads)));
7608 }
7609 return false;
7610
7611 case LO_SUM:
7612 info->type = ADDRESS_LO_SUM;
7613 info->base = XEXP (x, 0);
7614 info->offset = XEXP (x, 1);
7615 if (allow_reg_index_p
7616 && aarch64_base_register_rtx_p (info->base, strict_p))
7617 {
7618 rtx sym, offs;
7619 split_const (info->offset, &sym, &offs);
7620 if (GET_CODE (sym) == SYMBOL_REF
7621 && (aarch64_classify_symbol (sym, INTVAL (offs))
7622 == SYMBOL_SMALL_ABSOLUTE))
7623 {
7624 /* The symbol and offset must be aligned to the access size. */
7625 unsigned int align;
7626
7627 if (CONSTANT_POOL_ADDRESS_P (sym))
7628 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7629 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7630 {
7631 tree exp = SYMBOL_REF_DECL (sym);
7632 align = TYPE_ALIGN (TREE_TYPE (exp));
7633 align = aarch64_constant_alignment (exp, align);
7634 }
7635 else if (SYMBOL_REF_DECL (sym))
7636 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7637 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7638 && SYMBOL_REF_BLOCK (sym) != NULL)
7639 align = SYMBOL_REF_BLOCK (sym)->alignment;
7640 else
7641 align = BITS_PER_UNIT;
7642
7643 poly_int64 ref_size = GET_MODE_SIZE (mode);
7644 if (known_eq (ref_size, 0))
7645 ref_size = GET_MODE_SIZE (DImode);
7646
7647 return (multiple_p (INTVAL (offs), ref_size)
7648 && multiple_p (align / BITS_PER_UNIT, ref_size));
7649 }
7650 }
7651 return false;
7652
7653 default:
7654 return false;
7655 }
7656 }
7657
7658 /* Return true if the address X is valid for a PRFM instruction.
7659 STRICT_P is true if we should do strict checking with
7660 aarch64_classify_address. */
7661
7662 bool
7663 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7664 {
7665 struct aarch64_address_info addr;
7666
7667 /* PRFM accepts the same addresses as DImode... */
7668 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7669 if (!res)
7670 return false;
7671
7672 /* ... except writeback forms. */
7673 return addr.type != ADDRESS_REG_WB;
7674 }
7675
7676 bool
7677 aarch64_symbolic_address_p (rtx x)
7678 {
7679 rtx offset;
7680
7681 split_const (x, &x, &offset);
7682 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7683 }
7684
7685 /* Classify the base of symbolic expression X. */
7686
7687 enum aarch64_symbol_type
7688 aarch64_classify_symbolic_expression (rtx x)
7689 {
7690 rtx offset;
7691
7692 split_const (x, &x, &offset);
7693 return aarch64_classify_symbol (x, INTVAL (offset));
7694 }
7695
7696
7697 /* Return TRUE if X is a legitimate address for accessing memory in
7698 mode MODE. */
7699 static bool
7700 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7701 {
7702 struct aarch64_address_info addr;
7703
7704 return aarch64_classify_address (&addr, x, mode, strict_p);
7705 }
7706
7707 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7708 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7709 bool
7710 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7711 aarch64_addr_query_type type)
7712 {
7713 struct aarch64_address_info addr;
7714
7715 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7716 }
7717
7718 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7719
7720 static bool
7721 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7722 poly_int64 orig_offset,
7723 machine_mode mode)
7724 {
7725 HOST_WIDE_INT size;
7726 if (GET_MODE_SIZE (mode).is_constant (&size))
7727 {
7728 HOST_WIDE_INT const_offset, second_offset;
7729
7730 /* A general SVE offset is A * VQ + B. Remove the A component from
7731 coefficient 0 in order to get the constant B. */
7732 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7733
7734 /* Split an out-of-range address displacement into a base and
7735 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7736 range otherwise to increase opportunities for sharing the base
7737 address of different sizes. Unaligned accesses use the signed
7738 9-bit range, TImode/TFmode use the intersection of signed
7739 scaled 7-bit and signed 9-bit offset. */
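/* Worked example (illustrative): for a 4-byte access at constant offset
   0x12344, second_offset becomes 0x12344 & 0x3ffc == 0x2344, so the
   address is rebased to base + 0x10000 with a residual offset of 0x2344.  */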
7740 if (mode == TImode || mode == TFmode)
7741 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7742 else if ((const_offset & (size - 1)) != 0)
7743 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7744 else
7745 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7746
7747 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7748 return false;
7749
7750 /* Split the offset into second_offset and the rest. */
7751 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7752 *offset2 = gen_int_mode (second_offset, Pmode);
7753 return true;
7754 }
7755 else
7756 {
7757 /* Get the mode we should use as the basis of the range. For structure
7758 modes this is the mode of one vector. */
7759 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7760 machine_mode step_mode
7761 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7762
7763 /* Get the "mul vl" multiplier we'd like to use. */
7764 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7765 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7766 if (vec_flags & VEC_SVE_DATA)
7767 /* LDR supports a 9-bit range, but the move patterns for
7768 structure modes require all vectors to be in range of the
7769 same base. The simplest way of accommodating that while still
7770 promoting reuse of anchor points between different modes is
7771 to use an 8-bit range unconditionally. */
7772 vnum = ((vnum + 128) & 255) - 128;
7773 else
7774 /* Predicates are only handled singly, so we might as well use
7775 the full range. */
7776 vnum = ((vnum + 256) & 511) - 256;
7777 if (vnum == 0)
7778 return false;
7779
7780 /* Convert the "mul vl" multiplier into a byte offset. */
7781 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7782 if (known_eq (second_offset, orig_offset))
7783 return false;
7784
7785 /* Split the offset into second_offset and the rest. */
7786 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7787 *offset2 = gen_int_mode (second_offset, Pmode);
7788 return true;
7789 }
7790 }
7791
7792 /* Return the binary representation of floating point constant VALUE in INTVAL.
7793 If the value cannot be converted, return false without setting INTVAL.
7794 The conversion is done in the mode of VALUE. */
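/* For example (illustrative): the DFmode constant 1.0 yields
   0x3ff0000000000000 and the SFmode constant 1.0 yields 0x3f800000.  */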
7795 bool
7796 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7797 {
7798
7799 /* We make a general exception for 0. */
7800 if (aarch64_float_const_zero_rtx_p (value))
7801 {
7802 *intval = 0;
7803 return true;
7804 }
7805
7806 scalar_float_mode mode;
7807 if (GET_CODE (value) != CONST_DOUBLE
7808 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7809 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7810 /* Only support up to DF mode. */
7811 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7812 return false;
7813
7814 unsigned HOST_WIDE_INT ival = 0;
7815
7816 long res[2];
7817 real_to_target (res,
7818 CONST_DOUBLE_REAL_VALUE (value),
7819 REAL_MODE_FORMAT (mode));
7820
7821 if (mode == DFmode)
7822 {
7823 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7824 ival = zext_hwi (res[order], 32);
7825 ival |= (zext_hwi (res[1 - order], 32) << 32);
7826 }
7827 else
7828 ival = zext_hwi (res[0], 32);
7829
7830 *intval = ival;
7831 return true;
7832 }
7833
7834 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7835 single MOV(+MOVK) followed by an FMOV. */
7836 bool
7837 aarch64_float_const_rtx_p (rtx x)
7838 {
7839 machine_mode mode = GET_MODE (x);
7840 if (mode == VOIDmode)
7841 return false;
7842
7843 /* Determine whether it's cheaper to write float constants as
7844 mov/movk pairs over ldr/adrp pairs. */
7845 unsigned HOST_WIDE_INT ival;
7846
7847 if (GET_CODE (x) == CONST_DOUBLE
7848 && SCALAR_FLOAT_MODE_P (mode)
7849 && aarch64_reinterpret_float_as_int (x, &ival))
7850 {
7851 scalar_int_mode imode = (mode == HFmode
7852 ? SImode
7853 : int_mode_for_mode (mode).require ());
7854 int num_instr = aarch64_internal_mov_immediate
7855 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7856 return num_instr < 3;
7857 }
7858
7859 return false;
7860 }
7861
7862 /* Return TRUE if rtx X is the immediate constant 0.0. */
7863 bool
7864 aarch64_float_const_zero_rtx_p (rtx x)
7865 {
7866 if (GET_MODE (x) == VOIDmode)
7867 return false;
7868
7869 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7870 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7871 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7872 }
7873
7874 /* Return TRUE if rtx X is an immediate constant that fits in a single
7875 MOVI operation. */
7876 bool
7877 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7878 {
7879 if (!TARGET_SIMD)
7880 return false;
7881
7882 machine_mode vmode;
7883 scalar_int_mode imode;
7884 unsigned HOST_WIDE_INT ival;
7885
7886 if (GET_CODE (x) == CONST_DOUBLE
7887 && SCALAR_FLOAT_MODE_P (mode))
7888 {
7889 if (!aarch64_reinterpret_float_as_int (x, &ival))
7890 return false;
7891
7892 /* We make a general exception for 0. */
7893 if (aarch64_float_const_zero_rtx_p (x))
7894 return true;
7895
7896 imode = int_mode_for_mode (mode).require ();
7897 }
7898 else if (GET_CODE (x) == CONST_INT
7899 && is_a <scalar_int_mode> (mode, &imode))
7900 ival = INTVAL (x);
7901 else
7902 return false;
7903
7904 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
7905 use a 128-bit vector mode. */
7906 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7907
7908 vmode = aarch64_simd_container_mode (imode, width);
7909 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7910
7911 return aarch64_simd_valid_immediate (v_op, NULL);
7912 }
7913
7914
7915 /* Return the fixed registers used for condition codes. */
7916
7917 static bool
7918 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7919 {
7920 *p1 = CC_REGNUM;
7921 *p2 = INVALID_REGNUM;
7922 return true;
7923 }
7924
7925 /* This function is used by the call expanders of the machine description.
7926 RESULT is the register in which the result is returned. It's NULL for
7927 "call" and "sibcall".
7928 MEM is the location of the function call.
7929 SIBCALL indicates whether this function call is a normal call or a sibling
7930 call; a different pattern is generated accordingly. */
7931
7932 void
7933 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7934 {
7935 rtx call, callee, tmp;
7936 rtvec vec;
7937 machine_mode mode;
7938
7939 gcc_assert (MEM_P (mem));
7940 callee = XEXP (mem, 0);
7941 mode = GET_MODE (callee);
7942 gcc_assert (mode == Pmode);
7943
7944 /* Decide if we should generate indirect calls by loading the
7945 address of the callee into a register before performing
7946 the branch-and-link. */
7947 if (SYMBOL_REF_P (callee)
7948 ? (aarch64_is_long_call_p (callee)
7949 || aarch64_is_noplt_call_p (callee))
7950 : !REG_P (callee))
7951 XEXP (mem, 0) = force_reg (mode, callee);
7952
7953 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7954
7955 if (result != NULL_RTX)
7956 call = gen_rtx_SET (result, call);
7957
7958 if (sibcall)
7959 tmp = ret_rtx;
7960 else
7961 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7962
7963 vec = gen_rtvec (2, call, tmp);
7964 call = gen_rtx_PARALLEL (VOIDmode, vec);
7965
7966 aarch64_emit_call_insn (call);
7967 }
7968
7969 /* Emit call insn with PAT and do aarch64-specific handling. */
7970
7971 void
7972 aarch64_emit_call_insn (rtx pat)
7973 {
7974 rtx insn = emit_call_insn (pat);
7975
7976 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7977 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7978 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7979 }
7980
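/* Added summary comment (informal): choose the CC mode for comparing X with Y
   under operator CODE.  Floating-point compares use CCFP/CCFPE; integer
   compares select a specialized mode (CC_NZ, CC_SWP, CC_Z, CC_C, CC_ADC,
   CC_V) when the comparison is implemented by an instruction that sets only
   a subset of the flags or requires swapped operands, and fall back to the
   general CCmode otherwise.  */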
7981 machine_mode
7982 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7983 {
7984 machine_mode mode_x = GET_MODE (x);
7985 rtx_code code_x = GET_CODE (x);
7986
7987 /* All floating point compares return CCFP if it is an equality
7988 comparison, and CCFPE otherwise. */
7989 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7990 {
7991 switch (code)
7992 {
7993 case EQ:
7994 case NE:
7995 case UNORDERED:
7996 case ORDERED:
7997 case UNLT:
7998 case UNLE:
7999 case UNGT:
8000 case UNGE:
8001 case UNEQ:
8002 return CCFPmode;
8003
8004 case LT:
8005 case LE:
8006 case GT:
8007 case GE:
8008 case LTGT:
8009 return CCFPEmode;
8010
8011 default:
8012 gcc_unreachable ();
8013 }
8014 }
8015
8016 /* Equality comparisons of short modes against zero can be performed
8017 using the TST instruction with the appropriate bitmask. */
8018 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8019 && (code == EQ || code == NE)
8020 && (mode_x == HImode || mode_x == QImode))
8021 return CC_NZmode;
8022
8023 /* Similarly, comparisons of zero_extends from shorter modes can
8024 be performed using an ANDS with an immediate mask. */
8025 if (y == const0_rtx && code_x == ZERO_EXTEND
8026 && (mode_x == SImode || mode_x == DImode)
8027 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8028 && (code == EQ || code == NE))
8029 return CC_NZmode;
8030
8031 if ((mode_x == SImode || mode_x == DImode)
8032 && y == const0_rtx
8033 && (code == EQ || code == NE || code == LT || code == GE)
8034 && (code_x == PLUS || code_x == MINUS || code_x == AND
8035 || code_x == NEG
8036 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8037 && CONST_INT_P (XEXP (x, 2)))))
8038 return CC_NZmode;
8039
8040 /* A compare with a shifted operand. Because of canonicalization,
8041 the comparison will have to be swapped when we emit the assembly
8042 code. */
8043 if ((mode_x == SImode || mode_x == DImode)
8044 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8045 && (code_x == ASHIFT || code_x == ASHIFTRT
8046 || code_x == LSHIFTRT
8047 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8048 return CC_SWPmode;
8049
8050 /* Similarly for a negated operand, but we can only do this for
8051 equalities. */
8052 if ((mode_x == SImode || mode_x == DImode)
8053 && (REG_P (y) || GET_CODE (y) == SUBREG)
8054 && (code == EQ || code == NE)
8055 && code_x == NEG)
8056 return CC_Zmode;
8057
8058 /* A test for unsigned overflow from an addition. */
8059 if ((mode_x == DImode || mode_x == TImode)
8060 && (code == LTU || code == GEU)
8061 && code_x == PLUS
8062 && rtx_equal_p (XEXP (x, 0), y))
8063 return CC_Cmode;
8064
8065 /* A test for unsigned overflow from an add with carry. */
8066 if ((mode_x == DImode || mode_x == TImode)
8067 && (code == LTU || code == GEU)
8068 && code_x == PLUS
8069 && CONST_SCALAR_INT_P (y)
8070 && (rtx_mode_t (y, mode_x)
8071 == (wi::shwi (1, mode_x)
8072 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8073 return CC_ADCmode;
8074
8075 /* A test for signed overflow. */
8076 if ((mode_x == DImode || mode_x == TImode)
8077 && code == NE
8078 && code_x == PLUS
8079 && GET_CODE (y) == SIGN_EXTEND)
8080 return CC_Vmode;
8081
8082 /* For everything else, return CCmode. */
8083 return CCmode;
8084 }
8085
8086 static int
8087 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8088
8089 int
8090 aarch64_get_condition_code (rtx x)
8091 {
8092 machine_mode mode = GET_MODE (XEXP (x, 0));
8093 enum rtx_code comp_code = GET_CODE (x);
8094
8095 if (GET_MODE_CLASS (mode) != MODE_CC)
8096 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8097 return aarch64_get_condition_code_1 (mode, comp_code);
8098 }
8099
8100 static int
8101 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8102 {
8103 switch (mode)
8104 {
8105 case E_CCFPmode:
8106 case E_CCFPEmode:
8107 switch (comp_code)
8108 {
8109 case GE: return AARCH64_GE;
8110 case GT: return AARCH64_GT;
8111 case LE: return AARCH64_LS;
8112 case LT: return AARCH64_MI;
8113 case NE: return AARCH64_NE;
8114 case EQ: return AARCH64_EQ;
8115 case ORDERED: return AARCH64_VC;
8116 case UNORDERED: return AARCH64_VS;
8117 case UNLT: return AARCH64_LT;
8118 case UNLE: return AARCH64_LE;
8119 case UNGT: return AARCH64_HI;
8120 case UNGE: return AARCH64_PL;
8121 default: return -1;
8122 }
8123 break;
8124
8125 case E_CCmode:
8126 switch (comp_code)
8127 {
8128 case NE: return AARCH64_NE;
8129 case EQ: return AARCH64_EQ;
8130 case GE: return AARCH64_GE;
8131 case GT: return AARCH64_GT;
8132 case LE: return AARCH64_LE;
8133 case LT: return AARCH64_LT;
8134 case GEU: return AARCH64_CS;
8135 case GTU: return AARCH64_HI;
8136 case LEU: return AARCH64_LS;
8137 case LTU: return AARCH64_CC;
8138 default: return -1;
8139 }
8140 break;
8141
8142 case E_CC_SWPmode:
8143 switch (comp_code)
8144 {
8145 case NE: return AARCH64_NE;
8146 case EQ: return AARCH64_EQ;
8147 case GE: return AARCH64_LE;
8148 case GT: return AARCH64_LT;
8149 case LE: return AARCH64_GE;
8150 case LT: return AARCH64_GT;
8151 case GEU: return AARCH64_LS;
8152 case GTU: return AARCH64_CC;
8153 case LEU: return AARCH64_CS;
8154 case LTU: return AARCH64_HI;
8155 default: return -1;
8156 }
8157 break;
8158
8159 case E_CC_NZCmode:
8160 switch (comp_code)
8161 {
8162 case NE: return AARCH64_NE; /* = any */
8163 case EQ: return AARCH64_EQ; /* = none */
8164 case GE: return AARCH64_PL; /* = nfrst */
8165 case LT: return AARCH64_MI; /* = first */
8166 case GEU: return AARCH64_CS; /* = nlast */
8167 case GTU: return AARCH64_HI; /* = pmore */
8168 case LEU: return AARCH64_LS; /* = plast */
8169 case LTU: return AARCH64_CC; /* = last */
8170 default: return -1;
8171 }
8172 break;
8173
8174 case E_CC_NZmode:
8175 switch (comp_code)
8176 {
8177 case NE: return AARCH64_NE;
8178 case EQ: return AARCH64_EQ;
8179 case GE: return AARCH64_PL;
8180 case LT: return AARCH64_MI;
8181 default: return -1;
8182 }
8183 break;
8184
8185 case E_CC_Zmode:
8186 switch (comp_code)
8187 {
8188 case NE: return AARCH64_NE;
8189 case EQ: return AARCH64_EQ;
8190 default: return -1;
8191 }
8192 break;
8193
8194 case E_CC_Cmode:
8195 switch (comp_code)
8196 {
8197 case LTU: return AARCH64_CS;
8198 case GEU: return AARCH64_CC;
8199 default: return -1;
8200 }
8201 break;
8202
8203 case E_CC_ADCmode:
8204 switch (comp_code)
8205 {
8206 case GEU: return AARCH64_CS;
8207 case LTU: return AARCH64_CC;
8208 default: return -1;
8209 }
8210 break;
8211
8212 case E_CC_Vmode:
8213 switch (comp_code)
8214 {
8215 case NE: return AARCH64_VS;
8216 case EQ: return AARCH64_VC;
8217 default: return -1;
8218 }
8219 break;
8220
8221 default:
8222 return -1;
8223 }
8224
8225 return -1;
8226 }
8227
8228 bool
8229 aarch64_const_vec_all_same_in_range_p (rtx x,
8230 HOST_WIDE_INT minval,
8231 HOST_WIDE_INT maxval)
8232 {
8233 rtx elt;
8234 return (const_vec_duplicate_p (x, &elt)
8235 && CONST_INT_P (elt)
8236 && IN_RANGE (INTVAL (elt), minval, maxval));
8237 }
8238
8239 bool
8240 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8241 {
8242 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8243 }
8244
8245 /* Return true if VEC is a constant in which every element is in the range
8246 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8247
8248 static bool
8249 aarch64_const_vec_all_in_range_p (rtx vec,
8250 HOST_WIDE_INT minval,
8251 HOST_WIDE_INT maxval)
8252 {
8253 if (GET_CODE (vec) != CONST_VECTOR
8254 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8255 return false;
8256
8257 int nunits;
8258 if (!CONST_VECTOR_STEPPED_P (vec))
8259 nunits = const_vector_encoded_nelts (vec);
8260 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8261 return false;
8262
8263 for (int i = 0; i < nunits; i++)
8264 {
8265 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8266 if (!CONST_INT_P (vec_elem)
8267 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8268 return false;
8269 }
8270 return true;
8271 }
8272
8273 /* N Z C V. */
8274 #define AARCH64_CC_V 1
8275 #define AARCH64_CC_C (1 << 1)
8276 #define AARCH64_CC_Z (1 << 2)
8277 #define AARCH64_CC_N (1 << 3)
8278
8279 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8280 static const int aarch64_nzcv_codes[] =
8281 {
8282 0, /* EQ, Z == 1. */
8283 AARCH64_CC_Z, /* NE, Z == 0. */
8284 0, /* CS, C == 1. */
8285 AARCH64_CC_C, /* CC, C == 0. */
8286 0, /* MI, N == 1. */
8287 AARCH64_CC_N, /* PL, N == 0. */
8288 0, /* VS, V == 1. */
8289 AARCH64_CC_V, /* VC, V == 0. */
8290 0, /* HI, C == 1 && Z == 0. */
8291 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8292 AARCH64_CC_V, /* GE, N == V. */
8293 0, /* LT, N != V. */
8294 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8295 0, /* LE, !(Z == 0 && N == V). */
8296 0, /* AL, Any. */
8297 0 /* NV, Any. */
8298 };
8299
8300 /* Print floating-point vector immediate operand X to F, negating it
8301 first if NEGATE is true. Return true on success, false if it isn't
8302 a constant we can handle. */
8303
8304 static bool
8305 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8306 {
8307 rtx elt;
8308
8309 if (!const_vec_duplicate_p (x, &elt))
8310 return false;
8311
8312 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8313 if (negate)
8314 r = real_value_negate (&r);
8315
8316 /* Handle the SVE single-bit immediates specially, since they have a
8317 fixed form in the assembly syntax. */
8318 if (real_equal (&r, &dconst0))
8319 asm_fprintf (f, "0.0");
8320 else if (real_equal (&r, &dconst2))
8321 asm_fprintf (f, "2.0");
8322 else if (real_equal (&r, &dconst1))
8323 asm_fprintf (f, "1.0");
8324 else if (real_equal (&r, &dconsthalf))
8325 asm_fprintf (f, "0.5");
8326 else
8327 {
8328 const int buf_size = 20;
8329 char float_buf[buf_size] = {'\0'};
8330 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8331 1, GET_MODE (elt));
8332 asm_fprintf (f, "%s", float_buf);
8333 }
8334
8335 return true;
8336 }
8337
8338 /* Return the equivalent letter for size. */
8339 static char
8340 sizetochar (int size)
8341 {
8342 switch (size)
8343 {
8344 case 64: return 'd';
8345 case 32: return 's';
8346 case 16: return 'h';
8347 case 8 : return 'b';
8348 default: gcc_unreachable ();
8349 }
8350 }
8351
8352 /* Print operand X to file F in a target specific manner according to CODE.
8353 The acceptable formatting commands given by CODE are:
8354 'c': An integer or symbol address without a preceding #
8355 sign.
8356 'C': Take the duplicated element in a vector constant
8357 and print it in hex.
8358 'D': Take the duplicated element in a vector constant
8359 and print it as an unsigned integer, in decimal.
8360 'e': Print the sign/zero-extend size as a character 8->b,
8361 16->h, 32->w. Can also be used for masks:
8362 0xff->b, 0xffff->h, 0xffffffff->w.
8363 'I': If the operand is a duplicated vector constant,
8364 replace it with the duplicated scalar. If the
8365 operand is then a floating-point constant, replace
8366 it with the integer bit representation. Print the
8367 transformed constant as a signed decimal number.
8368 'p': Prints N such that 2^N == X (X must be a power of 2 and
8369 a const_int).
8370 'P': Print the number of non-zero bits in X (a const_int).
8371 'H': Print the higher numbered register of a pair (TImode)
8372 of regs.
8373 'm': Print a condition (eq, ne, etc).
8374 'M': Same as 'm', but invert condition.
8375 'N': Take the duplicated element in a vector constant
8376 and print the negative of it in decimal.
8377 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8378 'S/T/U/V': Print a FP/SIMD register name for a register list.
8379 The register printed is the FP/SIMD register name
8380 of X + 0/1/2/3 for S/T/U/V.
8381 'R': Print a scalar FP/SIMD register name + 1.
8382 'X': Print bottom 16 bits of integer constant in hex.
8383 'w/x': Print a general register name or the zero register
8384 (32-bit or 64-bit).
8385 '0': Print a normal operand, if it's a general register,
8386 then we assume DImode.
8387 'k': Print NZCV for conditional compare instructions.
8388 'A': Output address constant representing the first
8389 argument of X, specifying a relocation offset
8390 if appropriate.
8391 'L': Output constant address specified by X
8392 with a relocation offset if appropriate.
8393 'G': Prints address of X, specifying a PC relative
8394 relocation mode if appropriate.
8395 'y': Output address of LDP or STP - this is used for
8396 some LDP/STPs which don't use a PARALLEL in their
8397 pattern (so the mode needs to be adjusted).
8398 'z': Output address of a typical LDP or STP. */
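/* For example (illustrative), in an output template such as
   "add\t%w0, %w1, %w2" the 'w' code prints the 32-bit names of the
   general-register operands (w0, w1, ...), or wzr for the zero register.  */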
8399
8400 static void
8401 aarch64_print_operand (FILE *f, rtx x, int code)
8402 {
8403 rtx elt;
8404 switch (code)
8405 {
8406 case 'c':
8407 switch (GET_CODE (x))
8408 {
8409 case CONST_INT:
8410 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8411 break;
8412
8413 case SYMBOL_REF:
8414 output_addr_const (f, x);
8415 break;
8416
8417 case CONST:
8418 if (GET_CODE (XEXP (x, 0)) == PLUS
8419 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8420 {
8421 output_addr_const (f, x);
8422 break;
8423 }
8424 /* Fall through. */
8425
8426 default:
8427 output_operand_lossage ("unsupported operand for code '%c'", code);
8428 }
8429 break;
8430
8431 case 'e':
8432 {
8433 x = unwrap_const_vec_duplicate (x);
8434 if (!CONST_INT_P (x))
8435 {
8436 output_operand_lossage ("invalid operand for '%%%c'", code);
8437 return;
8438 }
8439
8440 HOST_WIDE_INT val = INTVAL (x);
8441 if ((val & ~7) == 8 || val == 0xff)
8442 fputc ('b', f);
8443 else if ((val & ~7) == 16 || val == 0xffff)
8444 fputc ('h', f);
8445 else if ((val & ~7) == 32 || val == 0xffffffff)
8446 fputc ('w', f);
8447 else
8448 {
8449 output_operand_lossage ("invalid operand for '%%%c'", code);
8450 return;
8451 }
8452 }
8453 break;
8454
8455 case 'p':
8456 {
8457 int n;
8458
8459 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8460 {
8461 output_operand_lossage ("invalid operand for '%%%c'", code);
8462 return;
8463 }
8464
8465 asm_fprintf (f, "%d", n);
8466 }
8467 break;
8468
8469 case 'P':
8470 if (!CONST_INT_P (x))
8471 {
8472 output_operand_lossage ("invalid operand for '%%%c'", code);
8473 return;
8474 }
8475
8476 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8477 break;
8478
8479 case 'H':
8480 if (x == const0_rtx)
8481 {
8482 asm_fprintf (f, "xzr");
8483 break;
8484 }
8485
8486 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8487 {
8488 output_operand_lossage ("invalid operand for '%%%c'", code);
8489 return;
8490 }
8491
8492 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8493 break;
8494
8495 case 'I':
8496 {
8497 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8498 if (CONST_INT_P (x))
8499 asm_fprintf (f, "%wd", INTVAL (x));
8500 else
8501 {
8502 output_operand_lossage ("invalid operand for '%%%c'", code);
8503 return;
8504 }
8505 break;
8506 }
8507
8508 case 'M':
8509 case 'm':
8510 {
8511 int cond_code;
8512 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8513 if (x == const_true_rtx)
8514 {
8515 if (code == 'M')
8516 fputs ("nv", f);
8517 return;
8518 }
8519
8520 if (!COMPARISON_P (x))
8521 {
8522 output_operand_lossage ("invalid operand for '%%%c'", code);
8523 return;
8524 }
8525
8526 cond_code = aarch64_get_condition_code (x);
8527 gcc_assert (cond_code >= 0);
8528 if (code == 'M')
8529 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8530 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8531 fputs (aarch64_sve_condition_codes[cond_code], f);
8532 else
8533 fputs (aarch64_condition_codes[cond_code], f);
8534 }
8535 break;
8536
8537 case 'N':
8538 if (!const_vec_duplicate_p (x, &elt))
8539 {
8540 output_operand_lossage ("invalid vector constant");
8541 return;
8542 }
8543
8544 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8545 asm_fprintf (f, "%wd", -INTVAL (elt));
8546 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8547 && aarch64_print_vector_float_operand (f, x, true))
8548 ;
8549 else
8550 {
8551 output_operand_lossage ("invalid vector constant");
8552 return;
8553 }
8554 break;
8555
8556 case 'b':
8557 case 'h':
8558 case 's':
8559 case 'd':
8560 case 'q':
8561 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8562 {
8563 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8564 return;
8565 }
8566 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8567 break;
8568
8569 case 'S':
8570 case 'T':
8571 case 'U':
8572 case 'V':
8573 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8574 {
8575 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8576 return;
8577 }
8578 asm_fprintf (f, "%c%d",
8579 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8580 REGNO (x) - V0_REGNUM + (code - 'S'));
8581 break;
8582
8583 case 'R':
8584 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8585 {
8586 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8587 return;
8588 }
8589 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8590 break;
8591
8592 case 'X':
8593 if (!CONST_INT_P (x))
8594 {
8595 output_operand_lossage ("invalid operand for '%%%c'", code);
8596 return;
8597 }
8598 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8599 break;
8600
8601 case 'C':
8602 {
8603 /* Print a replicated constant in hex. */
8604 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8605 {
8606 output_operand_lossage ("invalid operand for '%%%c'", code);
8607 return;
8608 }
8609 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8610 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8611 }
8612 break;
8613
8614 case 'D':
8615 {
8616 /* Print a replicated constant in decimal, treating it as
8617 unsigned. */
8618 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8619 {
8620 output_operand_lossage ("invalid operand for '%%%c'", code);
8621 return;
8622 }
8623 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8624 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8625 }
8626 break;
8627
8628 case 'w':
8629 case 'x':
8630 if (x == const0_rtx
8631 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8632 {
8633 asm_fprintf (f, "%czr", code);
8634 break;
8635 }
8636
8637 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8638 {
8639 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8640 break;
8641 }
8642
8643 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8644 {
8645 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8646 break;
8647 }
8648
8649 /* Fall through */
8650
8651 case 0:
8652 if (x == NULL)
8653 {
8654 output_operand_lossage ("missing operand");
8655 return;
8656 }
8657
8658 switch (GET_CODE (x))
8659 {
8660 case REG:
8661 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8662 {
8663 if (REG_NREGS (x) == 1)
8664 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8665 else
8666 {
8667 char suffix
8668 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8669 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8670 REGNO (x) - V0_REGNUM, suffix,
8671 END_REGNO (x) - V0_REGNUM - 1, suffix);
8672 }
8673 }
8674 else
8675 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8676 break;
8677
8678 case MEM:
8679 output_address (GET_MODE (x), XEXP (x, 0));
8680 break;
8681
8682 case LABEL_REF:
8683 case SYMBOL_REF:
8684 output_addr_const (asm_out_file, x);
8685 break;
8686
8687 case CONST_INT:
8688 asm_fprintf (f, "%wd", INTVAL (x));
8689 break;
8690
8691 case CONST:
8692 if (!VECTOR_MODE_P (GET_MODE (x)))
8693 {
8694 output_addr_const (asm_out_file, x);
8695 break;
8696 }
8697 /* fall through */
8698
8699 case CONST_VECTOR:
8700 if (!const_vec_duplicate_p (x, &elt))
8701 {
8702 output_operand_lossage ("invalid vector constant");
8703 return;
8704 }
8705
8706 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8707 asm_fprintf (f, "%wd", INTVAL (elt));
8708 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8709 && aarch64_print_vector_float_operand (f, x, false))
8710 ;
8711 else
8712 {
8713 output_operand_lossage ("invalid vector constant");
8714 return;
8715 }
8716 break;
8717
8718 case CONST_DOUBLE:
8719 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8720 be getting CONST_DOUBLEs holding integers. */
8721 gcc_assert (GET_MODE (x) != VOIDmode);
8722 if (aarch64_float_const_zero_rtx_p (x))
8723 {
8724 fputc ('0', f);
8725 break;
8726 }
8727 else if (aarch64_float_const_representable_p (x))
8728 {
8729 #define buf_size 20
8730 char float_buf[buf_size] = {'\0'};
8731 real_to_decimal_for_mode (float_buf,
8732 CONST_DOUBLE_REAL_VALUE (x),
8733 buf_size, buf_size,
8734 1, GET_MODE (x));
8735 asm_fprintf (asm_out_file, "%s", float_buf);
8736 break;
8737 #undef buf_size
8738 }
8739 output_operand_lossage ("invalid constant");
8740 return;
8741 default:
8742 output_operand_lossage ("invalid operand");
8743 return;
8744 }
8745 break;
8746
8747 case 'A':
8748 if (GET_CODE (x) == HIGH)
8749 x = XEXP (x, 0);
8750
8751 switch (aarch64_classify_symbolic_expression (x))
8752 {
8753 case SYMBOL_SMALL_GOT_4G:
8754 asm_fprintf (asm_out_file, ":got:");
8755 break;
8756
8757 case SYMBOL_SMALL_TLSGD:
8758 asm_fprintf (asm_out_file, ":tlsgd:");
8759 break;
8760
8761 case SYMBOL_SMALL_TLSDESC:
8762 asm_fprintf (asm_out_file, ":tlsdesc:");
8763 break;
8764
8765 case SYMBOL_SMALL_TLSIE:
8766 asm_fprintf (asm_out_file, ":gottprel:");
8767 break;
8768
8769 case SYMBOL_TLSLE24:
8770 asm_fprintf (asm_out_file, ":tprel:");
8771 break;
8772
8773 case SYMBOL_TINY_GOT:
8774 gcc_unreachable ();
8775 break;
8776
8777 default:
8778 break;
8779 }
8780 output_addr_const (asm_out_file, x);
8781 break;
8782
8783 case 'L':
8784 switch (aarch64_classify_symbolic_expression (x))
8785 {
8786 case SYMBOL_SMALL_GOT_4G:
8787 asm_fprintf (asm_out_file, ":lo12:");
8788 break;
8789
8790 case SYMBOL_SMALL_TLSGD:
8791 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8792 break;
8793
8794 case SYMBOL_SMALL_TLSDESC:
8795 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8796 break;
8797
8798 case SYMBOL_SMALL_TLSIE:
8799 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8800 break;
8801
8802 case SYMBOL_TLSLE12:
8803 asm_fprintf (asm_out_file, ":tprel_lo12:");
8804 break;
8805
8806 case SYMBOL_TLSLE24:
8807 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8808 break;
8809
8810 case SYMBOL_TINY_GOT:
8811 asm_fprintf (asm_out_file, ":got:");
8812 break;
8813
8814 case SYMBOL_TINY_TLSIE:
8815 asm_fprintf (asm_out_file, ":gottprel:");
8816 break;
8817
8818 default:
8819 break;
8820 }
8821 output_addr_const (asm_out_file, x);
8822 break;
8823
8824 case 'G':
8825 switch (aarch64_classify_symbolic_expression (x))
8826 {
8827 case SYMBOL_TLSLE24:
8828 asm_fprintf (asm_out_file, ":tprel_hi12:");
8829 break;
8830 default:
8831 break;
8832 }
8833 output_addr_const (asm_out_file, x);
8834 break;
8835
8836 case 'k':
8837 {
8838 HOST_WIDE_INT cond_code;
8839
8840 if (!CONST_INT_P (x))
8841 {
8842 output_operand_lossage ("invalid operand for '%%%c'", code);
8843 return;
8844 }
8845
8846 cond_code = INTVAL (x);
8847 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8848 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8849 }
8850 break;
8851
8852 case 'y':
8853 case 'z':
8854 {
8855 machine_mode mode = GET_MODE (x);
8856
8857 if (GET_CODE (x) != MEM
8858 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8859 {
8860 output_operand_lossage ("invalid operand for '%%%c'", code);
8861 return;
8862 }
8863
8864 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8865 code == 'y'
8866 ? ADDR_QUERY_LDP_STP_N
8867 : ADDR_QUERY_LDP_STP))
8868 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8869 }
8870 break;
8871
8872 default:
8873 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8874 return;
8875 }
8876 }
8877
8878 /* Print address 'x' of a memory access with mode 'mode'.
8879 'type' is the aarch64_addr_query_type context required by
8880 aarch64_classify_address (e.g. ADDR_QUERY_ANY for a normal memory access, or an LDP/STP query). */
8881 static bool
8882 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8883 aarch64_addr_query_type type)
8884 {
8885 struct aarch64_address_info addr;
8886 unsigned int size;
8887
8888 /* Check all addresses are Pmode - including ILP32. */
8889 if (GET_MODE (x) != Pmode
8890 && (!CONST_INT_P (x)
8891 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8892 {
8893 output_operand_lossage ("invalid address mode");
8894 return false;
8895 }
8896
8897 if (aarch64_classify_address (&addr, x, mode, true, type))
8898 switch (addr.type)
8899 {
8900 case ADDRESS_REG_IMM:
8901 if (known_eq (addr.const_offset, 0))
8902 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8903 else if (aarch64_sve_data_mode_p (mode))
8904 {
8905 HOST_WIDE_INT vnum
8906 = exact_div (addr.const_offset,
8907 BYTES_PER_SVE_VECTOR).to_constant ();
8908 asm_fprintf (f, "[%s, #%wd, mul vl]",
8909 reg_names[REGNO (addr.base)], vnum);
8910 }
8911 else if (aarch64_sve_pred_mode_p (mode))
8912 {
8913 HOST_WIDE_INT vnum
8914 = exact_div (addr.const_offset,
8915 BYTES_PER_SVE_PRED).to_constant ();
8916 asm_fprintf (f, "[%s, #%wd, mul vl]",
8917 reg_names[REGNO (addr.base)], vnum);
8918 }
8919 else
8920 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8921 INTVAL (addr.offset));
8922 return true;
8923
8924 case ADDRESS_REG_REG:
8925 if (addr.shift == 0)
8926 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8927 reg_names [REGNO (addr.offset)]);
8928 else
8929 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8930 reg_names [REGNO (addr.offset)], addr.shift);
8931 return true;
8932
8933 case ADDRESS_REG_UXTW:
8934 if (addr.shift == 0)
8935 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8936 REGNO (addr.offset) - R0_REGNUM);
8937 else
8938 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8939 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8940 return true;
8941
8942 case ADDRESS_REG_SXTW:
8943 if (addr.shift == 0)
8944 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8945 REGNO (addr.offset) - R0_REGNUM);
8946 else
8947 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8948 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8949 return true;
8950
8951 case ADDRESS_REG_WB:
8952 /* Writeback is only supported for fixed-width modes. */
8953 size = GET_MODE_SIZE (mode).to_constant ();
8954 switch (GET_CODE (x))
8955 {
8956 case PRE_INC:
8957 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8958 return true;
8959 case POST_INC:
8960 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8961 return true;
8962 case PRE_DEC:
8963 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8964 return true;
8965 case POST_DEC:
8966 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8967 return true;
8968 case PRE_MODIFY:
8969 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8970 INTVAL (addr.offset));
8971 return true;
8972 case POST_MODIFY:
8973 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8974 INTVAL (addr.offset));
8975 return true;
8976 default:
8977 break;
8978 }
8979 break;
8980
8981 case ADDRESS_LO_SUM:
8982 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8983 output_addr_const (f, addr.offset);
8984 asm_fprintf (f, "]");
8985 return true;
8986
8987 case ADDRESS_SYMBOLIC:
8988 output_addr_const (f, x);
8989 return true;
8990 }
8991
8992 return false;
8993 }
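/* As an illustration (register numbers and offsets chosen arbitrarily),
   the cases above emit address syntax such as:
     ADDRESS_REG_IMM            [x0, 16]    or, for SVE data, [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1, lsl 3]
     ADDRESS_REG_UXTW           [x0, w1, uxtw 2]
     ADDRESS_REG_WB (POST_INC)  [x0], 16
     ADDRESS_LO_SUM             [x0, #:lo12:symbol]  */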
8994
8995 /* Print address 'x' of a memory access with mode 'mode'. */
8996 static void
8997 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8998 {
8999 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9000 output_addr_const (f, x);
9001 }
9002
9003 bool
9004 aarch64_label_mentioned_p (rtx x)
9005 {
9006 const char *fmt;
9007 int i;
9008
9009 if (GET_CODE (x) == LABEL_REF)
9010 return true;
9011
9012 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9013 referencing instruction, but they are constant offsets, not
9014 symbols. */
9015 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9016 return false;
9017
9018 fmt = GET_RTX_FORMAT (GET_CODE (x));
9019 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9020 {
9021 if (fmt[i] == 'E')
9022 {
9023 int j;
9024
9025 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9026 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9027 return 1;
9028 }
9029 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9030 return 1;
9031 }
9032
9033 return 0;
9034 }
9035
9036 /* Implement REGNO_REG_CLASS. */
9037
9038 enum reg_class
9039 aarch64_regno_regclass (unsigned regno)
9040 {
9041 if (GP_REGNUM_P (regno))
9042 return GENERAL_REGS;
9043
9044 if (regno == SP_REGNUM)
9045 return STACK_REG;
9046
9047 if (regno == FRAME_POINTER_REGNUM
9048 || regno == ARG_POINTER_REGNUM)
9049 return POINTER_REGS;
9050
9051 if (FP_REGNUM_P (regno))
9052 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9053 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9054
9055 if (PR_REGNUM_P (regno))
9056 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9057
9058 return NO_REGS;
9059 }
9060
9061 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9062 If OFFSET is out of range, return an offset of an anchor point
9063 that is in range. Return 0 otherwise. */
9064
9065 static HOST_WIDE_INT
9066 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9067 machine_mode mode)
9068 {
9069 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9070 if (size > 16)
9071 return (offset + 0x400) & ~0x7f0;
9072
9073 /* For offsets that aren't a multiple of the access size, the limit is
9074 -256...255. */
9075 if (offset & (size - 1))
9076 {
9077 /* BLKmode typically uses LDP of X-registers. */
9078 if (mode == BLKmode)
9079 return (offset + 512) & ~0x3ff;
9080 return (offset + 0x100) & ~0x1ff;
9081 }
9082
9083 /* Small negative offsets are supported. */
9084 if (IN_RANGE (offset, -256, 0))
9085 return 0;
9086
9087 if (mode == TImode || mode == TFmode)
9088 return (offset + 0x100) & ~0x1ff;
9089
9090 /* Use a 12-bit offset, scaled by the access size. */
9091 return offset & (~0xfff * size);
9092 }
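/* As a worked example: for an SImode access (SIZE == 4) at OFFSET == 0x12004,
   none of the earlier cases apply, so the code above returns
   0x12004 & (~0xfff * 4) == 0x10000.  The residual offset of 0x2004 is a
   multiple of 4 and no larger than 0xfff * 4, so it fits the scaled
   unsigned 12-bit LDR/STR immediate.  */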
9093
9094 static rtx
9095 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9096 {
9097 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9098 where mask is selected by alignment and size of the offset.
9099 We try to pick as large a range for the offset as possible to
9100 maximize the chance of a CSE. However, for aligned addresses
9101 we limit the range to 4k so that structures with different sized
9102 elements are likely to use the same base. We need to be careful
9103 not to split a CONST for some forms of address expression, otherwise
9104 it will generate sub-optimal code. */
9105
9106 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9107 {
9108 rtx base = XEXP (x, 0);
9109 rtx offset_rtx = XEXP (x, 1);
9110 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9111
9112 if (GET_CODE (base) == PLUS)
9113 {
9114 rtx op0 = XEXP (base, 0);
9115 rtx op1 = XEXP (base, 1);
9116
9117 /* Force any scaling into a temp for CSE. */
9118 op0 = force_reg (Pmode, op0);
9119 op1 = force_reg (Pmode, op1);
9120
9121 /* Let the pointer register be in op0. */
9122 if (REG_POINTER (op1))
9123 std::swap (op0, op1);
9124
9125 /* If the pointer is virtual or frame related, then we know that
9126 virtual register instantiation or register elimination is going
9127 to apply a second constant. We want the two constants folded
9128 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9129 if (virt_or_elim_regno_p (REGNO (op0)))
9130 {
9131 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9132 NULL_RTX, true, OPTAB_DIRECT);
9133 return gen_rtx_PLUS (Pmode, base, op1);
9134 }
9135
9136 /* Otherwise, in order to encourage CSE (and thence loop strength
9137 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
9138 base = expand_binop (Pmode, add_optab, op0, op1,
9139 NULL_RTX, true, OPTAB_DIRECT);
9140 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9141 }
9142
9143 HOST_WIDE_INT size;
9144 if (GET_MODE_SIZE (mode).is_constant (&size))
9145 {
9146 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9147 mode);
9148 if (base_offset != 0)
9149 {
9150 base = plus_constant (Pmode, base, base_offset);
9151 base = force_operand (base, NULL_RTX);
9152 return plus_constant (Pmode, base, offset - base_offset);
9153 }
9154 }
9155 }
9156
9157 return x;
9158 }
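/* For example, legitimizing (plus (reg R) (const_int 0x12004)) for an SImode
   access computes base_offset == 0x10000 (see aarch64_anchor_offset above)
   and rewrites the address as (R + 0x10000) + 0x2004, so neighbouring
   accesses off the same anchor can CSE the base computation.  */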
9159
9160 static reg_class_t
9161 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9162 reg_class_t rclass,
9163 machine_mode mode,
9164 secondary_reload_info *sri)
9165 {
9166 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9167 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9168 comment at the head of aarch64-sve.md for more details about the
9169 big-endian handling. */
9170 if (BYTES_BIG_ENDIAN
9171 && reg_class_subset_p (rclass, FP_REGS)
9172 && !((REG_P (x) && HARD_REGISTER_P (x))
9173 || aarch64_simd_valid_immediate (x, NULL))
9174 && aarch64_sve_data_mode_p (mode))
9175 {
9176 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9177 return NO_REGS;
9178 }
9179
9180 /* If we have to disable direct literal pool loads and stores because the
9181 function is too big, then we need a scratch register. */
9182 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9183 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9184 || targetm.vector_mode_supported_p (GET_MODE (x)))
9185 && !aarch64_pcrelative_literal_loads)
9186 {
9187 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9188 return NO_REGS;
9189 }
9190
9191 /* Without the TARGET_SIMD instructions we cannot move a Q register
9192 to a Q register directly. We need a scratch. */
9193 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9194 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9195 && reg_class_subset_p (rclass, FP_REGS))
9196 {
9197 sri->icode = code_for_aarch64_reload_mov (mode);
9198 return NO_REGS;
9199 }
9200
9201 /* A TFmode or TImode memory access should be handled via an FP register
9202 because AArch64 has richer addressing modes for LDR/STR instructions
9203 than for LDP/STP instructions. */
9204 if (TARGET_FLOAT && rclass == GENERAL_REGS
9205 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9206 return FP_REGS;
9207
9208 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9209 return GENERAL_REGS;
9210
9211 return NO_REGS;
9212 }
9213
9214 static bool
9215 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9216 {
9217 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9218
9219 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9220 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9221 if (frame_pointer_needed)
9222 return to == HARD_FRAME_POINTER_REGNUM;
9223 return true;
9224 }
9225
9226 poly_int64
9227 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9228 {
9229 if (to == HARD_FRAME_POINTER_REGNUM)
9230 {
9231 if (from == ARG_POINTER_REGNUM)
9232 return cfun->machine->frame.hard_fp_offset;
9233
9234 if (from == FRAME_POINTER_REGNUM)
9235 return cfun->machine->frame.hard_fp_offset
9236 - cfun->machine->frame.locals_offset;
9237 }
9238
9239 if (to == STACK_POINTER_REGNUM)
9240 {
9241 if (from == FRAME_POINTER_REGNUM)
9242 return cfun->machine->frame.frame_size
9243 - cfun->machine->frame.locals_offset;
9244 }
9245
9246 return cfun->machine->frame.frame_size;
9247 }
9248
9249 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9250 previous frame. */
9251
9252 rtx
9253 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9254 {
9255 if (count != 0)
9256 return const0_rtx;
9257 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9258 }
9259
9260
9261 static void
9262 aarch64_asm_trampoline_template (FILE *f)
9263 {
9264 int offset1 = 16;
9265 int offset2 = 20;
9266
9267 if (aarch64_bti_enabled ())
9268 {
9269 asm_fprintf (f, "\thint\t34 // bti c\n");
9270 offset1 -= 4;
9271 offset2 -= 4;
9272 }
9273
9274 if (TARGET_ILP32)
9275 {
9276 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9277 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9278 offset1);
9279 }
9280 else
9281 {
9282 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9283 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9284 offset2);
9285 }
9286 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9287
9288 /* The trampoline needs an extra padding instruction. If BTI is
9289 enabled, the padding instruction is replaced by the BTI instruction
9290 at the beginning. */
9291 if (!aarch64_bti_enabled ())
9292 assemble_aligned_integer (4, const0_rtx);
9293
9294 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9295 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9296 }
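/* The code emitted above is 16 bytes in total: either BTI + two loads + BR,
   or two loads + BR + one padding word.  The two pointer-sized zero slots
   that follow are filled in by aarch64_trampoline_init below with the
   target function address and the static chain value.  */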
9297
9298 static void
9299 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9300 {
9301 rtx fnaddr, mem, a_tramp;
9302 const int tramp_code_sz = 16;
9303
9304 /* We don't need to copy the trailing D-words; we fill those in below. */
9305 emit_block_move (m_tramp, assemble_trampoline_template (),
9306 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9307 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9308 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9309 if (GET_MODE (fnaddr) != ptr_mode)
9310 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9311 emit_move_insn (mem, fnaddr);
9312
9313 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9314 emit_move_insn (mem, chain_value);
9315
9316 /* XXX We should really define a "clear_cache" pattern and use
9317 gen_clear_cache(). */
9318 a_tramp = XEXP (m_tramp, 0);
9319 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9320 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9321 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9322 ptr_mode);
9323 }
9324
9325 static unsigned char
9326 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9327 {
9328 /* ??? Logically we should only need to provide a value when
9329 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9330 can hold MODE, but at the moment we need to handle all modes.
9331 Just ignore any runtime parts for registers that can't store them. */
9332 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9333 unsigned int nregs;
9334 switch (regclass)
9335 {
9336 case TAILCALL_ADDR_REGS:
9337 case POINTER_REGS:
9338 case GENERAL_REGS:
9339 case ALL_REGS:
9340 case POINTER_AND_FP_REGS:
9341 case FP_REGS:
9342 case FP_LO_REGS:
9343 case FP_LO8_REGS:
9344 if (aarch64_sve_data_mode_p (mode)
9345 && constant_multiple_p (GET_MODE_SIZE (mode),
9346 BYTES_PER_SVE_VECTOR, &nregs))
9347 return nregs;
9348 return (aarch64_vector_data_mode_p (mode)
9349 ? CEIL (lowest_size, UNITS_PER_VREG)
9350 : CEIL (lowest_size, UNITS_PER_WORD));
9351 case STACK_REG:
9352 case PR_REGS:
9353 case PR_LO_REGS:
9354 case PR_HI_REGS:
9355 return 1;
9356
9357 case NO_REGS:
9358 return 0;
9359
9360 default:
9361 break;
9362 }
9363 gcc_unreachable ();
9364 }
9365
9366 static reg_class_t
9367 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9368 {
9369 if (regclass == POINTER_REGS)
9370 return GENERAL_REGS;
9371
9372 if (regclass == STACK_REG)
9373 {
9374 if (REG_P(x)
9375 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9376 return regclass;
9377
9378 return NO_REGS;
9379 }
9380
9381 /* Register elimination can result in a request for
9382 SP+constant->FP_REGS. We cannot support such operations, which
9383 use SP as the source and an FP_REG as the destination, so reject
9384 them outright. */
9385 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9386 {
9387 rtx lhs = XEXP (x, 0);
9388
9389 /* Look through a possible SUBREG introduced by ILP32. */
9390 if (GET_CODE (lhs) == SUBREG)
9391 lhs = SUBREG_REG (lhs);
9392
9393 gcc_assert (REG_P (lhs));
9394 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9395 POINTER_REGS));
9396 return NO_REGS;
9397 }
9398
9399 return regclass;
9400 }
9401
9402 void
9403 aarch64_asm_output_labelref (FILE* f, const char *name)
9404 {
9405 asm_fprintf (f, "%U%s", name);
9406 }
9407
9408 static void
9409 aarch64_elf_asm_constructor (rtx symbol, int priority)
9410 {
9411 if (priority == DEFAULT_INIT_PRIORITY)
9412 default_ctor_section_asm_out_constructor (symbol, priority);
9413 else
9414 {
9415 section *s;
9416 /* Although priority is known to be in the range [0, 65535], so 18 bytes
9417 would be enough, the compiler might not know that. To avoid a
9418 -Wformat-truncation false positive, use a larger size. */
9419 char buf[23];
9420 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9421 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9422 switch_to_section (s);
9423 assemble_align (POINTER_SIZE);
9424 assemble_aligned_integer (POINTER_BYTES, symbol);
9425 }
9426 }
9427
9428 static void
9429 aarch64_elf_asm_destructor (rtx symbol, int priority)
9430 {
9431 if (priority == DEFAULT_INIT_PRIORITY)
9432 default_dtor_section_asm_out_destructor (symbol, priority);
9433 else
9434 {
9435 section *s;
9436 /* Although priority is known to be in the range [0, 65535], so 18 bytes
9437 would be enough, the compiler might not know that. To avoid a
9438 -Wformat-truncation false positive, use a larger size. */
9439 char buf[23];
9440 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9441 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9442 switch_to_section (s);
9443 assemble_align (POINTER_SIZE);
9444 assemble_aligned_integer (POINTER_BYTES, symbol);
9445 }
9446 }
9447
9448 const char*
9449 aarch64_output_casesi (rtx *operands)
9450 {
9451 char buf[100];
9452 char label[100];
9453 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9454 int index;
9455 static const char *const patterns[4][2] =
9456 {
9457 {
9458 "ldrb\t%w3, [%0,%w1,uxtw]",
9459 "add\t%3, %4, %w3, sxtb #2"
9460 },
9461 {
9462 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9463 "add\t%3, %4, %w3, sxth #2"
9464 },
9465 {
9466 "ldr\t%w3, [%0,%w1,uxtw #2]",
9467 "add\t%3, %4, %w3, sxtw #2"
9468 },
9469 /* We assume that DImode is only generated when not optimizing and
9470 that we don't really need 64-bit address offsets. That would
9471 imply an object file with 8GB of code in a single function! */
9472 {
9473 "ldr\t%w3, [%0,%w1,uxtw #2]",
9474 "add\t%3, %4, %w3, sxtw #2"
9475 }
9476 };
9477
9478 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9479
9480 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9481 index = exact_log2 (GET_MODE_SIZE (mode));
9482
9483 gcc_assert (index >= 0 && index <= 3);
9484
9485 /* Need to implement table size reduction, by changing the code below. */
9486 output_asm_insn (patterns[index][0], operands);
9487 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9488 snprintf (buf, sizeof (buf),
9489 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9490 output_asm_insn (buf, operands);
9491 output_asm_insn (patterns[index][1], operands);
9492 output_asm_insn ("br\t%3", operands);
9493 assemble_label (asm_out_file, label);
9494 return "";
9495 }
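/* For a 4-byte (SImode) dispatch table the sequence emitted above is,
   schematically (with registers standing in for the %-operands):
	ldr	w3, [x0, w1, uxtw #2]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxtw #2
	br	x3
     .Lrtx<N>:  */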
9496
9497
9498 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9499 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9500 operator. */
9501
9502 int
9503 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9504 {
9505 if (shift >= 0 && shift <= 3)
9506 {
9507 int size;
9508 for (size = 8; size <= 32; size *= 2)
9509 {
9510 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9511 if (mask == bits << shift)
9512 return size;
9513 }
9514 }
9515 return 0;
9516 }
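/* For example, SHIFT == 2 with MASK == 0x3fc matches 0xff << 2 and returns 8
   (a UXTB operand shifted left by 2), while SHIFT == 0 with MASK == 0xffff
   returns 16 (UXTH).  Any other combination returns 0.  */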
9517
9518 /* Constant pools are per-function only when PC-relative
9519 literal loads are enabled or we are in the large memory
9520 model. */
9521
9522 static inline bool
9523 aarch64_can_use_per_function_literal_pools_p (void)
9524 {
9525 return (aarch64_pcrelative_literal_loads
9526 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9527 }
9528
9529 static bool
9530 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9531 {
9532 /* We can't use blocks for constants when we're using a per-function
9533 constant pool. */
9534 return !aarch64_can_use_per_function_literal_pools_p ();
9535 }
9536
9537 /* Select appropriate section for constants depending
9538 on where we place literal pools. */
9539
9540 static section *
9541 aarch64_select_rtx_section (machine_mode mode,
9542 rtx x,
9543 unsigned HOST_WIDE_INT align)
9544 {
9545 if (aarch64_can_use_per_function_literal_pools_p ())
9546 return function_section (current_function_decl);
9547
9548 return default_elf_select_rtx_section (mode, x, align);
9549 }
9550
9551 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9552 void
9553 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9554 HOST_WIDE_INT offset)
9555 {
9556 /* When using per-function literal pools, we must ensure that any code
9557 section is aligned to the minimal instruction length, lest we get
9558 errors from the assembler re "unaligned instructions". */
9559 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9560 ASM_OUTPUT_ALIGN (f, 2);
9561 }
9562
9563 /* Costs. */
9564
9565 /* Helper function for rtx cost calculation. Strip a shift expression
9566 from X. Returns the inner operand if successful, or the original
9567 expression on failure. */
9568 static rtx
9569 aarch64_strip_shift (rtx x)
9570 {
9571 rtx op = x;
9572
9573 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9574 we can convert both to ROR during final output. */
9575 if ((GET_CODE (op) == ASHIFT
9576 || GET_CODE (op) == ASHIFTRT
9577 || GET_CODE (op) == LSHIFTRT
9578 || GET_CODE (op) == ROTATERT
9579 || GET_CODE (op) == ROTATE)
9580 && CONST_INT_P (XEXP (op, 1)))
9581 return XEXP (op, 0);
9582
9583 if (GET_CODE (op) == MULT
9584 && CONST_INT_P (XEXP (op, 1))
9585 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9586 return XEXP (op, 0);
9587
9588 return x;
9589 }
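/* For example, (ashift (reg) (const_int 3)) and (mult (reg) (const_int 8))
   both strip down to (reg), whereas (ashift (reg) (reg)) is returned
   unchanged because the shift amount is not constant.  */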
9590
9591 /* Helper function for rtx cost calculation. Strip an extend
9592 expression from X. Returns the inner operand if successful, or the
9593 original expression on failure. We deal with a number of possible
9594 canonicalization variations here. If STRIP_SHIFT is true, then
9595 we can strip off a shift also. */
9596 static rtx
9597 aarch64_strip_extend (rtx x, bool strip_shift)
9598 {
9599 scalar_int_mode mode;
9600 rtx op = x;
9601
9602 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9603 return op;
9604
9605 /* Zero and sign extraction of a widened value. */
9606 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9607 && XEXP (op, 2) == const0_rtx
9608 && GET_CODE (XEXP (op, 0)) == MULT
9609 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9610 XEXP (op, 1)))
9611 return XEXP (XEXP (op, 0), 0);
9612
9613 /* It can also be represented (for zero-extend) as an AND with an
9614 immediate. */
9615 if (GET_CODE (op) == AND
9616 && GET_CODE (XEXP (op, 0)) == MULT
9617 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9618 && CONST_INT_P (XEXP (op, 1))
9619 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9620 INTVAL (XEXP (op, 1))) != 0)
9621 return XEXP (XEXP (op, 0), 0);
9622
9623 /* Now handle extended register, as this may also have an optional
9624 left shift by 1..4. */
9625 if (strip_shift
9626 && GET_CODE (op) == ASHIFT
9627 && CONST_INT_P (XEXP (op, 1))
9628 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9629 op = XEXP (op, 0);
9630
9631 if (GET_CODE (op) == ZERO_EXTEND
9632 || GET_CODE (op) == SIGN_EXTEND)
9633 op = XEXP (op, 0);
9634
9635 if (op != x)
9636 return op;
9637
9638 return x;
9639 }
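/* For example, (zero_extend:DI (reg:SI)) strips to (reg:SI), and with
   STRIP_SHIFT true (ashift:DI (zero_extend:DI (reg:SI)) (const_int 2))
   also strips to (reg:SI), matching the extended-register forms with an
   optional left shift of up to 4.  */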
9640
9641 /* Return true iff CODE is a shift supported in combination
9642 with arithmetic instructions. */
9643
9644 static bool
9645 aarch64_shift_p (enum rtx_code code)
9646 {
9647 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9648 }
9649
9650
9651 /* Return true iff X is a cheap shift without a sign extend. */
9652
9653 static bool
9654 aarch64_cheap_mult_shift_p (rtx x)
9655 {
9656 rtx op0, op1;
9657
9658 op0 = XEXP (x, 0);
9659 op1 = XEXP (x, 1);
9660
9661 if (!(aarch64_tune_params.extra_tuning_flags
9662 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9663 return false;
9664
9665 if (GET_CODE (op0) == SIGN_EXTEND)
9666 return false;
9667
9668 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9669 && UINTVAL (op1) <= 4)
9670 return true;
9671
9672 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9673 return false;
9674
9675 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9676
9677 if (l2 > 0 && l2 <= 4)
9678 return true;
9679
9680 return false;
9681 }
9682
9683 /* Helper function for rtx cost calculation. Calculate the cost of
9684 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9685 Return the calculated cost of the expression, recursing manually in to
9686 operands where needed. */
9687
9688 static int
9689 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9690 {
9691 rtx op0, op1;
9692 const struct cpu_cost_table *extra_cost
9693 = aarch64_tune_params.insn_extra_cost;
9694 int cost = 0;
9695 bool compound_p = (outer == PLUS || outer == MINUS);
9696 machine_mode mode = GET_MODE (x);
9697
9698 gcc_checking_assert (code == MULT);
9699
9700 op0 = XEXP (x, 0);
9701 op1 = XEXP (x, 1);
9702
9703 if (VECTOR_MODE_P (mode))
9704 mode = GET_MODE_INNER (mode);
9705
9706 /* Integer multiply/fma. */
9707 if (GET_MODE_CLASS (mode) == MODE_INT)
9708 {
9709 /* The multiply will be canonicalized as a shift, cost it as such. */
9710 if (aarch64_shift_p (GET_CODE (x))
9711 || (CONST_INT_P (op1)
9712 && exact_log2 (INTVAL (op1)) > 0))
9713 {
9714 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9715 || GET_CODE (op0) == SIGN_EXTEND;
9716 if (speed)
9717 {
9718 if (compound_p)
9719 {
9720 /* If the shift is considered cheap,
9721 then don't add any cost. */
9722 if (aarch64_cheap_mult_shift_p (x))
9723 ;
9724 else if (REG_P (op1))
9725 /* ARITH + shift-by-register. */
9726 cost += extra_cost->alu.arith_shift_reg;
9727 else if (is_extend)
9728 /* ARITH + extended register. We don't have a cost field
9729 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9730 cost += extra_cost->alu.extend_arith;
9731 else
9732 /* ARITH + shift-by-immediate. */
9733 cost += extra_cost->alu.arith_shift;
9734 }
9735 else
9736 /* LSL (immediate). */
9737 cost += extra_cost->alu.shift;
9738
9739 }
9740 /* Strip extends as we will have costed them in the case above. */
9741 if (is_extend)
9742 op0 = aarch64_strip_extend (op0, true);
9743
9744 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9745
9746 return cost;
9747 }
9748
9749 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9750 compound and let the below cases handle it. After all, MNEG is a
9751 special-case alias of MSUB. */
9752 if (GET_CODE (op0) == NEG)
9753 {
9754 op0 = XEXP (op0, 0);
9755 compound_p = true;
9756 }
9757
9758 /* Integer multiplies or FMAs have zero/sign extending variants. */
9759 if ((GET_CODE (op0) == ZERO_EXTEND
9760 && GET_CODE (op1) == ZERO_EXTEND)
9761 || (GET_CODE (op0) == SIGN_EXTEND
9762 && GET_CODE (op1) == SIGN_EXTEND))
9763 {
9764 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9765 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9766
9767 if (speed)
9768 {
9769 if (compound_p)
9770 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9771 cost += extra_cost->mult[0].extend_add;
9772 else
9773 /* MUL/SMULL/UMULL. */
9774 cost += extra_cost->mult[0].extend;
9775 }
9776
9777 return cost;
9778 }
9779
9780 /* This is either an integer multiply or a MADD. In both cases
9781 we want to recurse and cost the operands. */
9782 cost += rtx_cost (op0, mode, MULT, 0, speed);
9783 cost += rtx_cost (op1, mode, MULT, 1, speed);
9784
9785 if (speed)
9786 {
9787 if (compound_p)
9788 /* MADD/MSUB. */
9789 cost += extra_cost->mult[mode == DImode].add;
9790 else
9791 /* MUL. */
9792 cost += extra_cost->mult[mode == DImode].simple;
9793 }
9794
9795 return cost;
9796 }
9797 else
9798 {
9799 if (speed)
9800 {
9801 /* Floating-point FMA/FMUL can also support negations of the
9802 operands, unless the rounding mode is upward or downward in
9803 which case FNMUL is different than FMUL with operand negation. */
9804 bool neg0 = GET_CODE (op0) == NEG;
9805 bool neg1 = GET_CODE (op1) == NEG;
9806 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9807 {
9808 if (neg0)
9809 op0 = XEXP (op0, 0);
9810 if (neg1)
9811 op1 = XEXP (op1, 0);
9812 }
9813
9814 if (compound_p)
9815 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9816 cost += extra_cost->fp[mode == DFmode].fma;
9817 else
9818 /* FMUL/FNMUL. */
9819 cost += extra_cost->fp[mode == DFmode].mult;
9820 }
9821
9822 cost += rtx_cost (op0, mode, MULT, 0, speed);
9823 cost += rtx_cost (op1, mode, MULT, 1, speed);
9824 return cost;
9825 }
9826 }
9827
9828 static int
9829 aarch64_address_cost (rtx x,
9830 machine_mode mode,
9831 addr_space_t as ATTRIBUTE_UNUSED,
9832 bool speed)
9833 {
9834 enum rtx_code c = GET_CODE (x);
9835 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9836 struct aarch64_address_info info;
9837 int cost = 0;
9838 info.shift = 0;
9839
9840 if (!aarch64_classify_address (&info, x, mode, false))
9841 {
9842 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9843 {
9844 /* This is a CONST or SYMBOL ref which will be split
9845 in a different way depending on the code model in use.
9846 Cost it through the generic infrastructure. */
9847 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9848 /* Divide through by the cost of one instruction to
9849 bring it to the same units as the address costs. */
9850 cost_symbol_ref /= COSTS_N_INSNS (1);
9851 /* The cost is then the cost of preparing the address,
9852 followed by an immediate (possibly 0) offset. */
9853 return cost_symbol_ref + addr_cost->imm_offset;
9854 }
9855 else
9856 {
9857 /* This is most likely a jump table from a case
9858 statement. */
9859 return addr_cost->register_offset;
9860 }
9861 }
9862
9863 switch (info.type)
9864 {
9865 case ADDRESS_LO_SUM:
9866 case ADDRESS_SYMBOLIC:
9867 case ADDRESS_REG_IMM:
9868 cost += addr_cost->imm_offset;
9869 break;
9870
9871 case ADDRESS_REG_WB:
9872 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9873 cost += addr_cost->pre_modify;
9874 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9875 cost += addr_cost->post_modify;
9876 else
9877 gcc_unreachable ();
9878
9879 break;
9880
9881 case ADDRESS_REG_REG:
9882 cost += addr_cost->register_offset;
9883 break;
9884
9885 case ADDRESS_REG_SXTW:
9886 cost += addr_cost->register_sextend;
9887 break;
9888
9889 case ADDRESS_REG_UXTW:
9890 cost += addr_cost->register_zextend;
9891 break;
9892
9893 default:
9894 gcc_unreachable ();
9895 }
9896
9897
9898 if (info.shift > 0)
9899 {
9900 /* For the sake of calculating the cost of the shifted register
9901 component, we can treat same sized modes in the same way. */
9902 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9903 cost += addr_cost->addr_scale_costs.hi;
9904 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9905 cost += addr_cost->addr_scale_costs.si;
9906 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9907 cost += addr_cost->addr_scale_costs.di;
9908 else
9909 /* We can't tell, or this is a 128-bit vector. */
9910 cost += addr_cost->addr_scale_costs.ti;
9911 }
9912
9913 return cost;
9914 }
9915
9916 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9917 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9918 to be taken. */
9919
9920 int
9921 aarch64_branch_cost (bool speed_p, bool predictable_p)
9922 {
9923 /* When optimizing for speed, use the cost of unpredictable branches. */
9924 const struct cpu_branch_cost *branch_costs =
9925 aarch64_tune_params.branch_costs;
9926
9927 if (!speed_p || predictable_p)
9928 return branch_costs->predictable;
9929 else
9930 return branch_costs->unpredictable;
9931 }
9932
9933 /* Return true if the RTX X in mode MODE is a zero or sign extract
9934 usable in an ADD or SUB (extended register) instruction. */
9935 static bool
9936 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9937 {
9938 /* Catch add with a sign extract.
9939 This is add_<optab><mode>_multp2. */
9940 if (GET_CODE (x) == SIGN_EXTRACT
9941 || GET_CODE (x) == ZERO_EXTRACT)
9942 {
9943 rtx op0 = XEXP (x, 0);
9944 rtx op1 = XEXP (x, 1);
9945 rtx op2 = XEXP (x, 2);
9946
9947 if (GET_CODE (op0) == MULT
9948 && CONST_INT_P (op1)
9949 && op2 == const0_rtx
9950 && CONST_INT_P (XEXP (op0, 1))
9951 && aarch64_is_extend_from_extract (mode,
9952 XEXP (op0, 1),
9953 op1))
9954 {
9955 return true;
9956 }
9957 }
9958 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9959 No shift. */
9960 else if (GET_CODE (x) == SIGN_EXTEND
9961 || GET_CODE (x) == ZERO_EXTEND)
9962 return REG_P (XEXP (x, 0));
9963
9964 return false;
9965 }
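/* For example, (sign_extend:DI (reg:SI)) satisfies the simple case above
   and corresponds to the Wm, SXTW form of ADD/SUB (extended register).  */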
9966
9967 static bool
9968 aarch64_frint_unspec_p (unsigned int u)
9969 {
9970 switch (u)
9971 {
9972 case UNSPEC_FRINTZ:
9973 case UNSPEC_FRINTP:
9974 case UNSPEC_FRINTM:
9975 case UNSPEC_FRINTA:
9976 case UNSPEC_FRINTN:
9977 case UNSPEC_FRINTX:
9978 case UNSPEC_FRINTI:
9979 return true;
9980
9981 default:
9982 return false;
9983 }
9984 }
9985
9986 /* Return true iff X is an rtx that will match an extr instruction
9987 i.e. as described in the *extr<mode>5_insn family of patterns.
9988 OP0 and OP1 will be set to the operands of the shifts involved
9989 on success and will be NULL_RTX otherwise. */
9990
9991 static bool
9992 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9993 {
9994 rtx op0, op1;
9995 scalar_int_mode mode;
9996 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9997 return false;
9998
9999 *res_op0 = NULL_RTX;
10000 *res_op1 = NULL_RTX;
10001
10002 if (GET_CODE (x) != IOR)
10003 return false;
10004
10005 op0 = XEXP (x, 0);
10006 op1 = XEXP (x, 1);
10007
10008 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10009 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10010 {
10011 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10012 if (GET_CODE (op1) == ASHIFT)
10013 std::swap (op0, op1);
10014
10015 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10016 return false;
10017
10018 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10019 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10020
10021 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10022 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10023 {
10024 *res_op0 = XEXP (op0, 0);
10025 *res_op1 = XEXP (op1, 0);
10026 return true;
10027 }
10028 }
10029
10030 return false;
10031 }
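/* For example, in DImode (ior (ashift (reg A) (const_int 16))
   (lshiftrt (reg B) (const_int 48))) is accepted because 16 + 48 == 64;
   *RES_OP0 is set to A and *RES_OP1 to B.  */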
10032
10033 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10034 storing it in *COST. Result is true if the total cost of the operation
10035 has now been calculated. */
10036 static bool
10037 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10038 {
10039 rtx inner;
10040 rtx comparator;
10041 enum rtx_code cmpcode;
10042
10043 if (COMPARISON_P (op0))
10044 {
10045 inner = XEXP (op0, 0);
10046 comparator = XEXP (op0, 1);
10047 cmpcode = GET_CODE (op0);
10048 }
10049 else
10050 {
10051 inner = op0;
10052 comparator = const0_rtx;
10053 cmpcode = NE;
10054 }
10055
10056 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10057 {
10058 /* Conditional branch. */
10059 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10060 return true;
10061 else
10062 {
10063 if (cmpcode == NE || cmpcode == EQ)
10064 {
10065 if (comparator == const0_rtx)
10066 {
10067 /* TBZ/TBNZ/CBZ/CBNZ. */
10068 if (GET_CODE (inner) == ZERO_EXTRACT)
10069 /* TBZ/TBNZ. */
10070 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10071 ZERO_EXTRACT, 0, speed);
10072 else
10073 /* CBZ/CBNZ. */
10074 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10075
10076 return true;
10077 }
10078 }
10079 else if (cmpcode == LT || cmpcode == GE)
10080 {
10081 /* TBZ/TBNZ. */
10082 if (comparator == const0_rtx)
10083 return true;
10084 }
10085 }
10086 }
10087 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10088 {
10089 /* CCMP. */
10090 if (GET_CODE (op1) == COMPARE)
10091 {
10092 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10093 if (XEXP (op1, 1) == const0_rtx)
10094 *cost += 1;
10095 if (speed)
10096 {
10097 machine_mode mode = GET_MODE (XEXP (op1, 0));
10098 const struct cpu_cost_table *extra_cost
10099 = aarch64_tune_params.insn_extra_cost;
10100
10101 if (GET_MODE_CLASS (mode) == MODE_INT)
10102 *cost += extra_cost->alu.arith;
10103 else
10104 *cost += extra_cost->fp[mode == DFmode].compare;
10105 }
10106 return true;
10107 }
10108
10109 /* It's a conditional operation based on the status flags,
10110 so it must be some flavor of CSEL. */
10111
10112 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10113 if (GET_CODE (op1) == NEG
10114 || GET_CODE (op1) == NOT
10115 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10116 op1 = XEXP (op1, 0);
10117 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10118 {
10119 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10120 op1 = XEXP (op1, 0);
10121 op2 = XEXP (op2, 0);
10122 }
10123
10124 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10125 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10126 return true;
10127 }
10128
10129 /* We don't know what this is, cost all operands. */
10130 return false;
10131 }
10132
10133 /* Check whether X is a bitfield operation of the form shift + extend that
10134 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10135 operand to which the bitfield operation is applied. Otherwise return
10136 NULL_RTX. */
10137
10138 static rtx
10139 aarch64_extend_bitfield_pattern_p (rtx x)
10140 {
10141 rtx_code outer_code = GET_CODE (x);
10142 machine_mode outer_mode = GET_MODE (x);
10143
10144 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10145 && outer_mode != SImode && outer_mode != DImode)
10146 return NULL_RTX;
10147
10148 rtx inner = XEXP (x, 0);
10149 rtx_code inner_code = GET_CODE (inner);
10150 machine_mode inner_mode = GET_MODE (inner);
10151 rtx op = NULL_RTX;
10152
10153 switch (inner_code)
10154 {
10155 case ASHIFT:
10156 if (CONST_INT_P (XEXP (inner, 1))
10157 && (inner_mode == QImode || inner_mode == HImode))
10158 op = XEXP (inner, 0);
10159 break;
10160 case LSHIFTRT:
10161 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10162 && (inner_mode == QImode || inner_mode == HImode))
10163 op = XEXP (inner, 0);
10164 break;
10165 case ASHIFTRT:
10166 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10167 && (inner_mode == QImode || inner_mode == HImode))
10168 op = XEXP (inner, 0);
10169 break;
10170 default:
10171 break;
10172 }
10173
10174 return op;
10175 }
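/* For example, (zero_extend:SI (lshiftrt:HI (reg R) (const_int 3))) is
   recognized above and R is returned, while an inner operand wider than
   HImode is rejected.  */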
10176
10177 /* Return true if the mask and a shift amount from an RTX of the form
10178 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10179 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10180
10181 bool
10182 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10183 rtx shft_amnt)
10184 {
10185 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10186 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10187 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10188 && (INTVAL (mask)
10189 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10190 }
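/* For example, in SImode MASK == 0xff0 with SHFT_AMNT == 4 is accepted:
   (0xff0 >> 4) + 1 == 0x100 is a power of two and no mask bits lie below
   the shift, so (x << 4) & 0xff0 can become a single UBFIZ.  */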
10191
10192 /* Return true if the masks and a shift amount from an RTX of the form
10193 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10194 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10195
10196 bool
10197 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10198 unsigned HOST_WIDE_INT mask1,
10199 unsigned HOST_WIDE_INT shft_amnt,
10200 unsigned HOST_WIDE_INT mask2)
10201 {
10202 unsigned HOST_WIDE_INT t;
10203
10204 /* Verify that there is no overlap in what bits are set in the two masks. */
10205 if (mask1 != ~mask2)
10206 return false;
10207
10208 /* Verify that mask2 is not all zeros or ones. */
10209 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10210 return false;
10211
10212 /* The shift amount should always be less than the mode size. */
10213 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10214
10215 /* Verify that the mask being shifted is contiguous and would be in the
10216 least significant bits after shifting by shft_amnt. */
10217 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10218 return (t == (t & -t));
10219 }
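/* For example, MASK2 == 0xff00 with SHFT_AMNT == 8 and MASK1 == ~MASK2
   passes the checks above: 0xff00 + (1 << 8) == 0x10000 is a power of two,
   so the insertion is a contiguous 8-bit field at bit 8 and a single BFI
   suffices.  */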
10220
10221 /* Calculate the cost of calculating X, storing it in *COST. Result
10222 is true if the total cost of the operation has now been calculated. */
10223 static bool
10224 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10225 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10226 {
10227 rtx op0, op1, op2;
10228 const struct cpu_cost_table *extra_cost
10229 = aarch64_tune_params.insn_extra_cost;
10230 int code = GET_CODE (x);
10231 scalar_int_mode int_mode;
10232
10233 /* By default, assume that everything has equivalent cost to the
10234 cheapest instruction. Any additional costs are applied as a delta
10235 above this default. */
10236 *cost = COSTS_N_INSNS (1);
10237
10238 switch (code)
10239 {
10240 case SET:
10241 /* The cost depends entirely on the operands to SET. */
10242 *cost = 0;
10243 op0 = SET_DEST (x);
10244 op1 = SET_SRC (x);
10245
10246 switch (GET_CODE (op0))
10247 {
10248 case MEM:
10249 if (speed)
10250 {
10251 rtx address = XEXP (op0, 0);
10252 if (VECTOR_MODE_P (mode))
10253 *cost += extra_cost->ldst.storev;
10254 else if (GET_MODE_CLASS (mode) == MODE_INT)
10255 *cost += extra_cost->ldst.store;
10256 else if (mode == SFmode)
10257 *cost += extra_cost->ldst.storef;
10258 else if (mode == DFmode)
10259 *cost += extra_cost->ldst.stored;
10260
10261 *cost +=
10262 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10263 0, speed));
10264 }
10265
10266 *cost += rtx_cost (op1, mode, SET, 1, speed);
10267 return true;
10268
10269 case SUBREG:
10270 if (! REG_P (SUBREG_REG (op0)))
10271 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10272
10273 /* Fall through. */
10274 case REG:
10275 /* The cost is one per vector-register copied. */
10276 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10277 {
10278 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10279 *cost = COSTS_N_INSNS (nregs);
10280 }
10281 /* const0_rtx is in general free, but we will use an
10282 instruction to set a register to 0. */
10283 else if (REG_P (op1) || op1 == const0_rtx)
10284 {
10285 /* The cost is 1 per register copied. */
10286 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10287 *cost = COSTS_N_INSNS (nregs);
10288 }
10289 else
10290 /* Cost is just the cost of the RHS of the set. */
10291 *cost += rtx_cost (op1, mode, SET, 1, speed);
10292 return true;
10293
10294 case ZERO_EXTRACT:
10295 case SIGN_EXTRACT:
10296 /* Bit-field insertion. Strip any redundant widening of
10297 the RHS to meet the width of the target. */
10298 if (GET_CODE (op1) == SUBREG)
10299 op1 = SUBREG_REG (op1);
10300 if ((GET_CODE (op1) == ZERO_EXTEND
10301 || GET_CODE (op1) == SIGN_EXTEND)
10302 && CONST_INT_P (XEXP (op0, 1))
10303 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10304 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10305 op1 = XEXP (op1, 0);
10306
10307 if (CONST_INT_P (op1))
10308 {
10309 /* MOV immediate is assumed to always be cheap. */
10310 *cost = COSTS_N_INSNS (1);
10311 }
10312 else
10313 {
10314 /* BFM. */
10315 if (speed)
10316 *cost += extra_cost->alu.bfi;
10317 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10318 }
10319
10320 return true;
10321
10322 default:
10323 /* We can't make sense of this, assume default cost. */
10324 *cost = COSTS_N_INSNS (1);
10325 return false;
10326 }
10327 return false;
10328
10329 case CONST_INT:
10330 /* If an instruction can incorporate a constant within the
10331 instruction, the instruction's expression avoids calling
10332 rtx_cost() on the constant. If rtx_cost() is called on a
10333 constant, then it is usually because the constant must be
10334 moved into a register by one or more instructions.
10335
10336 The exception is constant 0, which can be expressed
10337 as XZR/WZR and is therefore free. The one caveat is that if we have
10338 (set (reg) (const0_rtx)), then we must cost
10339 the move. However, we can catch that when we cost the SET, so
10340 we don't need to consider that here. */
10341 if (x == const0_rtx)
10342 *cost = 0;
10343 else
10344 {
10345 /* To an approximation, building any other constant is
10346 proportionally expensive to the number of instructions
10347 required to build that constant. This is true whether we
10348 are compiling for SPEED or otherwise. */
10349 if (!is_a <scalar_int_mode> (mode, &int_mode))
10350 int_mode = word_mode;
10351 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10352 (NULL_RTX, x, false, int_mode));
10353 }
10354 return true;
10355
10356 case CONST_DOUBLE:
10357
10358 /* First determine number of instructions to do the move
10359 as an integer constant. */
10360 if (!aarch64_float_const_representable_p (x)
10361 && !aarch64_can_const_movi_rtx_p (x, mode)
10362 && aarch64_float_const_rtx_p (x))
10363 {
10364 unsigned HOST_WIDE_INT ival;
10365 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10366 gcc_assert (succeed);
10367
10368 scalar_int_mode imode = (mode == HFmode
10369 ? SImode
10370 : int_mode_for_mode (mode).require ());
10371 int ncost = aarch64_internal_mov_immediate
10372 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10373 *cost += COSTS_N_INSNS (ncost);
10374 return true;
10375 }
10376
10377 if (speed)
10378 {
10379 /* mov[df,sf]_aarch64. */
10380 if (aarch64_float_const_representable_p (x))
10381 /* FMOV (scalar immediate). */
10382 *cost += extra_cost->fp[mode == DFmode].fpconst;
10383 else if (!aarch64_float_const_zero_rtx_p (x))
10384 {
10385 /* This will be a load from memory. */
10386 if (mode == DFmode)
10387 *cost += extra_cost->ldst.loadd;
10388 else
10389 *cost += extra_cost->ldst.loadf;
10390 }
10391 else
10392 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10393 or MOV v0.s[0], wzr - neither of which are modeled by the
10394 cost tables. Just use the default cost. */
10395 {
10396 }
10397 }
10398
10399 return true;
10400
10401 case MEM:
10402 if (speed)
10403 {
10404 /* For loads we want the base cost of a load, plus an
10405 approximation for the additional cost of the addressing
10406 mode. */
10407 rtx address = XEXP (x, 0);
10408 if (VECTOR_MODE_P (mode))
10409 *cost += extra_cost->ldst.loadv;
10410 else if (GET_MODE_CLASS (mode) == MODE_INT)
10411 *cost += extra_cost->ldst.load;
10412 else if (mode == SFmode)
10413 *cost += extra_cost->ldst.loadf;
10414 else if (mode == DFmode)
10415 *cost += extra_cost->ldst.loadd;
10416
10417 *cost +=
10418 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10419 0, speed));
10420 }
10421
10422 return true;
10423
10424 case NEG:
10425 op0 = XEXP (x, 0);
10426
10427 if (VECTOR_MODE_P (mode))
10428 {
10429 if (speed)
10430 {
10431 /* FNEG. */
10432 *cost += extra_cost->vect.alu;
10433 }
10434 return false;
10435 }
10436
10437 if (GET_MODE_CLASS (mode) == MODE_INT)
10438 {
10439 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10440 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10441 {
10442 /* CSETM. */
10443 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10444 return true;
10445 }
10446
10447 /* Cost this as SUB wzr, X. */
10448 op0 = CONST0_RTX (mode);
10449 op1 = XEXP (x, 0);
10450 goto cost_minus;
10451 }
10452
10453 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10454 {
10455 /* Support (neg(fma...)) as a single instruction only if
10456 sign of zeros is unimportant. This matches the decision
10457 making in aarch64.md. */
10458 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10459 {
10460 /* FNMADD. */
10461 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10462 return true;
10463 }
10464 if (GET_CODE (op0) == MULT)
10465 {
10466 /* FNMUL. */
10467 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10468 return true;
10469 }
10470 if (speed)
10471 /* FNEG. */
10472 *cost += extra_cost->fp[mode == DFmode].neg;
10473 return false;
10474 }
10475
10476 return false;
10477
10478 case CLRSB:
10479 case CLZ:
10480 if (speed)
10481 {
10482 if (VECTOR_MODE_P (mode))
10483 *cost += extra_cost->vect.alu;
10484 else
10485 *cost += extra_cost->alu.clz;
10486 }
10487
10488 return false;
10489
10490 case COMPARE:
10491 op0 = XEXP (x, 0);
10492 op1 = XEXP (x, 1);
10493
10494 if (op1 == const0_rtx
10495 && GET_CODE (op0) == AND)
10496 {
10497 x = op0;
10498 mode = GET_MODE (op0);
10499 goto cost_logic;
10500 }
10501
10502 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10503 {
10504 /* TODO: A write to the CC flags possibly costs extra; this
10505 needs encoding in the cost tables. */
10506
10507 mode = GET_MODE (op0);
10508 /* ANDS. */
10509 if (GET_CODE (op0) == AND)
10510 {
10511 x = op0;
10512 goto cost_logic;
10513 }
10514
10515 if (GET_CODE (op0) == PLUS)
10516 {
10517 /* ADDS (and CMN alias). */
10518 x = op0;
10519 goto cost_plus;
10520 }
10521
10522 if (GET_CODE (op0) == MINUS)
10523 {
10524 /* SUBS. */
10525 x = op0;
10526 goto cost_minus;
10527 }
10528
10529 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10530 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10531 && CONST_INT_P (XEXP (op0, 2)))
10532 {
10533 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10534 Handle it here directly rather than going to cost_logic
10535 since we know the immediate generated for the TST is valid
10536 so we can avoid creating an intermediate rtx for it only
10537 for costing purposes. */
10538 if (speed)
10539 *cost += extra_cost->alu.logical;
10540
10541 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10542 ZERO_EXTRACT, 0, speed);
10543 return true;
10544 }
10545
10546 if (GET_CODE (op1) == NEG)
10547 {
10548 /* CMN. */
10549 if (speed)
10550 *cost += extra_cost->alu.arith;
10551
10552 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10553 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10554 return true;
10555 }
10556
10557 /* CMP.
10558
10559 Compare can freely swap the order of operands, and
10560 canonicalization puts the more complex operation first.
10561 But the integer MINUS logic expects the shift/extend
10562 operation in op1. */
10563 if (! (REG_P (op0)
10564 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10565 {
10566 op0 = XEXP (x, 1);
10567 op1 = XEXP (x, 0);
10568 }
10569 goto cost_minus;
10570 }
10571
10572 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10573 {
10574 /* FCMP. */
10575 if (speed)
10576 *cost += extra_cost->fp[mode == DFmode].compare;
10577
10578 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10579 {
10580 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10581 /* FCMP supports constant 0.0 for no extra cost. */
10582 return true;
10583 }
10584 return false;
10585 }
10586
10587 if (VECTOR_MODE_P (mode))
10588 {
10589 /* Vector compare. */
10590 if (speed)
10591 *cost += extra_cost->vect.alu;
10592
10593 if (aarch64_float_const_zero_rtx_p (op1))
10594 {
10595 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10596 cost. */
10597 return true;
10598 }
10599 return false;
10600 }
10601 return false;
10602
10603 case MINUS:
10604 {
10605 op0 = XEXP (x, 0);
10606 op1 = XEXP (x, 1);
10607
10608 cost_minus:
10609 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10610
10611 /* Detect valid immediates. */
10612 if ((GET_MODE_CLASS (mode) == MODE_INT
10613 || (GET_MODE_CLASS (mode) == MODE_CC
10614 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10615 && CONST_INT_P (op1)
10616 && aarch64_uimm12_shift (INTVAL (op1)))
10617 {
10618 if (speed)
10619 /* SUB(S) (immediate). */
10620 *cost += extra_cost->alu.arith;
10621 return true;
10622 }
10623
10624 /* Look for SUB (extended register). */
10625 if (is_a <scalar_int_mode> (mode, &int_mode)
10626 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10627 {
10628 if (speed)
10629 *cost += extra_cost->alu.extend_arith;
10630
10631 op1 = aarch64_strip_extend (op1, true);
10632 *cost += rtx_cost (op1, VOIDmode,
10633 (enum rtx_code) GET_CODE (op1), 0, speed);
10634 return true;
10635 }
10636
10637 rtx new_op1 = aarch64_strip_extend (op1, false);
10638
10639 /* Cost this as an FMA-alike operation. */
10640 if ((GET_CODE (new_op1) == MULT
10641 || aarch64_shift_p (GET_CODE (new_op1)))
10642 && code != COMPARE)
10643 {
10644 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10645 (enum rtx_code) code,
10646 speed);
10647 return true;
10648 }
10649
10650 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10651
10652 if (speed)
10653 {
10654 if (VECTOR_MODE_P (mode))
10655 {
10656 /* Vector SUB. */
10657 *cost += extra_cost->vect.alu;
10658 }
10659 else if (GET_MODE_CLASS (mode) == MODE_INT)
10660 {
10661 /* SUB(S). */
10662 *cost += extra_cost->alu.arith;
10663 }
10664 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10665 {
10666 /* FSUB. */
10667 *cost += extra_cost->fp[mode == DFmode].addsub;
10668 }
10669 }
10670 return true;
10671 }
10672
10673 case PLUS:
10674 {
10675 rtx new_op0;
10676
10677 op0 = XEXP (x, 0);
10678 op1 = XEXP (x, 1);
10679
10680 cost_plus:
10681 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10682 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10683 {
10684 /* CSINC. */
10685 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10686 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10687 return true;
10688 }
10689
10690 if (GET_MODE_CLASS (mode) == MODE_INT
10691 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10692 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10693 {
10694 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10695
10696 if (speed)
10697 /* ADD (immediate). */
10698 *cost += extra_cost->alu.arith;
10699 return true;
10700 }
10701
10702 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10703
10704 /* Look for ADD (extended register). */
10705 if (is_a <scalar_int_mode> (mode, &int_mode)
10706 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10707 {
10708 if (speed)
10709 *cost += extra_cost->alu.extend_arith;
10710
10711 op0 = aarch64_strip_extend (op0, true);
10712 *cost += rtx_cost (op0, VOIDmode,
10713 (enum rtx_code) GET_CODE (op0), 0, speed);
10714 return true;
10715 }
10716
10717 /* Strip any extend, leave shifts behind as we will
10718 cost them through mult_cost. */
10719 new_op0 = aarch64_strip_extend (op0, false);
10720
10721 if (GET_CODE (new_op0) == MULT
10722 || aarch64_shift_p (GET_CODE (new_op0)))
10723 {
10724 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10725 speed);
10726 return true;
10727 }
10728
10729 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10730
10731 if (speed)
10732 {
10733 if (VECTOR_MODE_P (mode))
10734 {
10735 /* Vector ADD. */
10736 *cost += extra_cost->vect.alu;
10737 }
10738 else if (GET_MODE_CLASS (mode) == MODE_INT)
10739 {
10740 /* ADD. */
10741 *cost += extra_cost->alu.arith;
10742 }
10743 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10744 {
10745 /* FADD. */
10746 *cost += extra_cost->fp[mode == DFmode].addsub;
10747 }
10748 }
10749 return true;
10750 }
10751
10752 case BSWAP:
10753 *cost = COSTS_N_INSNS (1);
10754
10755 if (speed)
10756 {
10757 if (VECTOR_MODE_P (mode))
10758 *cost += extra_cost->vect.alu;
10759 else
10760 *cost += extra_cost->alu.rev;
10761 }
10762 return false;
10763
10764 case IOR:
10765 if (aarch_rev16_p (x))
10766 {
10767 *cost = COSTS_N_INSNS (1);
10768
10769 if (speed)
10770 {
10771 if (VECTOR_MODE_P (mode))
10772 *cost += extra_cost->vect.alu;
10773 else
10774 *cost += extra_cost->alu.rev;
10775 }
10776 return true;
10777 }
10778
10779 if (aarch64_extr_rtx_p (x, &op0, &op1))
10780 {
10781 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10782 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10783 if (speed)
10784 *cost += extra_cost->alu.shift;
10785
10786 return true;
10787 }
10788 /* Fall through. */
10789 case XOR:
10790 case AND:
10791 cost_logic:
10792 op0 = XEXP (x, 0);
10793 op1 = XEXP (x, 1);
10794
10795 if (VECTOR_MODE_P (mode))
10796 {
10797 if (speed)
10798 *cost += extra_cost->vect.alu;
10799 return true;
10800 }
10801
10802 if (code == AND
10803 && GET_CODE (op0) == MULT
10804 && CONST_INT_P (XEXP (op0, 1))
10805 && CONST_INT_P (op1)
10806 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10807 INTVAL (op1)) != 0)
10808 {
10809 /* This is a UBFM/SBFM. */
10810 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10811 if (speed)
10812 *cost += extra_cost->alu.bfx;
10813 return true;
10814 }
10815
10816 if (is_int_mode (mode, &int_mode))
10817 {
10818 if (CONST_INT_P (op1))
10819 {
10820 /* We have a mask + shift version of a UBFIZ
10821 i.e. the *andim_ashift<mode>_bfiz pattern. */
10822 if (GET_CODE (op0) == ASHIFT
10823 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10824 XEXP (op0, 1)))
10825 {
10826 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10827 (enum rtx_code) code, 0, speed);
10828 if (speed)
10829 *cost += extra_cost->alu.bfx;
10830
10831 return true;
10832 }
10833 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10834 {
10835 /* We may get the immediate for free; this is not
10836 modelled. */
10837 *cost += rtx_cost (op0, int_mode,
10838 (enum rtx_code) code, 0, speed);
10839 if (speed)
10840 *cost += extra_cost->alu.logical;
10841
10842 return true;
10843 }
10844 }
10845 else
10846 {
10847 rtx new_op0 = op0;
10848
10849 /* Handle ORN, EON, or BIC. */
10850 if (GET_CODE (op0) == NOT)
10851 op0 = XEXP (op0, 0);
10852
10853 new_op0 = aarch64_strip_shift (op0);
10854
10855 /* If we had a shift on op0 then this is a logical-shift-
10856 by-register/immediate operation. Otherwise, this is just
10857 a logical operation. */
10858 if (speed)
10859 {
10860 if (new_op0 != op0)
10861 {
10862 /* Shift by immediate. */
10863 if (CONST_INT_P (XEXP (op0, 1)))
10864 *cost += extra_cost->alu.log_shift;
10865 else
10866 *cost += extra_cost->alu.log_shift_reg;
10867 }
10868 else
10869 *cost += extra_cost->alu.logical;
10870 }
10871
10872 /* In both cases we want to cost both operands. */
10873 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10874 0, speed);
10875 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10876 1, speed);
10877
10878 return true;
10879 }
10880 }
10881 return false;
10882
10883 case NOT:
10884 x = XEXP (x, 0);
10885 op0 = aarch64_strip_shift (x);
10886
10887 if (VECTOR_MODE_P (mode))
10888 {
10889 /* Vector NOT. */
10890 *cost += extra_cost->vect.alu;
10891 return false;
10892 }
10893
10894 /* MVN-shifted-reg. */
10895 if (op0 != x)
10896 {
10897 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10898
10899 if (speed)
10900 *cost += extra_cost->alu.log_shift;
10901
10902 return true;
10903 }
10904 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10905 Handle the second form here taking care that 'a' in the above can
10906 be a shift. */
10907 else if (GET_CODE (op0) == XOR)
10908 {
10909 rtx newop0 = XEXP (op0, 0);
10910 rtx newop1 = XEXP (op0, 1);
10911 rtx op0_stripped = aarch64_strip_shift (newop0);
10912
10913 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10914 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10915
10916 if (speed)
10917 {
10918 if (op0_stripped != newop0)
10919 *cost += extra_cost->alu.log_shift;
10920 else
10921 *cost += extra_cost->alu.logical;
10922 }
10923
10924 return true;
10925 }
10926 /* MVN. */
10927 if (speed)
10928 *cost += extra_cost->alu.logical;
10929
10930 return false;
10931
10932 case ZERO_EXTEND:
10933
10934 op0 = XEXP (x, 0);
10935 /* If a value is written in SI mode, then zero extended to DI
10936 mode, the operation will in general be free as a write to
10937 a 'w' register implicitly zeroes the upper bits of an 'x'
10938 register. However, if this is
10939
10940 (set (reg) (zero_extend (reg)))
10941
10942 we must cost the explicit register move. */
10943 if (mode == DImode
10944 && GET_MODE (op0) == SImode
10945 && outer == SET)
10946 {
10947 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10948
10949 /* If OP_COST is non-zero, then the cost of the zero extend
10950 is effectively the cost of the inner operation. Otherwise
10951 we have a MOV instruction and we take the cost from the MOV
10952 itself. This is true independently of whether we are
10953 optimizing for space or time. */
10954 if (op_cost)
10955 *cost = op_cost;
10956
10957 return true;
10958 }
10959 else if (MEM_P (op0))
10960 {
10961 /* All loads can zero extend to any size for free. */
10962 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10963 return true;
10964 }
10965
10966 op0 = aarch64_extend_bitfield_pattern_p (x);
10967 if (op0)
10968 {
10969 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10970 if (speed)
10971 *cost += extra_cost->alu.bfx;
10972 return true;
10973 }
10974
10975 if (speed)
10976 {
10977 if (VECTOR_MODE_P (mode))
10978 {
10979 /* UMOV. */
10980 *cost += extra_cost->vect.alu;
10981 }
10982 else
10983 {
10984 /* We generate an AND instead of UXTB/UXTH. */
10985 *cost += extra_cost->alu.logical;
10986 }
10987 }
10988 return false;
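/* For illustration (an assumed example, not from the original source):
   because a W-register write implicitly zeroes the upper 32 bits of the
   X register, a sequence such as

     ldr w0, [x1]     // SImode load, already zero-extended to DImode
     add w0, w0, w2   // result usable as DImode with no extra insn

   needs no separate extension, whereas a bare
   (set (reg:DI) (zero_extend:DI (reg:SI))) must be costed as the explicit
   "mov w0, w1" move, as handled above.  */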
10989
10990 case SIGN_EXTEND:
10991 if (MEM_P (XEXP (x, 0)))
10992 {
10993 /* LDRSH. */
10994 if (speed)
10995 {
10996 rtx address = XEXP (XEXP (x, 0), 0);
10997 *cost += extra_cost->ldst.load_sign_extend;
10998
10999 *cost +=
11000 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11001 0, speed));
11002 }
11003 return true;
11004 }
11005
11006 op0 = aarch64_extend_bitfield_pattern_p (x);
11007 if (op0)
11008 {
11009 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11010 if (speed)
11011 *cost += extra_cost->alu.bfx;
11012 return true;
11013 }
11014
11015 if (speed)
11016 {
11017 if (VECTOR_MODE_P (mode))
11018 *cost += extra_cost->vect.alu;
11019 else
11020 *cost += extra_cost->alu.extend;
11021 }
11022 return false;
11023
11024 case ASHIFT:
11025 op0 = XEXP (x, 0);
11026 op1 = XEXP (x, 1);
11027
11028 if (CONST_INT_P (op1))
11029 {
11030 if (speed)
11031 {
11032 if (VECTOR_MODE_P (mode))
11033 {
11034 /* Vector shift (immediate). */
11035 *cost += extra_cost->vect.alu;
11036 }
11037 else
11038 {
11039 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11040 aliases. */
11041 *cost += extra_cost->alu.shift;
11042 }
11043 }
11044
11045 /* We can incorporate zero/sign extend for free. */
11046 if (GET_CODE (op0) == ZERO_EXTEND
11047 || GET_CODE (op0) == SIGN_EXTEND)
11048 op0 = XEXP (op0, 0);
11049
11050 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11051 return true;
11052 }
11053 else
11054 {
11055 if (VECTOR_MODE_P (mode))
11056 {
11057 if (speed)
11058 /* Vector shift (register). */
11059 *cost += extra_cost->vect.alu;
11060 }
11061 else
11062 {
11063 if (speed)
11064 /* LSLV. */
11065 *cost += extra_cost->alu.shift_reg;
11066
11067 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11068 && CONST_INT_P (XEXP (op1, 1))
11069 && known_eq (INTVAL (XEXP (op1, 1)),
11070 GET_MODE_BITSIZE (mode) - 1))
11071 {
11072 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11073 /* We already demanded XEXP (op1, 0) to be REG_P, so
11074 don't recurse into it. */
11075 return true;
11076 }
11077 }
11078 return false; /* All arguments need to be in registers. */
11079 }
11080
11081 case ROTATE:
11082 case ROTATERT:
11083 case LSHIFTRT:
11084 case ASHIFTRT:
11085 op0 = XEXP (x, 0);
11086 op1 = XEXP (x, 1);
11087
11088 if (CONST_INT_P (op1))
11089 {
11090 /* ASR (immediate) and friends. */
11091 if (speed)
11092 {
11093 if (VECTOR_MODE_P (mode))
11094 *cost += extra_cost->vect.alu;
11095 else
11096 *cost += extra_cost->alu.shift;
11097 }
11098
11099 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11100 return true;
11101 }
11102 else
11103 {
11104 if (VECTOR_MODE_P (mode))
11105 {
11106 if (speed)
11107 /* Vector shift (register). */
11108 *cost += extra_cost->vect.alu;
11109 }
11110 else
11111 {
11112 if (speed)
11113 /* ASR (register) and friends. */
11114 *cost += extra_cost->alu.shift_reg;
11115
11116 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11117 && CONST_INT_P (XEXP (op1, 1))
11118 && known_eq (INTVAL (XEXP (op1, 1)),
11119 GET_MODE_BITSIZE (mode) - 1))
11120 {
11121 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11122 /* We already demanded XEXP (op1, 0) to be REG_P, so
11123 don't recurse into it. */
11124 return true;
11125 }
11126 }
11127 return false; /* All arguments need to be in registers. */
11128 }
11129
11130 case SYMBOL_REF:
11131
11132 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11133 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11134 {
11135 /* LDR. */
11136 if (speed)
11137 *cost += extra_cost->ldst.load;
11138 }
11139 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11140 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11141 {
11142 /* ADRP, followed by ADD. */
11143 *cost += COSTS_N_INSNS (1);
11144 if (speed)
11145 *cost += 2 * extra_cost->alu.arith;
11146 }
11147 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11148 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11149 {
11150 /* ADR. */
11151 if (speed)
11152 *cost += extra_cost->alu.arith;
11153 }
11154
11155 if (flag_pic)
11156 {
11157 /* One extra load instruction, after accessing the GOT. */
11158 *cost += COSTS_N_INSNS (1);
11159 if (speed)
11160 *cost += extra_cost->ldst.load;
11161 }
11162 return true;
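/* For illustration (a hedged sketch, registers chosen arbitrarily): under
   the small code model a symbol address is typically materialized as

     adrp x0, sym
     add  x0, x0, :lo12:sym

   hence the extra COSTS_N_INSNS (1) plus two arith costs above; with PIC
   the address is additionally loaded from the GOT, hence the extra load
   cost when flag_pic is set.  */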
11163
11164 case HIGH:
11165 case LO_SUM:
11166 /* ADRP/ADD (immediate). */
11167 if (speed)
11168 *cost += extra_cost->alu.arith;
11169 return true;
11170
11171 case ZERO_EXTRACT:
11172 case SIGN_EXTRACT:
11173 /* UBFX/SBFX. */
11174 if (speed)
11175 {
11176 if (VECTOR_MODE_P (mode))
11177 *cost += extra_cost->vect.alu;
11178 else
11179 *cost += extra_cost->alu.bfx;
11180 }
11181
11182 /* We can trust that the immediates used will be correct (there
11183 are no by-register forms), so we need only cost op0. */
11184 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11185 return true;
11186
11187 case MULT:
11188 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11189 /* aarch64_rtx_mult_cost always handles recursion to its
11190 operands. */
11191 return true;
11192
11193 case MOD:
11194 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11195 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same
11196 as that of an unconditional negate. This case should only ever be
11197 reached through the set_smod_pow2_cheap check in expmed.c. */
11198 if (CONST_INT_P (XEXP (x, 1))
11199 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11200 && (mode == SImode || mode == DImode))
11201 {
11202 /* We expand to 4 instructions. Reset the baseline. */
11203 *cost = COSTS_N_INSNS (4);
11204
11205 if (speed)
11206 *cost += 2 * extra_cost->alu.logical
11207 + 2 * extra_cost->alu.arith;
11208
11209 return true;
11210 }
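/* For illustration (a sketch, assuming SImode and x % 8): the expansion
   described above is roughly

     negs  w1, w0             // w1 = -x, set flags on -x
     and   w0, w0, #7         // positive-dividend case: x & 7
     and   w1, w1, #7         // negated-dividend case: (-x) & 7
     csneg w0, w0, w1, mi     // x > 0 ? (x & 7) : -((-x) & 7)

   i.e. four instructions, matching the COSTS_N_INSNS (4) baseline.  */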
11211
11212 /* Fall-through. */
11213 case UMOD:
11214 if (speed)
11215 {
11216 /* Slightly prefer UMOD over SMOD. */
11217 if (VECTOR_MODE_P (mode))
11218 *cost += extra_cost->vect.alu;
11219 else if (GET_MODE_CLASS (mode) == MODE_INT)
11220 *cost += (extra_cost->mult[mode == DImode].add
11221 + extra_cost->mult[mode == DImode].idiv
11222 + (code == MOD ? 1 : 0));
11223 }
11224 return false; /* All arguments need to be in registers. */
11225
11226 case DIV:
11227 case UDIV:
11228 case SQRT:
11229 if (speed)
11230 {
11231 if (VECTOR_MODE_P (mode))
11232 *cost += extra_cost->vect.alu;
11233 else if (GET_MODE_CLASS (mode) == MODE_INT)
11234 /* There is no integer SQRT, so only DIV and UDIV can get
11235 here. */
11236 *cost += (extra_cost->mult[mode == DImode].idiv
11237 /* Slightly prefer UDIV over SDIV. */
11238 + (code == DIV ? 1 : 0));
11239 else
11240 *cost += extra_cost->fp[mode == DFmode].div;
11241 }
11242 return false; /* All arguments need to be in registers. */
11243
11244 case IF_THEN_ELSE:
11245 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11246 XEXP (x, 2), cost, speed);
11247
11248 case EQ:
11249 case NE:
11250 case GT:
11251 case GTU:
11252 case LT:
11253 case LTU:
11254 case GE:
11255 case GEU:
11256 case LE:
11257 case LEU:
11258
11259 return false; /* All arguments must be in registers. */
11260
11261 case FMA:
11262 op0 = XEXP (x, 0);
11263 op1 = XEXP (x, 1);
11264 op2 = XEXP (x, 2);
11265
11266 if (speed)
11267 {
11268 if (VECTOR_MODE_P (mode))
11269 *cost += extra_cost->vect.alu;
11270 else
11271 *cost += extra_cost->fp[mode == DFmode].fma;
11272 }
11273
11274 /* FMSUB, FNMADD, and FNMSUB are free. */
11275 if (GET_CODE (op0) == NEG)
11276 op0 = XEXP (op0, 0);
11277
11278 if (GET_CODE (op2) == NEG)
11279 op2 = XEXP (op2, 0);
11280
11281 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11282 and the by-element operand as operand 0. */
11283 if (GET_CODE (op1) == NEG)
11284 op1 = XEXP (op1, 0);
11285
11286 /* Catch vector-by-element operations. The by-element operand can
11287 either be (vec_duplicate (vec_select (x))) or just
11288 (vec_select (x)), depending on whether we are multiplying by
11289 a vector or a scalar.
11290
11291 Canonicalization is not very good in these cases: FMA4 will put the
11292 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11293 if (GET_CODE (op0) == VEC_DUPLICATE)
11294 op0 = XEXP (op0, 0);
11295 else if (GET_CODE (op1) == VEC_DUPLICATE)
11296 op1 = XEXP (op1, 0);
11297
11298 if (GET_CODE (op0) == VEC_SELECT)
11299 op0 = XEXP (op0, 0);
11300 else if (GET_CODE (op1) == VEC_SELECT)
11301 op1 = XEXP (op1, 0);
11302
11303 /* If the remaining parameters are not registers,
11304 get the cost to put them into registers. */
11305 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11306 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11307 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11308 return true;
11309
11310 case FLOAT:
11311 case UNSIGNED_FLOAT:
11312 if (speed)
11313 *cost += extra_cost->fp[mode == DFmode].fromint;
11314 return false;
11315
11316 case FLOAT_EXTEND:
11317 if (speed)
11318 {
11319 if (VECTOR_MODE_P (mode))
11320 {
11321 /* Vector widening float conversion. */
11322 *cost += extra_cost->vect.alu;
11323 }
11324 else
11325 *cost += extra_cost->fp[mode == DFmode].widen;
11326 }
11327 return false;
11328
11329 case FLOAT_TRUNCATE:
11330 if (speed)
11331 {
11332 if (VECTOR_MODE_P (mode))
11333 {
11334 /* Vector narrowing float conversion. */
11335 *cost += extra_cost->vect.alu;
11336 }
11337 else
11338 *cost += extra_cost->fp[mode == DFmode].narrow;
11339 }
11340 return false;
11341
11342 case FIX:
11343 case UNSIGNED_FIX:
11344 x = XEXP (x, 0);
11345 /* Strip the rounding part. They will all be implemented
11346 by the fcvt* family of instructions anyway. */
11347 if (GET_CODE (x) == UNSPEC)
11348 {
11349 unsigned int uns_code = XINT (x, 1);
11350
11351 if (uns_code == UNSPEC_FRINTA
11352 || uns_code == UNSPEC_FRINTM
11353 || uns_code == UNSPEC_FRINTN
11354 || uns_code == UNSPEC_FRINTP
11355 || uns_code == UNSPEC_FRINTZ)
11356 x = XVECEXP (x, 0, 0);
11357 }
11358
11359 if (speed)
11360 {
11361 if (VECTOR_MODE_P (mode))
11362 *cost += extra_cost->vect.alu;
11363 else
11364 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11365 }
11366
11367 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11368 fixed-point fcvt. */
11369 if (GET_CODE (x) == MULT
11370 && ((VECTOR_MODE_P (mode)
11371 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11372 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11373 {
11374 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11375 0, speed);
11376 return true;
11377 }
11378
11379 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11380 return true;
11381
11382 case ABS:
11383 if (VECTOR_MODE_P (mode))
11384 {
11385 /* ABS (vector). */
11386 if (speed)
11387 *cost += extra_cost->vect.alu;
11388 }
11389 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11390 {
11391 op0 = XEXP (x, 0);
11392
11393 /* FABD, which is analogous to FADD. */
11394 if (GET_CODE (op0) == MINUS)
11395 {
11396 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11397 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11398 if (speed)
11399 *cost += extra_cost->fp[mode == DFmode].addsub;
11400
11401 return true;
11402 }
11403 /* Simple FABS is analogous to FNEG. */
11404 if (speed)
11405 *cost += extra_cost->fp[mode == DFmode].neg;
11406 }
11407 else
11408 {
11409 /* Integer ABS will either be split into
11410 two arithmetic instructions, or will be an ABS
11411 (scalar), which we don't model. */
11412 *cost = COSTS_N_INSNS (2);
11413 if (speed)
11414 *cost += 2 * extra_cost->alu.arith;
11415 }
11416 return false;
11417
11418 case SMAX:
11419 case SMIN:
11420 if (speed)
11421 {
11422 if (VECTOR_MODE_P (mode))
11423 *cost += extra_cost->vect.alu;
11424 else
11425 {
11426 /* FMAXNM/FMINNM/FMAX/FMIN.
11427 TODO: This may not be accurate for all implementations, but
11428 we do not model this in the cost tables. */
11429 *cost += extra_cost->fp[mode == DFmode].addsub;
11430 }
11431 }
11432 return false;
11433
11434 case UNSPEC:
11435 /* The floating point round to integer frint* instructions. */
11436 if (aarch64_frint_unspec_p (XINT (x, 1)))
11437 {
11438 if (speed)
11439 *cost += extra_cost->fp[mode == DFmode].roundint;
11440
11441 return false;
11442 }
11443
11444 if (XINT (x, 1) == UNSPEC_RBIT)
11445 {
11446 if (speed)
11447 *cost += extra_cost->alu.rev;
11448
11449 return false;
11450 }
11451 break;
11452
11453 case TRUNCATE:
11454
11455 /* Decompose <su>muldi3_highpart. */
11456 if (/* (truncate:DI */
11457 mode == DImode
11458 /* (lshiftrt:TI */
11459 && GET_MODE (XEXP (x, 0)) == TImode
11460 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11461 /* (mult:TI */
11462 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11463 /* (ANY_EXTEND:TI (reg:DI))
11464 (ANY_EXTEND:TI (reg:DI))) */
11465 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11466 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11467 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11468 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11469 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11470 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11471 /* (const_int 64) */
11472 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11473 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11474 {
11475 /* UMULH/SMULH. */
11476 if (speed)
11477 *cost += extra_cost->mult[mode == DImode].extend;
11478 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11479 mode, MULT, 0, speed);
11480 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11481 mode, MULT, 1, speed);
11482 return true;
11483 }
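/* For reference, assembling the piecewise checks above: the matched RTL has
   the shape

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (sign_extend:TI (reg:DI x))
                  (sign_extend:TI (reg:DI y)))
         (const_int 64)))

   (or the ZERO_EXTEND variant), i.e. the high 64 bits of a widening
   64x64->128-bit multiply, which maps to a single SMULH/UMULH.  */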
11484
11485 /* Fall through. */
11486 default:
11487 break;
11488 }
11489
11490 if (dump_file
11491 && flag_aarch64_verbose_cost)
11492 fprintf (dump_file,
11493 "\nFailed to cost RTX. Assuming default cost.\n");
11494
11495 return true;
11496 }
11497
11498 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
11499 calculated for X. This cost is stored in *COST. Returns true
11500 if the total cost of X was calculated. */
11501 static bool
11502 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11503 int param, int *cost, bool speed)
11504 {
11505 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11506
11507 if (dump_file
11508 && flag_aarch64_verbose_cost)
11509 {
11510 print_rtl_single (dump_file, x);
11511 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11512 speed ? "Hot" : "Cold",
11513 *cost, result ? "final" : "partial");
11514 }
11515
11516 return result;
11517 }
11518
11519 static int
11520 aarch64_register_move_cost (machine_mode mode,
11521 reg_class_t from_i, reg_class_t to_i)
11522 {
11523 enum reg_class from = (enum reg_class) from_i;
11524 enum reg_class to = (enum reg_class) to_i;
11525 const struct cpu_regmove_cost *regmove_cost
11526 = aarch64_tune_params.regmove_cost;
11527
11528 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11529 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11530 to = GENERAL_REGS;
11531
11532 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11533 from = GENERAL_REGS;
11534
11535 /* Moving between a GPR and the stack register costs the same as GP2GP. */
11536 if ((from == GENERAL_REGS && to == STACK_REG)
11537 || (to == GENERAL_REGS && from == STACK_REG))
11538 return regmove_cost->GP2GP;
11539
11540 /* To/From the stack register, we move via the gprs. */
11541 if (to == STACK_REG || from == STACK_REG)
11542 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11543 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11544
11545 if (known_eq (GET_MODE_SIZE (mode), 16))
11546 {
11547 /* 128-bit operations on general registers require 2 instructions. */
11548 if (from == GENERAL_REGS && to == GENERAL_REGS)
11549 return regmove_cost->GP2GP * 2;
11550 else if (from == GENERAL_REGS)
11551 return regmove_cost->GP2FP * 2;
11552 else if (to == GENERAL_REGS)
11553 return regmove_cost->FP2GP * 2;
11554
11555 /* When AdvSIMD instructions are disabled it is not possible to move
11556 a 128-bit value directly between Q registers. This is handled in
11557 secondary reload. A general register is used as a scratch to move
11558 the upper DI value and the lower DI value is moved directly,
11559 hence the cost is the sum of three moves. */
11560 if (! TARGET_SIMD)
11561 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11562
11563 return regmove_cost->FP2FP;
11564 }
11565
11566 if (from == GENERAL_REGS && to == GENERAL_REGS)
11567 return regmove_cost->GP2GP;
11568 else if (from == GENERAL_REGS)
11569 return regmove_cost->GP2FP;
11570 else if (to == GENERAL_REGS)
11571 return regmove_cost->FP2GP;
11572
11573 return regmove_cost->FP2FP;
11574 }
11575
11576 static int
11577 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11578 reg_class_t rclass ATTRIBUTE_UNUSED,
11579 bool in ATTRIBUTE_UNUSED)
11580 {
11581 return aarch64_tune_params.memmov_cost;
11582 }
11583
11584 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11585 to optimize 1.0/sqrt. */
11586
11587 static bool
11588 use_rsqrt_p (machine_mode mode)
11589 {
11590 return (!flag_trapping_math
11591 && flag_unsafe_math_optimizations
11592 && ((aarch64_tune_params.approx_modes->recip_sqrt
11593 & AARCH64_APPROX_MODE (mode))
11594 || flag_mrecip_low_precision_sqrt));
11595 }
11596
11597 /* Function to decide when to use the approximate reciprocal square root
11598 builtin. */
11599
11600 static tree
11601 aarch64_builtin_reciprocal (tree fndecl)
11602 {
11603 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11604
11605 if (!use_rsqrt_p (mode))
11606 return NULL_TREE;
11607 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11608 }
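/* For illustration (a sketch of the intended effect, not a verbatim
   transformation): with -funsafe-math-optimizations, no trapping math and a
   tuning (or -mlow-precision-recip-sqrt) that approves the mode, an
   expression such as 1.0f / __builtin_sqrtf (x) can be rewritten to use the
   rsqrt builtin returned here, which expands via FRSQRTE plus Newton-Raphson
   steps instead of an FSQRT followed by an FDIV.  */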
11609
11610 /* Emit instruction sequence to compute either the approximate square root
11611 or its approximate reciprocal, depending on the flag RECP, and return
11612 whether the sequence was emitted or not. */
11613
11614 bool
11615 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11616 {
11617 machine_mode mode = GET_MODE (dst);
11618
11619 if (GET_MODE_INNER (mode) == HFmode)
11620 {
11621 gcc_assert (!recp);
11622 return false;
11623 }
11624
11625 if (!recp)
11626 {
11627 if (!(flag_mlow_precision_sqrt
11628 || (aarch64_tune_params.approx_modes->sqrt
11629 & AARCH64_APPROX_MODE (mode))))
11630 return false;
11631
11632 if (flag_finite_math_only
11633 || flag_trapping_math
11634 || !flag_unsafe_math_optimizations
11635 || optimize_function_for_size_p (cfun))
11636 return false;
11637 }
11638 else
11639 /* Caller assumes we cannot fail. */
11640 gcc_assert (use_rsqrt_p (mode));
11641
11642 machine_mode mmsk = mode_for_int_vector (mode).require ();
11643 rtx xmsk = gen_reg_rtx (mmsk);
11644 if (!recp)
11645 /* When calculating the approximate square root, compare the
11646 argument with 0.0 and create a mask. */
11647 emit_insn (gen_rtx_SET (xmsk,
11648 gen_rtx_NEG (mmsk,
11649 gen_rtx_EQ (mmsk, src,
11650 CONST0_RTX (mode)))));
11651
11652 /* Estimate the approximate reciprocal square root. */
11653 rtx xdst = gen_reg_rtx (mode);
11654 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11655
11656 /* Iterate over the series twice for SF and thrice for DF. */
11657 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11658
11659 /* Optionally iterate over the series once less for faster performance,
11660 at the cost of some accuracy. */
11661 if ((recp && flag_mrecip_low_precision_sqrt)
11662 || (!recp && flag_mlow_precision_sqrt))
11663 iterations--;
11664
11665 /* Iterate over the series to calculate the approximate reciprocal square
11666 root. */
11667 rtx x1 = gen_reg_rtx (mode);
11668 while (iterations--)
11669 {
11670 rtx x2 = gen_reg_rtx (mode);
11671 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11672
11673 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11674
11675 if (iterations > 0)
11676 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11677 }
11678
11679 if (!recp)
11680 {
11681 /* Qualify the approximate reciprocal square root when the argument is
11682 0.0 by squashing the intermediate result to 0.0. */
11683 rtx xtmp = gen_reg_rtx (mmsk);
11684 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11685 gen_rtx_SUBREG (mmsk, xdst, 0)));
11686 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11687
11688 /* Calculate the approximate square root. */
11689 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11690 }
11691
11692 /* Finalize the approximation. */
11693 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11694
11695 return true;
11696 }
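/* For reference, a worked form of the iteration emitted above: FRSQRTE
   provides an initial estimate x0 ~= 1/sqrt(d) and each FRSQRTS step
   computes (3 - d * xn * xn) / 2, so the refinement is the Newton-Raphson
   iteration

     x(n+1) = xn * (3 - d * xn^2) / 2

   run twice for SF and three times for DF (one fewer with the low-precision
   flags).  For the square root itself the final estimate is also multiplied
   by d, since sqrt(d) = d * (1/sqrt(d)), with the zero-input case masked
   off as above.  */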
11697
11698 /* Emit the instruction sequence to compute the approximation for the division
11699 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11700
11701 bool
11702 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11703 {
11704 machine_mode mode = GET_MODE (quo);
11705
11706 if (GET_MODE_INNER (mode) == HFmode)
11707 return false;
11708
11709 bool use_approx_division_p = (flag_mlow_precision_div
11710 || (aarch64_tune_params.approx_modes->division
11711 & AARCH64_APPROX_MODE (mode)));
11712
11713 if (!flag_finite_math_only
11714 || flag_trapping_math
11715 || !flag_unsafe_math_optimizations
11716 || optimize_function_for_size_p (cfun)
11717 || !use_approx_division_p)
11718 return false;
11719
11720 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11721 return false;
11722
11723 /* Estimate the approximate reciprocal. */
11724 rtx xrcp = gen_reg_rtx (mode);
11725 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11726
11727 /* Iterate over the series twice for SF and thrice for DF. */
11728 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11729
11730 /* Optionally iterate over the series once less for faster performance,
11731 at the cost of some accuracy. */
11732 if (flag_mlow_precision_div)
11733 iterations--;
11734
11735 /* Iterate over the series to calculate the approximate reciprocal. */
11736 rtx xtmp = gen_reg_rtx (mode);
11737 while (iterations--)
11738 {
11739 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11740
11741 if (iterations > 0)
11742 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11743 }
11744
11745 if (num != CONST1_RTX (mode))
11746 {
11747 /* As the approximate reciprocal of DEN is already calculated, only
11748 calculate the approximate division when NUM is not 1.0. */
11749 rtx xnum = force_reg (mode, num);
11750 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11751 }
11752
11753 /* Finalize the approximation. */
11754 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11755 return true;
11756 }
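/* For reference, a worked form of the iteration emitted above: FRECPE
   provides an initial estimate x0 ~= 1/den and each FRECPS step computes
   2 - den * xn, so the refinement is the Newton-Raphson iteration

     x(n+1) = xn * (2 - den * xn)

   run twice for SF and three times for DF (one fewer with
   -mlow-precision-div).  The quotient is then num * (1/den), with the final
   multiply by the last FRECPS result folded into the assignment to QUO.  */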
11757
11758 /* Return the number of instructions that can be issued per cycle. */
11759 static int
11760 aarch64_sched_issue_rate (void)
11761 {
11762 return aarch64_tune_params.issue_rate;
11763 }
11764
11765 static int
11766 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11767 {
11768 int issue_rate = aarch64_sched_issue_rate ();
11769
11770 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11771 }
11772
11773
11774 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11775 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11776 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11777
11778 static int
11779 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11780 int ready_index)
11781 {
11782 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11783 }
11784
11785
11786 /* Vectorizer cost model target hooks. */
11787
11788 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11789 static int
11790 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11791 tree vectype,
11792 int misalign ATTRIBUTE_UNUSED)
11793 {
11794 unsigned elements;
11795 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11796 bool fp = false;
11797
11798 if (vectype != NULL)
11799 fp = FLOAT_TYPE_P (vectype);
11800
11801 switch (type_of_cost)
11802 {
11803 case scalar_stmt:
11804 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11805
11806 case scalar_load:
11807 return costs->scalar_load_cost;
11808
11809 case scalar_store:
11810 return costs->scalar_store_cost;
11811
11812 case vector_stmt:
11813 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11814
11815 case vector_load:
11816 return costs->vec_align_load_cost;
11817
11818 case vector_store:
11819 return costs->vec_store_cost;
11820
11821 case vec_to_scalar:
11822 return costs->vec_to_scalar_cost;
11823
11824 case scalar_to_vec:
11825 return costs->scalar_to_vec_cost;
11826
11827 case unaligned_load:
11828 case vector_gather_load:
11829 return costs->vec_unalign_load_cost;
11830
11831 case unaligned_store:
11832 case vector_scatter_store:
11833 return costs->vec_unalign_store_cost;
11834
11835 case cond_branch_taken:
11836 return costs->cond_taken_branch_cost;
11837
11838 case cond_branch_not_taken:
11839 return costs->cond_not_taken_branch_cost;
11840
11841 case vec_perm:
11842 return costs->vec_permute_cost;
11843
11844 case vec_promote_demote:
11845 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11846
11847 case vec_construct:
11848 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11849 return elements / 2 + 1;
11850
11851 default:
11852 gcc_unreachable ();
11853 }
11854 }
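/* Worked example for the vec_construct case above: building a vector with
   four elements (TYPE_VECTOR_SUBPARTS == 4) returns a cost of 4 / 2 + 1 = 3;
   for variable-length SVE modes, estimated_poly_value first picks a
   representative element count.  */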
11855
11856 /* Implement targetm.vectorize.add_stmt_cost. */
11857 static unsigned
11858 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11859 struct _stmt_vec_info *stmt_info, int misalign,
11860 enum vect_cost_model_location where)
11861 {
11862 unsigned *cost = (unsigned *) data;
11863 unsigned retval = 0;
11864
11865 if (flag_vect_cost_model)
11866 {
11867 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11868 int stmt_cost =
11869 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11870
11871 /* Statements in an inner loop relative to the loop being
11872 vectorized are weighted more heavily. The value here is
11873 arbitrary and could potentially be improved with analysis. */
11874 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11875 count *= 50; /* FIXME */
11876
11877 retval = (unsigned) (count * stmt_cost);
11878 cost[where] += retval;
11879 }
11880
11881 return retval;
11882 }
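/* Worked example for the hook above: a vector_stmt of cost 1 occurring
   twice (COUNT == 2) in the body of an inner loop relative to the loop
   being vectorized is scaled by 50, so 2 * 50 * 1 = 100 is added to
   cost[vect_body] and returned.  */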
11883
11884 static void initialize_aarch64_code_model (struct gcc_options *);
11885
11886 /* Parse the TO_PARSE string and put the architecture struct that it
11887 selects into RES and the architectural features into ISA_FLAGS.
11888 Return an aarch64_parse_opt_result describing the parse result.
11889 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11890 When the TO_PARSE string contains an invalid extension,
11891 a copy of the string is created and stored to INVALID_EXTENSION. */
11892
11893 static enum aarch64_parse_opt_result
11894 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11895 uint64_t *isa_flags, std::string *invalid_extension)
11896 {
11897 const char *ext;
11898 const struct processor *arch;
11899 size_t len;
11900
11901 ext = strchr (to_parse, '+');
11902
11903 if (ext != NULL)
11904 len = ext - to_parse;
11905 else
11906 len = strlen (to_parse);
11907
11908 if (len == 0)
11909 return AARCH64_PARSE_MISSING_ARG;
11910
11911
11912 /* Loop through the list of supported ARCHes to find a match. */
11913 for (arch = all_architectures; arch->name != NULL; arch++)
11914 {
11915 if (strlen (arch->name) == len
11916 && strncmp (arch->name, to_parse, len) == 0)
11917 {
11918 uint64_t isa_temp = arch->flags;
11919
11920 if (ext != NULL)
11921 {
11922 /* TO_PARSE string contains at least one extension. */
11923 enum aarch64_parse_opt_result ext_res
11924 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11925
11926 if (ext_res != AARCH64_PARSE_OK)
11927 return ext_res;
11928 }
11929 /* Extension parsing was successful. Confirm the result
11930 arch and ISA flags. */
11931 *res = arch;
11932 *isa_flags = isa_temp;
11933 return AARCH64_PARSE_OK;
11934 }
11935 }
11936
11937 /* ARCH name not found in list. */
11938 return AARCH64_PARSE_INVALID_ARG;
11939 }
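/* For illustration (a hypothetical command line): given
   -march=armv8.2-a+crypto, TO_PARSE is "armv8.2-a+crypto"; LEN covers
   "armv8.2-a", which is looked up in all_architectures, and the remainder
   "+crypto" is handed to aarch64_parse_extension to adjust the ISA flags.  */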
11940
11941 /* Parse the TO_PARSE string and put the CPU that it selects into RES and the
11942 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11943 describing the parse result. If there is an error parsing, RES and
11944 ISA_FLAGS are left unchanged.
11945 When the TO_PARSE string contains an invalid extension,
11946 a copy of the string is created and stored to INVALID_EXTENSION. */
11947
11948 static enum aarch64_parse_opt_result
11949 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11950 uint64_t *isa_flags, std::string *invalid_extension)
11951 {
11952 const char *ext;
11953 const struct processor *cpu;
11954 size_t len;
11955
11956 ext = strchr (to_parse, '+');
11957
11958 if (ext != NULL)
11959 len = ext - to_parse;
11960 else
11961 len = strlen (to_parse);
11962
11963 if (len == 0)
11964 return AARCH64_PARSE_MISSING_ARG;
11965
11966
11967 /* Loop through the list of supported CPUs to find a match. */
11968 for (cpu = all_cores; cpu->name != NULL; cpu++)
11969 {
11970 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11971 {
11972 uint64_t isa_temp = cpu->flags;
11973
11974
11975 if (ext != NULL)
11976 {
11977 /* TO_PARSE string contains at least one extension. */
11978 enum aarch64_parse_opt_result ext_res
11979 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11980
11981 if (ext_res != AARCH64_PARSE_OK)
11982 return ext_res;
11983 }
11984 /* Extension parsing was successful. Confirm the result
11985 cpu and ISA flags. */
11986 *res = cpu;
11987 *isa_flags = isa_temp;
11988 return AARCH64_PARSE_OK;
11989 }
11990 }
11991
11992 /* CPU name not found in list. */
11993 return AARCH64_PARSE_INVALID_ARG;
11994 }
11995
11996 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11997 Return an aarch64_parse_opt_result describing the parse result.
11998 If the parsing fails, RES does not change. */
11999
12000 static enum aarch64_parse_opt_result
12001 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12002 {
12003 const struct processor *cpu;
12004
12005 /* Loop through the list of supported CPUs to find a match. */
12006 for (cpu = all_cores; cpu->name != NULL; cpu++)
12007 {
12008 if (strcmp (cpu->name, to_parse) == 0)
12009 {
12010 *res = cpu;
12011 return AARCH64_PARSE_OK;
12012 }
12013 }
12014
12015 /* CPU name not found in list. */
12016 return AARCH64_PARSE_INVALID_ARG;
12017 }
12018
12019 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12020 described in FLAG. If it is, return the index bit for that fusion type.
12021 If not, error (printing OPTION_NAME) and return zero. */
12022
12023 static unsigned int
12024 aarch64_parse_one_option_token (const char *token,
12025 size_t length,
12026 const struct aarch64_flag_desc *flag,
12027 const char *option_name)
12028 {
12029 for (; flag->name != NULL; flag++)
12030 {
12031 if (length == strlen (flag->name)
12032 && !strncmp (flag->name, token, length))
12033 return flag->flag;
12034 }
12035
12036 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12037 return 0;
12038 }
12039
12040 /* Parse OPTION, which is a '.'-separated list of flags to enable.
12041 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12042 default state we inherit from the CPU tuning structures. OPTION_NAME
12043 gives the top-level option we are parsing in the -moverride string,
12044 for use in error messages. */
12045
12046 static unsigned int
12047 aarch64_parse_boolean_options (const char *option,
12048 const struct aarch64_flag_desc *flags,
12049 unsigned int initial_state,
12050 const char *option_name)
12051 {
12052 const char separator = '.';
12053 const char* specs = option;
12054 const char* ntoken = option;
12055 unsigned int found_flags = initial_state;
12056
12057 while ((ntoken = strchr (specs, separator)))
12058 {
12059 size_t token_length = ntoken - specs;
12060 unsigned token_ops = aarch64_parse_one_option_token (specs,
12061 token_length,
12062 flags,
12063 option_name);
12064 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12065 in the token stream, reset the supported operations. So:
12066
12067 adrp+add.cmp+branch.none.adrp+add
12068
12069 would have the result of turning on only adrp+add fusion. */
12070 if (!token_ops)
12071 found_flags = 0;
12072
12073 found_flags |= token_ops;
12074 specs = ++ntoken;
12075 }
12076
12077 /* The string ended with a separator; report the ill-formed option. */
12078 if (!(*specs))
12079 {
12080 error ("%s string ill-formed\n", option_name);
12081 return 0;
12082 }
12083
12084 /* We still have one more token to parse. */
12085 size_t token_length = strlen (specs);
12086 unsigned token_ops = aarch64_parse_one_option_token (specs,
12087 token_length,
12088 flags,
12089 option_name);
12090 if (!token_ops)
12091 found_flags = 0;
12092
12093 found_flags |= token_ops;
12094 return found_flags;
12095 }
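/* For illustration (a hypothetical -moverride value): fuse=adrp+add.cmp+branch
   is split at '.' into the tokens "adrp+add" and "cmp+branch", each mapped to
   its flag bit by aarch64_parse_one_option_token and ORed into the result; a
   "none" token along the way resets the accumulated flags, as in the
   adrp+add.cmp+branch.none.adrp+add example above.  */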
12096
12097 /* Support for overriding instruction fusion. */
12098
12099 static void
12100 aarch64_parse_fuse_string (const char *fuse_string,
12101 struct tune_params *tune)
12102 {
12103 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12104 aarch64_fusible_pairs,
12105 tune->fusible_ops,
12106 "fuse=");
12107 }
12108
12109 /* Support for overriding other tuning flags. */
12110
12111 static void
12112 aarch64_parse_tune_string (const char *tune_string,
12113 struct tune_params *tune)
12114 {
12115 tune->extra_tuning_flags
12116 = aarch64_parse_boolean_options (tune_string,
12117 aarch64_tuning_flags,
12118 tune->extra_tuning_flags,
12119 "tune=");
12120 }
12121
12122 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
12123 Accept the valid SVE vector widths allowed by
12124 aarch64_sve_vector_bits_enum and use the value to override sve_width
12125 in TUNE. */
12126
12127 static void
12128 aarch64_parse_sve_width_string (const char *tune_string,
12129 struct tune_params *tune)
12130 {
12131 int width = -1;
12132
12133 int n = sscanf (tune_string, "%d", &width);
12134 if (n == EOF)
12135 {
12136 error ("invalid format for sve_width");
12137 return;
12138 }
12139 switch (width)
12140 {
12141 case SVE_128:
12142 case SVE_256:
12143 case SVE_512:
12144 case SVE_1024:
12145 case SVE_2048:
12146 break;
12147 default:
12148 error ("invalid sve_width value: %d", width);
12149 }
12150 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12151 }
12152
12153 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12154 we understand. If it is, extract the option string and hand it off to
12155 the appropriate function. */
12156
12157 void
12158 aarch64_parse_one_override_token (const char* token,
12159 size_t length,
12160 struct tune_params *tune)
12161 {
12162 const struct aarch64_tuning_override_function *fn
12163 = aarch64_tuning_override_functions;
12164
12165 const char *option_part = strchr (token, '=');
12166 if (!option_part)
12167 {
12168 error ("tuning string missing in option (%s)", token);
12169 return;
12170 }
12171
12172 /* Get the length of the option name. */
12173 length = option_part - token;
12174 /* Skip the '=' to get to the option string. */
12175 option_part++;
12176
12177 for (; fn->name != NULL; fn++)
12178 {
12179 if (!strncmp (fn->name, token, length))
12180 {
12181 fn->parse_override (option_part, tune);
12182 return;
12183 }
12184 }
12185
12186 error ("unknown tuning option (%s)",token);
12187 return;
12188 }
12189
12190 /* Validate and clamp the TLS size according to the code model in OPTS. */
12191
12192 static void
12193 initialize_aarch64_tls_size (struct gcc_options *opts)
12194 {
12195 if (aarch64_tls_size == 0)
12196 aarch64_tls_size = 24;
12197
12198 switch (opts->x_aarch64_cmodel_var)
12199 {
12200 case AARCH64_CMODEL_TINY:
12201 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12202 needs two instructions to address, so we clamp the size to 24. */
12203 if (aarch64_tls_size > 24)
12204 aarch64_tls_size = 24;
12205 break;
12206 case AARCH64_CMODEL_SMALL:
12207 /* The maximum TLS size allowed under small is 4G. */
12208 if (aarch64_tls_size > 32)
12209 aarch64_tls_size = 32;
12210 break;
12211 case AARCH64_CMODEL_LARGE:
12212 /* The maximum TLS size allowed under large is 16E.
12213 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12214 if (aarch64_tls_size > 48)
12215 aarch64_tls_size = 48;
12216 break;
12217 default:
12218 gcc_unreachable ();
12219 }
12220
12221 return;
12222 }
12223
12224 /* Parse STRING looking for options in the format:
12225 string :: option:string
12226 option :: name=substring
12227 name :: {a-z}
12228 substring :: defined by option. */
12229
12230 static void
12231 aarch64_parse_override_string (const char* input_string,
12232 struct tune_params* tune)
12233 {
12234 const char separator = ':';
12235 size_t string_length = strlen (input_string) + 1;
12236 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12237 char *string = string_root;
12238 strncpy (string, input_string, string_length);
12239 string[string_length - 1] = '\0';
12240
12241 char* ntoken = string;
12242
12243 while ((ntoken = strchr (string, separator)))
12244 {
12245 size_t token_length = ntoken - string;
12246 /* Make this substring look like a string. */
12247 *ntoken = '\0';
12248 aarch64_parse_one_override_token (string, token_length, tune);
12249 string = ++ntoken;
12250 }
12251
12252 /* One last option to parse. */
12253 aarch64_parse_one_override_token (string, strlen (string), tune);
12254 free (string_root);
12255 }
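/* For illustration (a hypothetical command line):
   -moverride=fuse=adrp+add.cmp+branch:sve_width=256 is split at ':' into the
   tokens "fuse=adrp+add.cmp+branch" and "sve_width=256", each of which is
   passed to aarch64_parse_one_override_token and dispatched to the matching
   parse_override handler.  */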
12256
12257
12258 static void
12259 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12260 {
12261 if (accepted_branch_protection_string)
12262 {
12263 opts->x_aarch64_branch_protection_string
12264 = xstrdup (accepted_branch_protection_string);
12265 }
12266
12267 /* PR 70044: We have to be careful about being called multiple times for the
12268 same function. This means all changes should be repeatable. */
12269
12270 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12271 Disable the frame pointer flag so the mid-end will not use a frame
12272 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12273 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12274 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12275 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12276 if (opts->x_flag_omit_frame_pointer == 0)
12277 opts->x_flag_omit_frame_pointer = 2;
12278
12279 /* If not optimizing for size, set the default
12280 alignment to what the target wants. */
12281 if (!opts->x_optimize_size)
12282 {
12283 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12284 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12285 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12286 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12287 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12288 opts->x_str_align_functions = aarch64_tune_params.function_align;
12289 }
12290
12291 /* We default to no pc-relative literal loads. */
12292
12293 aarch64_pcrelative_literal_loads = false;
12294
12295 /* If -mpc-relative-literal-loads is set on the command line, this
12296 implies that the user asked for PC relative literal loads. */
12297 if (opts->x_pcrelative_literal_loads == 1)
12298 aarch64_pcrelative_literal_loads = true;
12299
12300 /* In the tiny memory model it makes no sense to disallow PC relative
12301 literal pool loads. */
12302 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12303 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12304 aarch64_pcrelative_literal_loads = true;
12305
12306 /* When enabling the lower precision Newton series for the square root, also
12307 enable it for the reciprocal square root, since the latter is an
12308 intermediary step for the former. */
12309 if (flag_mlow_precision_sqrt)
12310 flag_mrecip_low_precision_sqrt = true;
12311 }
12312
12313 /* 'Unpack' the internal tuning structs and update the options
12314 in OPTS. The caller must have set up selected_tune and selected_arch
12315 as all the other target-specific codegen decisions are
12316 derived from them. */
12317
12318 void
12319 aarch64_override_options_internal (struct gcc_options *opts)
12320 {
12321 aarch64_tune_flags = selected_tune->flags;
12322 aarch64_tune = selected_tune->sched_core;
12323 /* Make a copy of the tuning parameters attached to the core, which
12324 we may later overwrite. */
12325 aarch64_tune_params = *(selected_tune->tune);
12326 aarch64_architecture_version = selected_arch->architecture_version;
12327
12328 if (opts->x_aarch64_override_tune_string)
12329 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12330 &aarch64_tune_params);
12331
12332 /* This target defaults to strict volatile bitfields. */
12333 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12334 opts->x_flag_strict_volatile_bitfields = 1;
12335
12336 if (aarch64_stack_protector_guard == SSP_GLOBAL
12337 && opts->x_aarch64_stack_protector_guard_offset_str)
12338 {
12339 error ("incompatible options %<-mstack-protector-guard=global%> and "
12340 "%<-mstack-protector-guard-offset=%s%>",
12341 aarch64_stack_protector_guard_offset_str);
12342 }
12343
12344 if (aarch64_stack_protector_guard == SSP_SYSREG
12345 && !(opts->x_aarch64_stack_protector_guard_offset_str
12346 && opts->x_aarch64_stack_protector_guard_reg_str))
12347 {
12348 error ("both %<-mstack-protector-guard-offset%> and "
12349 "%<-mstack-protector-guard-reg%> must be used "
12350 "with %<-mstack-protector-guard=sysreg%>");
12351 }
12352
12353 if (opts->x_aarch64_stack_protector_guard_reg_str)
12354 {
12355 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12356 error ("specify a system register with a small string length.");
12357 }
12358
12359 if (opts->x_aarch64_stack_protector_guard_offset_str)
12360 {
12361 char *end;
12362 const char *str = aarch64_stack_protector_guard_offset_str;
12363 errno = 0;
12364 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12365 if (!*str || *end || errno)
12366 error ("%qs is not a valid offset in %qs", str,
12367 "-mstack-protector-guard-offset=");
12368 aarch64_stack_protector_guard_offset = offs;
12369 }
12370
12371 initialize_aarch64_code_model (opts);
12372 initialize_aarch64_tls_size (opts);
12373
12374 int queue_depth = 0;
12375 switch (aarch64_tune_params.autoprefetcher_model)
12376 {
12377 case tune_params::AUTOPREFETCHER_OFF:
12378 queue_depth = -1;
12379 break;
12380 case tune_params::AUTOPREFETCHER_WEAK:
12381 queue_depth = 0;
12382 break;
12383 case tune_params::AUTOPREFETCHER_STRONG:
12384 queue_depth = max_insn_queue_index + 1;
12385 break;
12386 default:
12387 gcc_unreachable ();
12388 }
12389
12390 /* We don't mind passing in global_options_set here as we don't use
12391 the *options_set structs anyway. */
12392 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12393 queue_depth,
12394 opts->x_param_values,
12395 global_options_set.x_param_values);
12396
12397 /* Set up parameters to be used in prefetching algorithm. Do not
12398 override the defaults unless we are tuning for a core we have
12399 researched values for. */
12400 if (aarch64_tune_params.prefetch->num_slots > 0)
12401 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12402 aarch64_tune_params.prefetch->num_slots,
12403 opts->x_param_values,
12404 global_options_set.x_param_values);
12405 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12406 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12407 aarch64_tune_params.prefetch->l1_cache_size,
12408 opts->x_param_values,
12409 global_options_set.x_param_values);
12410 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12411 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12412 aarch64_tune_params.prefetch->l1_cache_line_size,
12413 opts->x_param_values,
12414 global_options_set.x_param_values);
12415 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12416 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12417 aarch64_tune_params.prefetch->l2_cache_size,
12418 opts->x_param_values,
12419 global_options_set.x_param_values);
12420 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12421 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12422 0,
12423 opts->x_param_values,
12424 global_options_set.x_param_values);
12425 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12426 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12427 aarch64_tune_params.prefetch->minimum_stride,
12428 opts->x_param_values,
12429 global_options_set.x_param_values);
12430
12431 /* Use the alternative scheduling-pressure algorithm by default. */
12432 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12433 opts->x_param_values,
12434 global_options_set.x_param_values);
12435
12436 /* If the user hasn't changed it via configure then set the default to 64 KB
12437 for the backend. */
12438 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12439 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12440 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12441 opts->x_param_values,
12442 global_options_set.x_param_values);
12443
12444 /* Validate the guard size. */
12445 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12446
12447 /* Enforce that interval is the same size as size so the mid-end does the
12448 right thing. */
12449 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12450 guard_size,
12451 opts->x_param_values,
12452 global_options_set.x_param_values);
12453
12454 /* The maybe_set calls won't update the value if the user has explicitly
12455 set one, which means we need to validate that the probing interval and
12456 guard size are equal. */
12457 int probe_interval
12458 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12459 if (guard_size != probe_interval)
12460 error ("stack clash guard size %<%d%> must be equal to probing interval "
12461 "%<%d%>", guard_size, probe_interval);
12462
12463 /* Enable software prefetching at the specified optimization level for
12464 CPUs that have prefetch tuning data, unless the user set
12465 -fprefetch-loop-arrays explicitly or we are optimizing for size. */
12466 if (opts->x_flag_prefetch_loop_arrays < 0
12467 && !opts->x_optimize_size
12468 && aarch64_tune_params.prefetch->default_opt_level >= 0
12469 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12470 opts->x_flag_prefetch_loop_arrays = 1;
12471
12472 if (opts->x_aarch64_arch_string == NULL)
12473 opts->x_aarch64_arch_string = selected_arch->name;
12474 if (opts->x_aarch64_cpu_string == NULL)
12475 opts->x_aarch64_cpu_string = selected_cpu->name;
12476 if (opts->x_aarch64_tune_string == NULL)
12477 opts->x_aarch64_tune_string = selected_tune->name;
12478
12479 aarch64_override_options_after_change_1 (opts);
12480 }
12481
12482 /* Print a hint with a suggestion for a core or architecture name that
12483 most closely resembles what the user passed in STR. ARCH is true if
12484 the user is asking for an architecture name. ARCH is false if the user
12485 is asking for a core name. */
12486
12487 static void
12488 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12489 {
12490 auto_vec<const char *> candidates;
12491 const struct processor *entry = arch ? all_architectures : all_cores;
12492 for (; entry->name != NULL; entry++)
12493 candidates.safe_push (entry->name);
12494
12495 #ifdef HAVE_LOCAL_CPU_DETECT
12496 /* Also add "native" as a possible value. */
12497 if (arch)
12498 candidates.safe_push ("native");
12499 #endif
12500
12501 char *s;
12502 const char *hint = candidates_list_and_hint (str, s, candidates);
12503 if (hint)
12504 inform (input_location, "valid arguments are: %s;"
12505 " did you mean %qs?", s, hint);
12506 else
12507 inform (input_location, "valid arguments are: %s", s);
12508
12509 XDELETEVEC (s);
12510 }
12511
12512 /* Print a hint with a suggestion for a core name that most closely resembles
12513 what the user passed in STR. */
12514
12515 inline static void
12516 aarch64_print_hint_for_core (const char *str)
12517 {
12518 aarch64_print_hint_for_core_or_arch (str, false);
12519 }
12520
12521 /* Print a hint with a suggestion for an architecture name that most closely
12522 resembles what the user passed in STR. */
12523
12524 inline static void
12525 aarch64_print_hint_for_arch (const char *str)
12526 {
12527 aarch64_print_hint_for_core_or_arch (str, true);
12528 }
12529
12530
12531 /* Print a hint with a suggestion for an extension name
12532 that most closely resembles what the user passed in STR. */
12533
12534 void
12535 aarch64_print_hint_for_extensions (const std::string &str)
12536 {
12537 auto_vec<const char *> candidates;
12538 aarch64_get_all_extension_candidates (&candidates);
12539 char *s;
12540 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12541 if (hint)
12542 inform (input_location, "valid arguments are: %s;"
12543 " did you mean %qs?", s, hint);
12544 else
12545 inform (input_location, "valid arguments are: %s", s);
12546
12547 XDELETEVEC (s);
12548 }
12549
12550 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12551 specified in STR and throw errors if appropriate. Put the results, if
12552 they are valid, in RES and ISA_FLAGS. Return whether the option is
12553 valid. */
12554
12555 static bool
12556 aarch64_validate_mcpu (const char *str, const struct processor **res,
12557 uint64_t *isa_flags)
12558 {
12559 std::string invalid_extension;
12560 enum aarch64_parse_opt_result parse_res
12561 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12562
12563 if (parse_res == AARCH64_PARSE_OK)
12564 return true;
12565
12566 switch (parse_res)
12567 {
12568 case AARCH64_PARSE_MISSING_ARG:
12569 error ("missing cpu name in %<-mcpu=%s%>", str);
12570 break;
12571 case AARCH64_PARSE_INVALID_ARG:
12572 error ("unknown value %qs for %<-mcpu%>", str);
12573 aarch64_print_hint_for_core (str);
12574 break;
12575 case AARCH64_PARSE_INVALID_FEATURE:
12576 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12577 invalid_extension.c_str (), str);
12578 aarch64_print_hint_for_extensions (invalid_extension);
12579 break;
12580 default:
12581 gcc_unreachable ();
12582 }
12583
12584 return false;
12585 }
12586
12587 /* Parses CONST_STR for the branch protection features specified in
12588 aarch64_branch_protect_types, and sets any global variables required.
12589 Returns the parsing result and assigns the last processed token from
12590 CONST_STR to LAST_STR so that it can be used for error reporting. */
12591
12592 static enum aarch64_parse_opt_result
12593 aarch64_parse_branch_protection (const char *const_str,
12594 char **last_str)
12595 {
12596 char *str_root = xstrdup (const_str);
12597 char* token_save = NULL;
12598 char *str = strtok_r (str_root, "+", &token_save);
12599 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12600 if (!str)
12601 res = AARCH64_PARSE_MISSING_ARG;
12602 else
12603 {
12604 char *next_str = strtok_r (NULL, "+", &token_save);
12605 /* Reset the branch protection features to their defaults. */
12606 aarch64_handle_no_branch_protection (NULL, NULL);
12607
12608 while (str && res == AARCH64_PARSE_OK)
12609 {
12610 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12611 bool found = false;
12612 /* Search for this type. */
12613 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12614 {
12615 if (strcmp (str, type->name) == 0)
12616 {
12617 found = true;
12618 res = type->handler (str, next_str);
12619 str = next_str;
12620 next_str = strtok_r (NULL, "+", &token_save);
12621 }
12622 else
12623 type++;
12624 }
12625 if (found && res == AARCH64_PARSE_OK)
12626 {
12627 bool found_subtype = true;
12628 /* Loop through each token until we find one that isn't a
12629 subtype. */
12630 while (found_subtype)
12631 {
12632 found_subtype = false;
12633 const aarch64_branch_protect_type *subtype = type->subtypes;
12634 /* Search for the subtype. */
12635 while (str && subtype && subtype->name && !found_subtype
12636 && res == AARCH64_PARSE_OK)
12637 {
12638 if (strcmp (str, subtype->name) == 0)
12639 {
12640 found_subtype = true;
12641 res = subtype->handler (str, next_str);
12642 str = next_str;
12643 next_str = strtok_r (NULL, "+", &token_save);
12644 }
12645 else
12646 subtype++;
12647 }
12648 }
12649 }
12650 else if (!found)
12651 res = AARCH64_PARSE_INVALID_ARG;
12652 }
12653 }
12654 /* Copy the last processed token into the argument to pass it back.
12655 Used by option and attribute validation to print the offending token. */
12656 if (last_str)
12657 {
12658 if (str) strcpy (*last_str, str);
12659 else *last_str = NULL;
12660 }
12661 if (res == AARCH64_PARSE_OK)
12662 {
12663 /* If needed, alloc the accepted string then copy in const_str.
12664 Used by override_option_after_change_1. */
12665 if (!accepted_branch_protection_string)
12666 accepted_branch_protection_string = (char *) xmalloc (
12667 BRANCH_PROTECT_STR_MAX
12668 + 1);
12669 strncpy (accepted_branch_protection_string, const_str,
12670 BRANCH_PROTECT_STR_MAX + 1);
12671 /* Forcibly null-terminate. */
12672 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12673 }
12674 return res;
12675 }
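/* Illustrative sketch (guarded out, not used by the parser above): shows how
a -mbranch-protection string such as "pac-ret+leaf+bti" is consumed.
"pac-ret" and "bti" are matched as top-level types and "leaf" as a subtype
of "pac-ret"; the feature names are taken from the documented
-mbranch-protection syntax and the buffer handling mirrors
aarch64_validate_mbranch_protection below. */
#if 0
static enum aarch64_parse_opt_result
aarch64_branch_protection_parse_example (void)
{
char buf[BRANCH_PROTECT_STR_MAX + 1];
char *last = buf;
/* Expected to return AARCH64_PARSE_OK and enable PAC-RET (including leaf
functions) together with BTI. */
return aarch64_parse_branch_protection ("pac-ret+leaf+bti", &last);
}
#endif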
12676
12677 static bool
12678 aarch64_validate_mbranch_protection (const char *const_str)
12679 {
12680 char *str = (char *) xmalloc (strlen (const_str) + 1);
12681 enum aarch64_parse_opt_result res =
12682 aarch64_parse_branch_protection (const_str, &str);
12683 if (res == AARCH64_PARSE_INVALID_ARG)
12684 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12685 else if (res == AARCH64_PARSE_MISSING_ARG)
12686 error ("missing argument for %<-mbranch-protection=%>");
12687 free (str);
12688 return res == AARCH64_PARSE_OK;
12689 }
12690
12691 /* Validate a command-line -march option. Parse the arch and extensions
12692 (if any) specified in STR and throw errors if appropriate. Put the
12693 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12694 option is valid. */
12695
12696 static bool
12697 aarch64_validate_march (const char *str, const struct processor **res,
12698 uint64_t *isa_flags)
12699 {
12700 std::string invalid_extension;
12701 enum aarch64_parse_opt_result parse_res
12702 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12703
12704 if (parse_res == AARCH64_PARSE_OK)
12705 return true;
12706
12707 switch (parse_res)
12708 {
12709 case AARCH64_PARSE_MISSING_ARG:
12710 error ("missing arch name in %<-march=%s%>", str);
12711 break;
12712 case AARCH64_PARSE_INVALID_ARG:
12713 error ("unknown value %qs for %<-march%>", str);
12714 aarch64_print_hint_for_arch (str);
12715 break;
12716 case AARCH64_PARSE_INVALID_FEATURE:
12717 error ("invalid feature modifier %qs in %<-march=%s%>",
12718 invalid_extension.c_str (), str);
12719 aarch64_print_hint_for_extensions (invalid_extension);
12720 break;
12721 default:
12722 gcc_unreachable ();
12723 }
12724
12725 return false;
12726 }
12727
12728 /* Validate a command-line -mtune option. Parse the cpu
12729 specified in STR and throw errors if appropriate. Put the
12730 result, if it is valid, in RES. Return whether the option is
12731 valid. */
12732
12733 static bool
12734 aarch64_validate_mtune (const char *str, const struct processor **res)
12735 {
12736 enum aarch64_parse_opt_result parse_res
12737 = aarch64_parse_tune (str, res);
12738
12739 if (parse_res == AARCH64_PARSE_OK)
12740 return true;
12741
12742 switch (parse_res)
12743 {
12744 case AARCH64_PARSE_MISSING_ARG:
12745 error ("missing cpu name in %<-mtune=%s%>", str);
12746 break;
12747 case AARCH64_PARSE_INVALID_ARG:
12748 error ("unknown value %qs for %<-mtune%>", str);
12749 aarch64_print_hint_for_core (str);
12750 break;
12751 default:
12752 gcc_unreachable ();
12753 }
12754 return false;
12755 }
12756
12757 /* Return the CPU corresponding to the enum CPU.
12758 If it doesn't specify a valid cpu, return the default. */
12759
12760 static const struct processor *
12761 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12762 {
12763 if (cpu != aarch64_none)
12764 return &all_cores[cpu];
12765
12766 /* The & 0x3f is to extract the bottom 6 bits that encode the
12767 default cpu as selected by the --with-cpu GCC configure option
12768 in config.gcc.
12769 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12770 flags mechanism should be reworked to make it more sane. */
12771 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12772 }
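/* Illustrative sketch (guarded out): TARGET_CPU_DEFAULT is assumed to pack
the configure-time core in its low 6 bits and the matching ISA flags in
the bits above them, which is why the function above masks with 0x3f and
aarch64_override_options shifts right by 6. A minimal split under that
assumption: */
#if 0
static void
aarch64_cpu_default_split_example (void)
{
uint64_t packed = TARGET_CPU_DEFAULT;
unsigned int core_index = packed & 0x3f; /* Index into all_cores. */
uint64_t default_isa_flags = packed >> 6; /* Initial aarch64_isa_flags. */
(void) core_index;
(void) default_isa_flags;
}
#endif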
12773
12774 /* Return the architecture corresponding to the enum ARCH.
12775 If it doesn't specify a valid architecture, return the default. */
12776
12777 static const struct processor *
12778 aarch64_get_arch (enum aarch64_arch arch)
12779 {
12780 if (arch != aarch64_no_arch)
12781 return &all_architectures[arch];
12782
12783 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12784
12785 return &all_architectures[cpu->arch];
12786 }
12787
12788 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12789
12790 static poly_uint16
12791 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12792 {
12793 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12794 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12795 deciding which .md file patterns to use and when deciding whether
12796 something is a legitimate address or constant. */
12797 if (value == SVE_SCALABLE || value == SVE_128)
12798 return poly_uint16 (2, 2);
12799 else
12800 return (int) value / 64;
12801 }
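/* Worked example (illustrative): -msve-vector-bits=256 yields 256 / 64 = 4
64-bit granules per vector and -msve-vector-bits=512 yields 8, while
SVE_SCALABLE and SVE_128 both map to the runtime-variable poly_uint16
(2, 2). The SVE_256/SVE_512 enumerator names are assumed from
aarch64-opts.h. */
#if 0
static void
aarch64_sve_vg_example (void)
{
poly_uint16 vg256 = aarch64_convert_sve_vector_bits (SVE_256);
poly_uint16 vg512 = aarch64_convert_sve_vector_bits (SVE_512);
gcc_assert (known_eq (vg256, 4) && known_eq (vg512, 8));
}
#endif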
12802
12803 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12804 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12805 tuning structs. In particular it must set selected_tune and
12806 aarch64_isa_flags that define the available ISA features and tuning
12807 decisions. It must also set selected_arch as this will be used to
12808 output the .arch asm tags for each function. */
12809
12810 static void
12811 aarch64_override_options (void)
12812 {
12813 uint64_t cpu_isa = 0;
12814 uint64_t arch_isa = 0;
12815 aarch64_isa_flags = 0;
12816
12817 bool valid_cpu = true;
12818 bool valid_tune = true;
12819 bool valid_arch = true;
12820
12821 selected_cpu = NULL;
12822 selected_arch = NULL;
12823 selected_tune = NULL;
12824
12825 if (aarch64_branch_protection_string)
12826 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12827
12828 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12829 If either of -march or -mtune is given, they override their
12830 respective component of -mcpu. */
12831 if (aarch64_cpu_string)
12832 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12833 &cpu_isa);
12834
12835 if (aarch64_arch_string)
12836 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12837 &arch_isa);
12838
12839 if (aarch64_tune_string)
12840 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12841
12842 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12843 SUBTARGET_OVERRIDE_OPTIONS;
12844 #endif
12845
12846 /* If the user did not specify a processor, choose the default
12847 one for them. This will be the CPU set during configuration using
12848 --with-cpu, otherwise it is "generic". */
12849 if (!selected_cpu)
12850 {
12851 if (selected_arch)
12852 {
12853 selected_cpu = &all_cores[selected_arch->ident];
12854 aarch64_isa_flags = arch_isa;
12855 explicit_arch = selected_arch->arch;
12856 }
12857 else
12858 {
12859 /* Get default configure-time CPU. */
12860 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12861 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12862 }
12863
12864 if (selected_tune)
12865 explicit_tune_core = selected_tune->ident;
12866 }
12867 /* If both -mcpu and -march are specified, check that they are architecturally
12868 compatible; warn if they're not and prefer the -march ISA flags. */
12869 else if (selected_arch)
12870 {
12871 if (selected_arch->arch != selected_cpu->arch)
12872 {
12873 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12874 all_architectures[selected_cpu->arch].name,
12875 selected_arch->name);
12876 }
12877 aarch64_isa_flags = arch_isa;
12878 explicit_arch = selected_arch->arch;
12879 explicit_tune_core = selected_tune ? selected_tune->ident
12880 : selected_cpu->ident;
12881 }
12882 else
12883 {
12884 /* -mcpu but no -march. */
12885 aarch64_isa_flags = cpu_isa;
12886 explicit_tune_core = selected_tune ? selected_tune->ident
12887 : selected_cpu->ident;
12888 gcc_assert (selected_cpu);
12889 selected_arch = &all_architectures[selected_cpu->arch];
12890 explicit_arch = selected_arch->arch;
12891 }
12892
12893 /* Set the arch as well, as we will need it when outputting
12894 the .arch directive in assembly. */
12895 if (!selected_arch)
12896 {
12897 gcc_assert (selected_cpu);
12898 selected_arch = &all_architectures[selected_cpu->arch];
12899 }
12900
12901 if (!selected_tune)
12902 selected_tune = selected_cpu;
12903
12904 if (aarch64_enable_bti == 2)
12905 {
12906 #ifdef TARGET_ENABLE_BTI
12907 aarch64_enable_bti = 1;
12908 #else
12909 aarch64_enable_bti = 0;
12910 #endif
12911 }
12912
12913 /* Return address signing is currently not supported for ILP32 targets. For
12914 LP64 targets use the configured option in the absence of a command-line
12915 option for -mbranch-protection. */
12916 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12917 {
12918 #ifdef TARGET_ENABLE_PAC_RET
12919 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12920 #else
12921 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12922 #endif
12923 }
12924
12925 #ifndef HAVE_AS_MABI_OPTION
12926 /* The compiler may have been configured with 2.23.* binutils, which does
12927 not have support for ILP32. */
12928 if (TARGET_ILP32)
12929 error ("assembler does not support %<-mabi=ilp32%>");
12930 #endif
12931
12932 /* Convert -msve-vector-bits to a VG count. */
12933 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12934
12935 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12936 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12937
12938 /* Make sure we properly set up the explicit options. */
12939 if ((aarch64_cpu_string && valid_cpu)
12940 || (aarch64_tune_string && valid_tune))
12941 gcc_assert (explicit_tune_core != aarch64_none);
12942
12943 if ((aarch64_cpu_string && valid_cpu)
12944 || (aarch64_arch_string && valid_arch))
12945 gcc_assert (explicit_arch != aarch64_no_arch);
12946
12947 /* The pass to insert speculation tracking runs before
12948 shrink-wrapping and the latter does not know how to update the
12949 tracking status. So disable it in this case. */
12950 if (aarch64_track_speculation)
12951 flag_shrink_wrap = 0;
12952
12953 aarch64_override_options_internal (&global_options);
12954
12955 /* Save these options as the default ones in case we push and pop them later
12956 while processing functions with potential target attributes. */
12957 target_option_default_node = target_option_current_node
12958 = build_target_option_node (&global_options);
12959 }
12960
12961 /* Implement targetm.override_options_after_change. */
12962
12963 static void
12964 aarch64_override_options_after_change (void)
12965 {
12966 aarch64_override_options_after_change_1 (&global_options);
12967 }
12968
12969 static struct machine_function *
12970 aarch64_init_machine_status (void)
12971 {
12972 struct machine_function *machine;
12973 machine = ggc_cleared_alloc<machine_function> ();
12974 return machine;
12975 }
12976
12977 void
12978 aarch64_init_expanders (void)
12979 {
12980 init_machine_status = aarch64_init_machine_status;
12981 }
12982
12983 /* Resolve aarch64_cmodel from the requested code model and PIC options. */
12984 static void
12985 initialize_aarch64_code_model (struct gcc_options *opts)
12986 {
12987 if (opts->x_flag_pic)
12988 {
12989 switch (opts->x_aarch64_cmodel_var)
12990 {
12991 case AARCH64_CMODEL_TINY:
12992 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12993 break;
12994 case AARCH64_CMODEL_SMALL:
12995 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12996 aarch64_cmodel = (flag_pic == 2
12997 ? AARCH64_CMODEL_SMALL_PIC
12998 : AARCH64_CMODEL_SMALL_SPIC);
12999 #else
13000 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13001 #endif
13002 break;
13003 case AARCH64_CMODEL_LARGE:
13004 sorry ("code model %qs with %<-f%s%>", "large",
13005 opts->x_flag_pic > 1 ? "PIC" : "pic");
13006 break;
13007 default:
13008 gcc_unreachable ();
13009 }
13010 }
13011 else
13012 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13013 }
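/* Worked example (illustrative): with the default small code model, -fPIC
(flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC, while -fpic selects
AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small GOT
relocations (HAVE_AS_SMALL_PIC_RELOCS); without -fpic/-fPIC the requested
code model is used unchanged. */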
13014
13015 /* Implement TARGET_OPTION_SAVE. */
13016
13017 static void
13018 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13019 {
13020 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13021 ptr->x_aarch64_branch_protection_string
13022 = opts->x_aarch64_branch_protection_string;
13023 }
13024
13025 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13026 using the information saved in PTR. */
13027
13028 static void
13029 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13030 {
13031 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13032 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13033 opts->x_explicit_arch = ptr->x_explicit_arch;
13034 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13035 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13036 opts->x_aarch64_branch_protection_string
13037 = ptr->x_aarch64_branch_protection_string;
13038 if (opts->x_aarch64_branch_protection_string)
13039 {
13040 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13041 NULL);
13042 }
13043
13044 aarch64_override_options_internal (opts);
13045 }
13046
13047 /* Implement TARGET_OPTION_PRINT. */
13048
13049 static void
13050 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13051 {
13052 const struct processor *cpu
13053 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13054 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13055 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13056 std::string extension
13057 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13058
13059 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13060 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13061 arch->name, extension.c_str ());
13062 }
13063
13064 static GTY(()) tree aarch64_previous_fndecl;
13065
13066 void
13067 aarch64_reset_previous_fndecl (void)
13068 {
13069 aarch64_previous_fndecl = NULL;
13070 }
13071
13072 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13073 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13074 make sure optab availability predicates are recomputed when necessary. */
13075
13076 void
13077 aarch64_save_restore_target_globals (tree new_tree)
13078 {
13079 if (TREE_TARGET_GLOBALS (new_tree))
13080 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13081 else if (new_tree == target_option_default_node)
13082 restore_target_globals (&default_target_globals);
13083 else
13084 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13085 }
13086
13087 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13088 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13089 of the function, if it exists. This function may be called multiple
13090 times on a single function, so use aarch64_previous_fndecl to avoid
13091 setting up identical state. */
13092
13093 static void
13094 aarch64_set_current_function (tree fndecl)
13095 {
13096 if (!fndecl || fndecl == aarch64_previous_fndecl)
13097 return;
13098
13099 tree old_tree = (aarch64_previous_fndecl
13100 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13101 : NULL_TREE);
13102
13103 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13104
13105 /* If the current function has no attributes but the previous one did,
13106 use the default node. */
13107 if (!new_tree && old_tree)
13108 new_tree = target_option_default_node;
13109
13110 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13111 the default have been handled by aarch64_save_restore_target_globals from
13112 aarch64_pragma_target_parse. */
13113 if (old_tree == new_tree)
13114 return;
13115
13116 aarch64_previous_fndecl = fndecl;
13117
13118 /* First set the target options. */
13119 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13120
13121 aarch64_save_restore_target_globals (new_tree);
13122 }
13123
13124 /* Enum describing the various ways we can handle attributes.
13125 In many cases we can reuse the generic option handling machinery. */
13126
13127 enum aarch64_attr_opt_type
13128 {
13129 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13130 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13131 aarch64_attr_enum, /* Attribute sets an enum variable. */
13132 aarch64_attr_custom /* Attribute requires a custom handling function. */
13133 };
13134
13135 /* All the information needed to handle a target attribute.
13136 NAME is the name of the attribute.
13137 ATTR_TYPE specifies the type of behavior of the attribute as described
13138 in the definition of enum aarch64_attr_opt_type.
13139 ALLOW_NEG is true if the attribute supports a "no-" form.
13140 HANDLER is the function that takes the attribute string as an argument.
13141 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13142 OPT_NUM is the enum specifying the option that the attribute modifies.
13143 This is needed for attributes that mirror the behavior of a command-line
13144 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13145 aarch64_attr_enum. */
13146
13147 struct aarch64_attribute_info
13148 {
13149 const char *name;
13150 enum aarch64_attr_opt_type attr_type;
13151 bool allow_neg;
13152 bool (*handler) (const char *);
13153 enum opt_code opt_num;
13154 };
13155
13156 /* Handle the ARCH_STR argument to the arch= target attribute. */
13157
13158 static bool
13159 aarch64_handle_attr_arch (const char *str)
13160 {
13161 const struct processor *tmp_arch = NULL;
13162 std::string invalid_extension;
13163 enum aarch64_parse_opt_result parse_res
13164 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13165
13166 if (parse_res == AARCH64_PARSE_OK)
13167 {
13168 gcc_assert (tmp_arch);
13169 selected_arch = tmp_arch;
13170 explicit_arch = selected_arch->arch;
13171 return true;
13172 }
13173
13174 switch (parse_res)
13175 {
13176 case AARCH64_PARSE_MISSING_ARG:
13177 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13178 break;
13179 case AARCH64_PARSE_INVALID_ARG:
13180 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13181 aarch64_print_hint_for_arch (str);
13182 break;
13183 case AARCH64_PARSE_INVALID_FEATURE:
13184 error ("invalid feature modifier %s of value (\"%s\") in "
13185 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13186 aarch64_print_hint_for_extensions (invalid_extension);
13187 break;
13188 default:
13189 gcc_unreachable ();
13190 }
13191
13192 return false;
13193 }
13194
13195 /* Handle the argument CPU_STR to the cpu= target attribute. */
13196
13197 static bool
13198 aarch64_handle_attr_cpu (const char *str)
13199 {
13200 const struct processor *tmp_cpu = NULL;
13201 std::string invalid_extension;
13202 enum aarch64_parse_opt_result parse_res
13203 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13204
13205 if (parse_res == AARCH64_PARSE_OK)
13206 {
13207 gcc_assert (tmp_cpu);
13208 selected_tune = tmp_cpu;
13209 explicit_tune_core = selected_tune->ident;
13210
13211 selected_arch = &all_architectures[tmp_cpu->arch];
13212 explicit_arch = selected_arch->arch;
13213 return true;
13214 }
13215
13216 switch (parse_res)
13217 {
13218 case AARCH64_PARSE_MISSING_ARG:
13219 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13220 break;
13221 case AARCH64_PARSE_INVALID_ARG:
13222 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13223 aarch64_print_hint_for_core (str);
13224 break;
13225 case AARCH64_PARSE_INVALID_FEATURE:
13226 error ("invalid feature modifier %s of value (\"%s\") in "
13227 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13228 aarch64_print_hint_for_extensions (invalid_extension);
13229 break;
13230 default:
13231 gcc_unreachable ();
13232 }
13233
13234 return false;
13235 }
13236
13237 /* Handle the argument STR to the branch-protection= attribute. */
13238
13239 static bool
13240 aarch64_handle_attr_branch_protection (const char* str)
13241 {
13242 char *err_str = (char *) xmalloc (strlen (str) + 1);
13243 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13244 &err_str);
13245 bool success = false;
13246 switch (res)
13247 {
13248 case AARCH64_PARSE_MISSING_ARG:
13249 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13250 " attribute");
13251 break;
13252 case AARCH64_PARSE_INVALID_ARG:
13253 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13254 "=\")%> pragma or attribute", err_str);
13255 break;
13256 case AARCH64_PARSE_OK:
13257 success = true;
13258 /* Fall through. */
13259 case AARCH64_PARSE_INVALID_FEATURE:
13260 break;
13261 default:
13262 gcc_unreachable ();
13263 }
13264 free (err_str);
13265 return success;
13266 }
13267
13268 /* Handle the argument STR to the tune= target attribute. */
13269
13270 static bool
13271 aarch64_handle_attr_tune (const char *str)
13272 {
13273 const struct processor *tmp_tune = NULL;
13274 enum aarch64_parse_opt_result parse_res
13275 = aarch64_parse_tune (str, &tmp_tune);
13276
13277 if (parse_res == AARCH64_PARSE_OK)
13278 {
13279 gcc_assert (tmp_tune);
13280 selected_tune = tmp_tune;
13281 explicit_tune_core = selected_tune->ident;
13282 return true;
13283 }
13284
13285 switch (parse_res)
13286 {
13287 case AARCH64_PARSE_INVALID_ARG:
13288 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13289 aarch64_print_hint_for_core (str);
13290 break;
13291 default:
13292 gcc_unreachable ();
13293 }
13294
13295 return false;
13296 }
13297
13298 /* Parse an architecture extensions target attribute string specified in STR.
13299 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13300 if successful. Update aarch64_isa_flags to reflect the ISA features
13301 modified. */
13302
13303 static bool
13304 aarch64_handle_attr_isa_flags (char *str)
13305 {
13306 enum aarch64_parse_opt_result parse_res;
13307 uint64_t isa_flags = aarch64_isa_flags;
13308
13309 /* We allow "+nothing" in the beginning to clear out all architectural
13310 features if the user wants to handpick specific features. */
13311 if (strncmp ("+nothing", str, 8) == 0)
13312 {
13313 isa_flags = 0;
13314 str += 8;
13315 }
13316
13317 std::string invalid_extension;
13318 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13319
13320 if (parse_res == AARCH64_PARSE_OK)
13321 {
13322 aarch64_isa_flags = isa_flags;
13323 return true;
13324 }
13325
13326 switch (parse_res)
13327 {
13328 case AARCH64_PARSE_MISSING_ARG:
13329 error ("missing value in %<target()%> pragma or attribute");
13330 break;
13331
13332 case AARCH64_PARSE_INVALID_FEATURE:
13333 error ("invalid feature modifier %s of value (\"%s\") in "
13334 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13335 break;
13336
13337 default:
13338 gcc_unreachable ();
13339 }
13340
13341 return false;
13342 }
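/* Illustrative sketch (guarded out): an attribute string of "+nothing+simd"
first clears the ISA flags via the "+nothing" prefix and then re-enables
just SIMD (plus anything it implies) through aarch64_parse_extension.
The AARCH64_ISA_SIMD test and the literal string are assumptions made for
illustration only. */
#if 0
static void
aarch64_attr_isa_flags_example (void)
{
char buf[] = "+nothing+simd";
if (aarch64_handle_attr_isa_flags (buf))
gcc_assert (AARCH64_ISA_SIMD);
}
#endif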
13343
13344 /* The target attributes that we support. On top of these we also support just
13345 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13346 handled explicitly in aarch64_process_one_target_attr. */
13347
13348 static const struct aarch64_attribute_info aarch64_attributes[] =
13349 {
13350 { "general-regs-only", aarch64_attr_mask, false, NULL,
13351 OPT_mgeneral_regs_only },
13352 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13353 OPT_mfix_cortex_a53_835769 },
13354 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13355 OPT_mfix_cortex_a53_843419 },
13356 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13357 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13358 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13359 OPT_momit_leaf_frame_pointer },
13360 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13361 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13362 OPT_march_ },
13363 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13364 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13365 OPT_mtune_ },
13366 { "branch-protection", aarch64_attr_custom, false,
13367 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13368 { "sign-return-address", aarch64_attr_enum, false, NULL,
13369 OPT_msign_return_address_ },
13370 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13371 };
13372
13373 /* Parse ARG_STR which contains the definition of one target attribute.
13374 Show appropriate errors if any or return true if the attribute is valid. */
13375
13376 static bool
13377 aarch64_process_one_target_attr (char *arg_str)
13378 {
13379 bool invert = false;
13380
13381 size_t len = strlen (arg_str);
13382
13383 if (len == 0)
13384 {
13385 error ("malformed %<target()%> pragma or attribute");
13386 return false;
13387 }
13388
13389 char *str_to_check = (char *) alloca (len + 1);
13390 strcpy (str_to_check, arg_str);
13391
13392 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13393 It is easier to detect and handle it explicitly here rather than going
13394 through the machinery for the rest of the target attributes in this
13395 function. */
13396 if (*str_to_check == '+')
13397 return aarch64_handle_attr_isa_flags (str_to_check);
13398
13399 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13400 {
13401 invert = true;
13402 str_to_check += 3;
13403 }
13404 char *arg = strchr (str_to_check, '=');
13405
13406 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13407 and point ARG to "foo". */
13408 if (arg)
13409 {
13410 *arg = '\0';
13411 arg++;
13412 }
13413 const struct aarch64_attribute_info *p_attr;
13414 bool found = false;
13415 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13416 {
13417 /* If the names don't match up, or the user has given an argument
13418 to an attribute that doesn't accept one, or didn't give an argument
13419 to an attribute that expects one, fail to match. */
13420 if (strcmp (str_to_check, p_attr->name) != 0)
13421 continue;
13422
13423 found = true;
13424 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13425 || p_attr->attr_type == aarch64_attr_enum;
13426
13427 if (attr_need_arg_p ^ (arg != NULL))
13428 {
13429 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13430 return false;
13431 }
13432
13433 /* If the name matches but the attribute does not allow "no-" versions
13434 then we can't match. */
13435 if (invert && !p_attr->allow_neg)
13436 {
13437 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13438 return false;
13439 }
13440
13441 switch (p_attr->attr_type)
13442 {
13443 /* Has a custom handler registered.
13444 For example, cpu=, arch=, tune=. */
13445 case aarch64_attr_custom:
13446 gcc_assert (p_attr->handler);
13447 if (!p_attr->handler (arg))
13448 return false;
13449 break;
13450
13451 /* Either set or unset a boolean option. */
13452 case aarch64_attr_bool:
13453 {
13454 struct cl_decoded_option decoded;
13455
13456 generate_option (p_attr->opt_num, NULL, !invert,
13457 CL_TARGET, &decoded);
13458 aarch64_handle_option (&global_options, &global_options_set,
13459 &decoded, input_location);
13460 break;
13461 }
13462 /* Set or unset a bit in the target_flags. aarch64_handle_option
13463 should know what mask to apply given the option number. */
13464 case aarch64_attr_mask:
13465 {
13466 struct cl_decoded_option decoded;
13467 /* We only need to specify the option number.
13468 aarch64_handle_option will know which mask to apply. */
13469 decoded.opt_index = p_attr->opt_num;
13470 decoded.value = !invert;
13471 aarch64_handle_option (&global_options, &global_options_set,
13472 &decoded, input_location);
13473 break;
13474 }
13475 /* Use the option setting machinery to set an option to an enum. */
13476 case aarch64_attr_enum:
13477 {
13478 gcc_assert (arg);
13479 bool valid;
13480 int value;
13481 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13482 &value, CL_TARGET);
13483 if (valid)
13484 {
13485 set_option (&global_options, NULL, p_attr->opt_num, value,
13486 NULL, DK_UNSPECIFIED, input_location,
13487 global_dc);
13488 }
13489 else
13490 {
13491 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13492 }
13493 break;
13494 }
13495 default:
13496 gcc_unreachable ();
13497 }
13498 }
13499
13500 /* If we reached here we either have found an attribute and validated
13501 it or didn't match any. If we matched an attribute but its arguments
13502 were malformed we will have returned false already. */
13503 return found;
13504 }
13505
13506 /* Count how many times the character C appears in
13507 NULL-terminated string STR. */
13508
13509 static unsigned int
13510 num_occurrences_in_str (char c, char *str)
13511 {
13512 unsigned int res = 0;
13513 while (*str != '\0')
13514 {
13515 if (*str == c)
13516 res++;
13517
13518 str++;
13519 }
13520
13521 return res;
13522 }
13523
13524 /* Parse the tree in ARGS that contains the target attribute information
13525 and update the global target options space. */
13526
13527 bool
13528 aarch64_process_target_attr (tree args)
13529 {
13530 if (TREE_CODE (args) == TREE_LIST)
13531 {
13532 do
13533 {
13534 tree head = TREE_VALUE (args);
13535 if (head)
13536 {
13537 if (!aarch64_process_target_attr (head))
13538 return false;
13539 }
13540 args = TREE_CHAIN (args);
13541 } while (args);
13542
13543 return true;
13544 }
13545
13546 if (TREE_CODE (args) != STRING_CST)
13547 {
13548 error ("attribute %<target%> argument not a string");
13549 return false;
13550 }
13551
13552 size_t len = strlen (TREE_STRING_POINTER (args));
13553 char *str_to_check = (char *) alloca (len + 1);
13554 strcpy (str_to_check, TREE_STRING_POINTER (args));
13555
13556 if (len == 0)
13557 {
13558 error ("malformed %<target()%> pragma or attribute");
13559 return false;
13560 }
13561
13562 /* Used to catch empty strings between commas, i.e.
13563 attribute ((target ("attr1,,attr2"))). */
13564 unsigned int num_commas = num_occurrences_in_str (',', str_to_check);
13565
13566 /* Handle multiple target attributes separated by ','. */
13567 char *token = strtok_r (str_to_check, ",", &str_to_check);
13568
13569 unsigned int num_attrs = 0;
13570 while (token)
13571 {
13572 num_attrs++;
13573 if (!aarch64_process_one_target_attr (token))
13574 {
13575 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13576 return false;
13577 }
13578
13579 token = strtok_r (NULL, ",", &str_to_check);
13580 }
13581
13582 if (num_attrs != num_commas + 1)
13583 {
13584 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13585 return false;
13586 }
13587
13588 return true;
13589 }
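/* Illustrative usage (guarded out): a declaration like the sketch below
reaches aarch64_process_target_attr with the STRING_CST
"arch=armv8.2-a+crc,tune=cortex-a75", which is split on ',' into two
attributes; the first is then handled by aarch64_handle_attr_arch and the
second by aarch64_handle_attr_tune. The particular arch, extension and
core names are illustrative assumptions. */
#if 0
__attribute__ ((target ("arch=armv8.2-a+crc,tune=cortex-a75")))
static int
aarch64_target_attr_usage_example (int x)
{
return x + 1;
}
#endif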
13590
13591 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13592 process attribute ((target ("..."))). */
13593
13594 static bool
13595 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13596 {
13597 struct cl_target_option cur_target;
13598 bool ret;
13599 tree old_optimize;
13600 tree new_target, new_optimize;
13601 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13602
13603 /* If what we're processing is the current pragma string then the
13604 target option node is already stored in target_option_current_node
13605 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13606 having to re-parse the string. This is especially useful to keep
13607 arm_neon.h compile times down since that header contains a lot
13608 of intrinsics enclosed in pragmas. */
13609 if (!existing_target && args == current_target_pragma)
13610 {
13611 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13612 return true;
13613 }
13614 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13615
13616 old_optimize = build_optimization_node (&global_options);
13618
13619 /* If the function changed the optimization levels as well as setting
13620 target options, start with the optimizations specified. */
13621 if (func_optimize && func_optimize != old_optimize)
13622 cl_optimization_restore (&global_options,
13623 TREE_OPTIMIZATION (func_optimize));
13624
13625 /* Save the current target options to restore at the end. */
13626 cl_target_option_save (&cur_target, &global_options);
13627
13628 /* If fndecl already has some target attributes applied to it, unpack
13629 them so that we add this attribute on top of them, rather than
13630 overwriting them. */
13631 if (existing_target)
13632 {
13633 struct cl_target_option *existing_options
13634 = TREE_TARGET_OPTION (existing_target);
13635
13636 if (existing_options)
13637 cl_target_option_restore (&global_options, existing_options);
13638 }
13639 else
13640 cl_target_option_restore (&global_options,
13641 TREE_TARGET_OPTION (target_option_current_node));
13642
13643 ret = aarch64_process_target_attr (args);
13644
13645 /* Set up any additional state. */
13646 if (ret)
13647 {
13648 aarch64_override_options_internal (&global_options);
13649 /* Initialize SIMD builtins if we haven't already.
13650 Set current_target_pragma to NULL for the duration so that
13651 the builtin initialization code doesn't try to tag the functions
13652 being built with the attributes specified by any current pragma, thus
13653 going into an infinite recursion. */
13654 if (TARGET_SIMD)
13655 {
13656 tree saved_current_target_pragma = current_target_pragma;
13657 current_target_pragma = NULL;
13658 aarch64_init_simd_builtins ();
13659 current_target_pragma = saved_current_target_pragma;
13660 }
13661 new_target = build_target_option_node (&global_options);
13662 }
13663 else
13664 new_target = NULL;
13665
13666 new_optimize = build_optimization_node (&global_options);
13667
13668 if (fndecl && ret)
13669 {
13670 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13671
13672 if (old_optimize != new_optimize)
13673 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13674 }
13675
13676 cl_target_option_restore (&global_options, &cur_target);
13677
13678 if (old_optimize != new_optimize)
13679 cl_optimization_restore (&global_options,
13680 TREE_OPTIMIZATION (old_optimize));
13681 return ret;
13682 }
13683
13684 /* Helper for aarch64_can_inline_p. CALLER and CALLEE are tri-bool options
13685 (yes, no, don't care), DONT_CARE is the "don't care" value and DEF the
13686 default. Return true if inlining is allowed as far as this option goes. */
13687
13688 static bool
13689 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13690 int dont_care, int def)
13691 {
13692 /* If the callee doesn't care, always allow inlining. */
13693 if (callee == dont_care)
13694 return true;
13695
13696 /* If the caller doesn't care, always allow inlining. */
13697 if (caller == dont_care)
13698 return true;
13699
13700 /* Otherwise, allow inlining if either the callee and caller values
13701 agree, or if the callee is using the default value. */
13702 return (callee == caller || callee == def);
13703 }
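/* Worked example (illustrative): with DONT_CARE == 2, a callee that left the
option unspecified (callee == 2) never blocks inlining, whereas a callee
that explicitly chose a non-default value different from the caller's
does. A minimal sketch of both cases: */
#if 0
static void
aarch64_tribool_example (void)
{
/* Callee did not specify anything: inlining allowed. */
gcc_assert (aarch64_tribools_ok_for_inlining_p (1, 2, 2, 0));
/* Caller and callee disagree and the callee's choice (0) is not the
default (1): inlining rejected. */
gcc_assert (!aarch64_tribools_ok_for_inlining_p (1, 0, 2, 1));
}
#endif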
13704
13705 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13706 to inline CALLEE into CALLER based on target-specific info.
13707 Make sure that the caller and callee have compatible architectural
13708 features. Then go through the other possible target attributes
13709 and see if they can block inlining. Try not to reject always_inline
13710 callees unless they are incompatible architecturally. */
13711
13712 static bool
13713 aarch64_can_inline_p (tree caller, tree callee)
13714 {
13715 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13716 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13717
13718 struct cl_target_option *caller_opts
13719 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13720 : target_option_default_node);
13721
13722 struct cl_target_option *callee_opts
13723 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13724 : target_option_default_node);
13725
13726 /* Callee's ISA flags should be a subset of the caller's. */
13727 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13728 != callee_opts->x_aarch64_isa_flags)
13729 return false;
13730
13731 /* Allow non-strict aligned functions inlining into strict
13732 aligned ones. */
13733 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13734 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13735 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13736 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13737 return false;
13738
13739 bool always_inline = lookup_attribute ("always_inline",
13740 DECL_ATTRIBUTES (callee));
13741
13742 /* If the architectural features match up and the callee is always_inline
13743 then the other attributes don't matter. */
13744 if (always_inline)
13745 return true;
13746
13747 if (caller_opts->x_aarch64_cmodel_var
13748 != callee_opts->x_aarch64_cmodel_var)
13749 return false;
13750
13751 if (caller_opts->x_aarch64_tls_dialect
13752 != callee_opts->x_aarch64_tls_dialect)
13753 return false;
13754
13755 /* Honour explicit requests to work around errata. */
13756 if (!aarch64_tribools_ok_for_inlining_p (
13757 caller_opts->x_aarch64_fix_a53_err835769,
13758 callee_opts->x_aarch64_fix_a53_err835769,
13759 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13760 return false;
13761
13762 if (!aarch64_tribools_ok_for_inlining_p (
13763 caller_opts->x_aarch64_fix_a53_err843419,
13764 callee_opts->x_aarch64_fix_a53_err843419,
13765 2, TARGET_FIX_ERR_A53_843419))
13766 return false;
13767
13768 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13769 caller and callee and they don't match up, reject inlining. */
13770 if (!aarch64_tribools_ok_for_inlining_p (
13771 caller_opts->x_flag_omit_leaf_frame_pointer,
13772 callee_opts->x_flag_omit_leaf_frame_pointer,
13773 2, 1))
13774 return false;
13775
13776 /* If the callee has specific tuning overrides, respect them. */
13777 if (callee_opts->x_aarch64_override_tune_string != NULL
13778 && caller_opts->x_aarch64_override_tune_string == NULL)
13779 return false;
13780
13781 /* If the user specified tuning override strings for the
13782 caller and callee and they don't match up, reject inlining.
13783 We just do a string compare here; we don't analyze the meaning
13784 of the string, as it would be too costly for little gain. */
13785 if (callee_opts->x_aarch64_override_tune_string
13786 && caller_opts->x_aarch64_override_tune_string
13787 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13788 caller_opts->x_aarch64_override_tune_string) != 0))
13789 return false;
13790
13791 return true;
13792 }
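/* Worked example (illustrative): the ISA subset test in aarch64_can_inline_p
accepts a +simd callee into a +simd+crypto caller because
(caller & callee) == callee, but rejects the reverse combination since the
caller would lack the crypto bit. The AARCH64_FL_* masks are assumed from
aarch64.h. */
#if 0
static void
aarch64_isa_subset_example (void)
{
uint64_t caller_flags = AARCH64_FL_SIMD | AARCH64_FL_CRYPTO;
uint64_t callee_flags = AARCH64_FL_SIMD;
/* +simd callee into +simd+crypto caller: subset condition holds. */
gcc_assert ((caller_flags & callee_flags) == callee_flags);
/* Reverse direction: the subset condition fails, so inlining is refused. */
gcc_assert ((callee_flags & caller_flags) != caller_flags);
}
#endif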
13793
13794 /* Return true if SYMBOL_REF X binds locally. */
13795
13796 static bool
13797 aarch64_symbol_binds_local_p (const_rtx x)
13798 {
13799 return (SYMBOL_REF_DECL (x)
13800 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13801 : SYMBOL_REF_LOCAL_P (x));
13802 }
13803
13804 /* Return true if SYMBOL_REF X is thread-local. */
13805 static bool
13806 aarch64_tls_symbol_p (rtx x)
13807 {
13808 if (! TARGET_HAVE_TLS)
13809 return false;
13810
13811 if (GET_CODE (x) != SYMBOL_REF)
13812 return false;
13813
13814 return SYMBOL_REF_TLS_MODEL (x) != 0;
13815 }
13816
13817 /* Classify a TLS symbol into one of the TLS kinds. */
13818 enum aarch64_symbol_type
13819 aarch64_classify_tls_symbol (rtx x)
13820 {
13821 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13822
13823 switch (tls_kind)
13824 {
13825 case TLS_MODEL_GLOBAL_DYNAMIC:
13826 case TLS_MODEL_LOCAL_DYNAMIC:
13827 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13828
13829 case TLS_MODEL_INITIAL_EXEC:
13830 switch (aarch64_cmodel)
13831 {
13832 case AARCH64_CMODEL_TINY:
13833 case AARCH64_CMODEL_TINY_PIC:
13834 return SYMBOL_TINY_TLSIE;
13835 default:
13836 return SYMBOL_SMALL_TLSIE;
13837 }
13838
13839 case TLS_MODEL_LOCAL_EXEC:
13840 if (aarch64_tls_size == 12)
13841 return SYMBOL_TLSLE12;
13842 else if (aarch64_tls_size == 24)
13843 return SYMBOL_TLSLE24;
13844 else if (aarch64_tls_size == 32)
13845 return SYMBOL_TLSLE32;
13846 else if (aarch64_tls_size == 48)
13847 return SYMBOL_TLSLE48;
13848 else
13849 gcc_unreachable ();
13850
13851 case TLS_MODEL_EMULATED:
13852 case TLS_MODEL_NONE:
13853 return SYMBOL_FORCE_TO_MEM;
13854
13855 default:
13856 gcc_unreachable ();
13857 }
13858 }
13859
13860 /* Return the correct method for accessing X + OFFSET, where X is either
13861 a SYMBOL_REF or LABEL_REF. */
13862
13863 enum aarch64_symbol_type
13864 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13865 {
13866 if (GET_CODE (x) == LABEL_REF)
13867 {
13868 switch (aarch64_cmodel)
13869 {
13870 case AARCH64_CMODEL_LARGE:
13871 return SYMBOL_FORCE_TO_MEM;
13872
13873 case AARCH64_CMODEL_TINY_PIC:
13874 case AARCH64_CMODEL_TINY:
13875 return SYMBOL_TINY_ABSOLUTE;
13876
13877 case AARCH64_CMODEL_SMALL_SPIC:
13878 case AARCH64_CMODEL_SMALL_PIC:
13879 case AARCH64_CMODEL_SMALL:
13880 return SYMBOL_SMALL_ABSOLUTE;
13881
13882 default:
13883 gcc_unreachable ();
13884 }
13885 }
13886
13887 if (GET_CODE (x) == SYMBOL_REF)
13888 {
13889 if (aarch64_tls_symbol_p (x))
13890 return aarch64_classify_tls_symbol (x);
13891
13892 switch (aarch64_cmodel)
13893 {
13894 case AARCH64_CMODEL_TINY:
13895 /* When we retrieve a symbol + offset address, we have to make sure
13896 the offset does not cause overflow of the final address. But
13897 we have no way of knowing the address of the symbol at compile
13898 time, so we can't accurately say whether the distance between the
13899 PC and symbol + offset is outside the addressable range of +/-1M
13900 in the TINY code model. So we rely on images not being greater
13901 than 1M and cap the offset at 1M; anything beyond that will have
13902 to be loaded using an alternative mechanism. Furthermore, if the
13903 symbol is a weak reference to something that isn't known to
13904 resolve to a symbol in this module, then force it to memory. */
13905 if ((SYMBOL_REF_WEAK (x)
13906 && !aarch64_symbol_binds_local_p (x))
13907 || !IN_RANGE (offset, -1048575, 1048575))
13908 return SYMBOL_FORCE_TO_MEM;
13909 return SYMBOL_TINY_ABSOLUTE;
13910
13911 case AARCH64_CMODEL_SMALL:
13912 /* Same reasoning as the tiny code model, but the offset cap here is
13913 4G. */
13914 if ((SYMBOL_REF_WEAK (x)
13915 && !aarch64_symbol_binds_local_p (x))
13916 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13917 HOST_WIDE_INT_C (4294967264)))
13918 return SYMBOL_FORCE_TO_MEM;
13919 return SYMBOL_SMALL_ABSOLUTE;
13920
13921 case AARCH64_CMODEL_TINY_PIC:
13922 if (!aarch64_symbol_binds_local_p (x))
13923 return SYMBOL_TINY_GOT;
13924 return SYMBOL_TINY_ABSOLUTE;
13925
13926 case AARCH64_CMODEL_SMALL_SPIC:
13927 case AARCH64_CMODEL_SMALL_PIC:
13928 if (!aarch64_symbol_binds_local_p (x))
13929 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13930 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13931 return SYMBOL_SMALL_ABSOLUTE;
13932
13933 case AARCH64_CMODEL_LARGE:
13934 /* This is alright even in PIC code as the constant
13935 pool reference is always PC relative and within
13936 the same translation unit. */
13937 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13938 return SYMBOL_SMALL_ABSOLUTE;
13939 else
13940 return SYMBOL_FORCE_TO_MEM;
13941
13942 default:
13943 gcc_unreachable ();
13944 }
13945 }
13946
13947 /* By default push everything into the constant pool. */
13948 return SYMBOL_FORCE_TO_MEM;
13949 }
13950
13951 bool
13952 aarch64_constant_address_p (rtx x)
13953 {
13954 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13955 }
13956
13957 bool
13958 aarch64_legitimate_pic_operand_p (rtx x)
13959 {
13960 if (GET_CODE (x) == SYMBOL_REF
13961 || (GET_CODE (x) == CONST
13962 && GET_CODE (XEXP (x, 0)) == PLUS
13963 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13964 return false;
13965
13966 return true;
13967 }
13968
13969 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13970 that should be rematerialized rather than spilled. */
13971
13972 static bool
13973 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13974 {
13975 /* Support CSE and rematerialization of common constants. */
13976 if (CONST_INT_P (x)
13977 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13978 || GET_CODE (x) == CONST_VECTOR)
13979 return true;
13980
13981 /* Do not allow vector struct mode constants for Advanced SIMD.
13982 We could support 0 and -1 easily, but they need support in
13983 aarch64-simd.md. */
13984 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13985 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13986 return false;
13987
13988 /* Only accept variable-length vector constants if they can be
13989 handled directly.
13990
13991 ??? It would be possible to handle rematerialization of other
13992 constants via secondary reloads. */
13993 if (vec_flags & VEC_ANY_SVE)
13994 return aarch64_simd_valid_immediate (x, NULL);
13995
13996 if (GET_CODE (x) == HIGH)
13997 x = XEXP (x, 0);
13998
13999 /* Accept polynomial constants that can be calculated by using the
14000 destination of a move as the sole temporary. Constants that
14001 require a second temporary cannot be rematerialized (they can't be
14002 forced to memory and also aren't legitimate constants). */
14003 poly_int64 offset;
14004 if (poly_int_rtx_p (x, &offset))
14005 return aarch64_offset_temporaries (false, offset) <= 1;
14006
14007 /* If an offset is being added to something else, we need to allow the
14008 base to be moved into the destination register, meaning that there
14009 are no free temporaries for the offset. */
14010 x = strip_offset (x, &offset);
14011 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14012 return false;
14013
14014 /* Do not allow const (plus (anchor_symbol, const_int)). */
14015 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14016 return false;
14017
14018 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14019 so spilling them is better than rematerialization. */
14020 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14021 return true;
14022
14023 /* Label references are always constant. */
14024 if (GET_CODE (x) == LABEL_REF)
14025 return true;
14026
14027 return false;
14028 }
14029
14030 rtx
14031 aarch64_load_tp (rtx target)
14032 {
14033 if (!target
14034 || GET_MODE (target) != Pmode
14035 || !register_operand (target, Pmode))
14036 target = gen_reg_rtx (Pmode);
14037
14038 /* Can return in any reg. */
14039 emit_insn (gen_aarch64_load_tp_hard (target));
14040 return target;
14041 }
14042
14043 /* On AAPCS systems, this is the "struct __va_list". */
14044 static GTY(()) tree va_list_type;
14045
14046 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14047 Return the type to use as __builtin_va_list.
14048
14049 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14050
14051 struct __va_list
14052 {
14053 void *__stack;
14054 void *__gr_top;
14055 void *__vr_top;
14056 int __gr_offs;
14057 int __vr_offs;
14058 }; */
14059
14060 static tree
14061 aarch64_build_builtin_va_list (void)
14062 {
14063 tree va_list_name;
14064 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14065
14066 /* Create the type. */
14067 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14068 /* Give it the required name. */
14069 va_list_name = build_decl (BUILTINS_LOCATION,
14070 TYPE_DECL,
14071 get_identifier ("__va_list"),
14072 va_list_type);
14073 DECL_ARTIFICIAL (va_list_name) = 1;
14074 TYPE_NAME (va_list_type) = va_list_name;
14075 TYPE_STUB_DECL (va_list_type) = va_list_name;
14076
14077 /* Create the fields. */
14078 f_stack = build_decl (BUILTINS_LOCATION,
14079 FIELD_DECL, get_identifier ("__stack"),
14080 ptr_type_node);
14081 f_grtop = build_decl (BUILTINS_LOCATION,
14082 FIELD_DECL, get_identifier ("__gr_top"),
14083 ptr_type_node);
14084 f_vrtop = build_decl (BUILTINS_LOCATION,
14085 FIELD_DECL, get_identifier ("__vr_top"),
14086 ptr_type_node);
14087 f_groff = build_decl (BUILTINS_LOCATION,
14088 FIELD_DECL, get_identifier ("__gr_offs"),
14089 integer_type_node);
14090 f_vroff = build_decl (BUILTINS_LOCATION,
14091 FIELD_DECL, get_identifier ("__vr_offs"),
14092 integer_type_node);
14093
14094 /* Tell the tree-stdarg pass about our internal offset fields.
14095 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14096 purposes, to identify whether the code is updating the va_list internal
14097 offset fields in an irregular way. */
14098 va_list_gpr_counter_field = f_groff;
14099 va_list_fpr_counter_field = f_vroff;
14100
14101 DECL_ARTIFICIAL (f_stack) = 1;
14102 DECL_ARTIFICIAL (f_grtop) = 1;
14103 DECL_ARTIFICIAL (f_vrtop) = 1;
14104 DECL_ARTIFICIAL (f_groff) = 1;
14105 DECL_ARTIFICIAL (f_vroff) = 1;
14106
14107 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14108 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14109 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14110 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14111 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14112
14113 TYPE_FIELDS (va_list_type) = f_stack;
14114 DECL_CHAIN (f_stack) = f_grtop;
14115 DECL_CHAIN (f_grtop) = f_vrtop;
14116 DECL_CHAIN (f_vrtop) = f_groff;
14117 DECL_CHAIN (f_groff) = f_vroff;
14118
14119 /* Compute its layout. */
14120 layout_type (va_list_type);
14121
14122 return va_list_type;
14123 }
14124
14125 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14126 static void
14127 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14128 {
14129 const CUMULATIVE_ARGS *cum;
14130 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14131 tree stack, grtop, vrtop, groff, vroff;
14132 tree t;
14133 int gr_save_area_size = cfun->va_list_gpr_size;
14134 int vr_save_area_size = cfun->va_list_fpr_size;
14135 int vr_offset;
14136
14137 cum = &crtl->args.info;
14138 if (cfun->va_list_gpr_size)
14139 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14140 cfun->va_list_gpr_size);
14141 if (cfun->va_list_fpr_size)
14142 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14143 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14144
14145 if (!TARGET_FLOAT)
14146 {
14147 gcc_assert (cum->aapcs_nvrn == 0);
14148 vr_save_area_size = 0;
14149 }
14150
14151 f_stack = TYPE_FIELDS (va_list_type_node);
14152 f_grtop = DECL_CHAIN (f_stack);
14153 f_vrtop = DECL_CHAIN (f_grtop);
14154 f_groff = DECL_CHAIN (f_vrtop);
14155 f_vroff = DECL_CHAIN (f_groff);
14156
14157 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14158 NULL_TREE);
14159 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14160 NULL_TREE);
14161 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14162 NULL_TREE);
14163 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14164 NULL_TREE);
14165 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14166 NULL_TREE);
14167
14168 /* Emit code to initialize STACK, which points to the next varargs stack
14169 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14170 by named arguments. STACK is 8-byte aligned. */
14171 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14172 if (cum->aapcs_stack_size > 0)
14173 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14174 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14175 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14176
14177 /* Emit code to initialize GRTOP, the top of the GR save area.
14178 virtual_incoming_args_rtx should have been 16 byte aligned. */
14179 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14180 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14181 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14182
14183 /* Emit code to initialize VRTOP, the top of the VR save area.
14184 This address is gr_save_area_bytes below GRTOP, rounded
14185 down to the next 16-byte boundary. */
14186 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14187 vr_offset = ROUND_UP (gr_save_area_size,
14188 STACK_BOUNDARY / BITS_PER_UNIT);
14189
14190 if (vr_offset)
14191 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14192 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14193 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14194
14195 /* Emit code to initialize GROFF, the offset from GRTOP of the
14196 next GPR argument. */
14197 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14198 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14199 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14200
14201 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14202 of the next VR argument. */
14203 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14204 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14205 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14206 }
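/* A worked illustration with hypothetical values (not taken from the
   sources): for a variadic function with two named integer arguments and
   one named double, and no named arguments passed on the stack,
   cum->aapcs_ncrn is 2 and cum->aapcs_nvrn is 1, so assuming full
   va_list_gpr_size/va_list_fpr_size the code above produces:

     gr_save_area_size = (8 - 2) * 8  = 48    (x2..x7 saved below __gr_top)
     vr_save_area_size = (8 - 1) * 16 = 112   (q1..q7 saved below __vr_top)

     __stack   = incoming argument pointer
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - 48
     __gr_offs = -48
     __vr_offs = -112  */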
14207
14208 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14209
14210 static tree
14211 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14212 gimple_seq *post_p ATTRIBUTE_UNUSED)
14213 {
14214 tree addr;
14215 bool indirect_p;
14216 bool is_ha; /* is HFA or HVA. */
14217 bool dw_align; /* double-word align. */
14218 machine_mode ag_mode = VOIDmode;
14219 int nregs;
14220 machine_mode mode;
14221
14222 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14223 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14224 HOST_WIDE_INT size, rsize, adjust, align;
14225 tree t, u, cond1, cond2;
14226
14227 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14228 if (indirect_p)
14229 type = build_pointer_type (type);
14230
14231 mode = TYPE_MODE (type);
14232
14233 f_stack = TYPE_FIELDS (va_list_type_node);
14234 f_grtop = DECL_CHAIN (f_stack);
14235 f_vrtop = DECL_CHAIN (f_grtop);
14236 f_groff = DECL_CHAIN (f_vrtop);
14237 f_vroff = DECL_CHAIN (f_groff);
14238
14239 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14240 f_stack, NULL_TREE);
14241 size = int_size_in_bytes (type);
14242
14243 bool abi_break;
14244 align
14245 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14246
14247 dw_align = false;
14248 adjust = 0;
14249 if (aarch64_vfp_is_call_or_return_candidate (mode,
14250 type,
14251 &ag_mode,
14252 &nregs,
14253 &is_ha))
14254 {
14255 /* No frontends can create types with variable-sized modes, so we
14256 shouldn't be asked to pass or return them. */
14257 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14258
14259 /* TYPE passed in fp/simd registers. */
14260 if (!TARGET_FLOAT)
14261 aarch64_err_no_fpadvsimd (mode);
14262
14263 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14264 unshare_expr (valist), f_vrtop, NULL_TREE);
14265 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14266 unshare_expr (valist), f_vroff, NULL_TREE);
14267
14268 rsize = nregs * UNITS_PER_VREG;
14269
14270 if (is_ha)
14271 {
14272 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14273 adjust = UNITS_PER_VREG - ag_size;
14274 }
14275 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14276 && size < UNITS_PER_VREG)
14277 {
14278 adjust = UNITS_PER_VREG - size;
14279 }
14280 }
14281 else
14282 {
14283 /* TYPE passed in general registers. */
14284 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14285 unshare_expr (valist), f_grtop, NULL_TREE);
14286 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14287 unshare_expr (valist), f_groff, NULL_TREE);
14288 rsize = ROUND_UP (size, UNITS_PER_WORD);
14289 nregs = rsize / UNITS_PER_WORD;
14290
14291 if (align > 8)
14292 {
14293 if (abi_break && warn_psabi)
14294 inform (input_location, "parameter passing for argument of type "
14295 "%qT changed in GCC 9.1", type);
14296 dw_align = true;
14297 }
14298
14299 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14300 && size < UNITS_PER_WORD)
14301 {
14302 adjust = UNITS_PER_WORD - size;
14303 }
14304 }
14305
14306 /* Get a local temporary for the field value. */
14307 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14308
14309 /* Emit code to branch if off >= 0. */
14310 t = build2 (GE_EXPR, boolean_type_node, off,
14311 build_int_cst (TREE_TYPE (off), 0));
14312 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14313
14314 if (dw_align)
14315 {
14316 /* Emit: offs = (offs + 15) & -16. */
14317 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14318 build_int_cst (TREE_TYPE (off), 15));
14319 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14320 build_int_cst (TREE_TYPE (off), -16));
14321 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14322 }
14323 else
14324 roundup = NULL;
14325
14326 /* Update ap.__[g|v]r_offs */
14327 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14328 build_int_cst (TREE_TYPE (off), rsize));
14329 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14330
14331 /* String up. */
14332 if (roundup)
14333 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14334
14335 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14336 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14337 build_int_cst (TREE_TYPE (f_off), 0));
14338 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14339
14340 /* String up: make sure the assignment happens before the use. */
14341 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14342 COND_EXPR_ELSE (cond1) = t;
14343
14344 /* Prepare the trees handling the argument that is passed on the stack;
14345 the top-level node will be stored in ON_STACK. */
14346 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14347 if (align > 8)
14348 {
14349 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14350 t = fold_build_pointer_plus_hwi (arg, 15);
14351 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14352 build_int_cst (TREE_TYPE (t), -16));
14353 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14354 }
14355 else
14356 roundup = NULL;
14357 /* Advance ap.__stack */
14358 t = fold_build_pointer_plus_hwi (arg, size + 7);
14359 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14360 build_int_cst (TREE_TYPE (t), -8));
14361 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14362 /* String up roundup and advance. */
14363 if (roundup)
14364 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14365 /* String up with arg */
14366 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14367 /* Big-endianness related address adjustment. */
14368 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14369 && size < UNITS_PER_WORD)
14370 {
14371 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14372 size_int (UNITS_PER_WORD - size));
14373 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14374 }
14375
14376 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14377 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14378
14379 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14380 t = off;
14381 if (adjust)
14382 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14383 build_int_cst (TREE_TYPE (off), adjust));
14384
14385 t = fold_convert (sizetype, t);
14386 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14387
14388 if (is_ha)
14389 {
14390 /* type ha; // treat as "struct {ftype field[n];}"
14391 ... [computing offs]
14392 for (i = 0; i < nregs; ++i, offs += 16)
14393 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14394 return ha; */
14395 int i;
14396 tree tmp_ha, field_t, field_ptr_t;
14397
14398 /* Declare a local variable. */
14399 tmp_ha = create_tmp_var_raw (type, "ha");
14400 gimple_add_tmp_var (tmp_ha);
14401
14402 /* Establish the base type. */
14403 switch (ag_mode)
14404 {
14405 case E_SFmode:
14406 field_t = float_type_node;
14407 field_ptr_t = float_ptr_type_node;
14408 break;
14409 case E_DFmode:
14410 field_t = double_type_node;
14411 field_ptr_t = double_ptr_type_node;
14412 break;
14413 case E_TFmode:
14414 field_t = long_double_type_node;
14415 field_ptr_t = long_double_ptr_type_node;
14416 break;
14417 case E_HFmode:
14418 field_t = aarch64_fp16_type_node;
14419 field_ptr_t = aarch64_fp16_ptr_type_node;
14420 break;
14421 case E_V2SImode:
14422 case E_V4SImode:
14423 {
14424 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14425 field_t = build_vector_type_for_mode (innertype, ag_mode);
14426 field_ptr_t = build_pointer_type (field_t);
14427 }
14428 break;
14429 default:
14430 gcc_assert (0);
14431 }
14432
14433 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14434 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14435 addr = t;
14436 t = fold_convert (field_ptr_t, addr);
14437 t = build2 (MODIFY_EXPR, field_t,
14438 build1 (INDIRECT_REF, field_t, tmp_ha),
14439 build1 (INDIRECT_REF, field_t, t));
14440
14441 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14442 for (i = 1; i < nregs; ++i)
14443 {
14444 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14445 u = fold_convert (field_ptr_t, addr);
14446 u = build2 (MODIFY_EXPR, field_t,
14447 build2 (MEM_REF, field_t, tmp_ha,
14448 build_int_cst (field_ptr_t,
14449 (i *
14450 int_size_in_bytes (field_t)))),
14451 build1 (INDIRECT_REF, field_t, u));
14452 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14453 }
14454
14455 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14456 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14457 }
14458
14459 COND_EXPR_ELSE (cond2) = t;
14460 addr = fold_convert (build_pointer_type (type), cond1);
14461 addr = build_va_arg_indirect_ref (addr);
14462
14463 if (indirect_p)
14464 addr = build_va_arg_indirect_ref (addr);
14465
14466 return addr;
14467 }
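/* In rough pseudo-code (a paraphrase of the trees built above, not a
   literal transcription), the expansion for a general-register candidate
   is:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     if (alignof (type) > 8)              // dw_align
       off = (off + 15) & -16;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off + adjust;   // adjust = big-endian padding
     goto done;

   on_stack:
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
     addr = arg;                          // plus big-endian padding, if any

   The __vr_* fields are used instead for FP/SIMD candidates, and the HA
   case additionally copies each homogeneous element into a local "ha"
   temporary.  */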
14468
14469 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14470
14471 static void
14472 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14473 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14474 int no_rtl)
14475 {
14476 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14477 CUMULATIVE_ARGS local_cum;
14478 int gr_saved = cfun->va_list_gpr_size;
14479 int vr_saved = cfun->va_list_fpr_size;
14480
14481 /* The caller has advanced CUM up to, but not beyond, the last named
14482 argument. Advance a local copy of CUM past the last "real" named
14483 argument, to find out how many registers are left over. */
14484 local_cum = *cum;
14485 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
14486
14487 /* Find out how many registers we need to save.
14488 Honor the tree-stdarg analysis results. */
14489 if (cfun->va_list_gpr_size)
14490 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14491 cfun->va_list_gpr_size / UNITS_PER_WORD);
14492 if (cfun->va_list_fpr_size)
14493 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14494 cfun->va_list_fpr_size / UNITS_PER_VREG);
14495
14496 if (!TARGET_FLOAT)
14497 {
14498 gcc_assert (local_cum.aapcs_nvrn == 0);
14499 vr_saved = 0;
14500 }
14501
14502 if (!no_rtl)
14503 {
14504 if (gr_saved > 0)
14505 {
14506 rtx ptr, mem;
14507
14508 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14509 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14510 - gr_saved * UNITS_PER_WORD);
14511 mem = gen_frame_mem (BLKmode, ptr);
14512 set_mem_alias_set (mem, get_varargs_alias_set ());
14513
14514 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14515 mem, gr_saved);
14516 }
14517 if (vr_saved > 0)
14518 {
14519 /* We can't use move_block_from_reg, because it will use
14520 the wrong mode, storing D regs only. */
14521 machine_mode mode = TImode;
14522 int off, i, vr_start;
14523
14524 /* Set OFF to the offset from virtual_incoming_args_rtx of
14525 the first vector register. The VR save area lies below
14526 the GR one, and is aligned to 16 bytes. */
14527 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14528 STACK_BOUNDARY / BITS_PER_UNIT);
14529 off -= vr_saved * UNITS_PER_VREG;
14530
14531 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14532 for (i = 0; i < vr_saved; ++i)
14533 {
14534 rtx ptr, mem;
14535
14536 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14537 mem = gen_frame_mem (mode, ptr);
14538 set_mem_alias_set (mem, get_varargs_alias_set ());
14539 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14540 off += UNITS_PER_VREG;
14541 }
14542 }
14543 }
14544
14545 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14546 any complication of having crtl->args.pretend_args_size changed. */
14547 cfun->machine->frame.saved_varargs_size
14548 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14549 STACK_BOUNDARY / BITS_PER_UNIT)
14550 + vr_saved * UNITS_PER_VREG);
14551 }
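/* Continuing the hypothetical example given after va_start above (two named
   integer arguments, one named double): gr_saved is 6 and vr_saved is 7, so
   the code stores x2..x7 in a 48-byte block ending at
   virtual_incoming_args_rtx, stores q1..q7 as TImode values in the 112
   bytes below that block, and records
   saved_varargs_size = ROUND_UP (48, 16) + 112 = 160.  */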
14552
14553 static void
14554 aarch64_conditional_register_usage (void)
14555 {
14556 int i;
14557 if (!TARGET_FLOAT)
14558 {
14559 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14560 {
14561 fixed_regs[i] = 1;
14562 call_used_regs[i] = 1;
14563 }
14564 }
14565 if (!TARGET_SVE)
14566 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14567 {
14568 fixed_regs[i] = 1;
14569 call_used_regs[i] = 1;
14570 }
14571
14572 /* When tracking speculation, we need a couple of call-clobbered registers
14573 to track the speculation state. It would be nice to just use
14574 IP0 and IP1, but currently there are numerous places that just
14575 assume these registers are free for other uses (eg pointer
14576 authentication). */
14577 if (aarch64_track_speculation)
14578 {
14579 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14580 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14581 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14582 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14583 }
14584 }
14585
14586 /* Walk down the type tree of TYPE counting consecutive base elements.
14587 If *MODEP is VOIDmode, then set it to the first valid floating point
14588 type. If a non-floating point type is found, or if a floating point
14589 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14590 otherwise return the count in the sub-tree. */
14591 static int
14592 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14593 {
14594 machine_mode mode;
14595 HOST_WIDE_INT size;
14596
14597 switch (TREE_CODE (type))
14598 {
14599 case REAL_TYPE:
14600 mode = TYPE_MODE (type);
14601 if (mode != DFmode && mode != SFmode
14602 && mode != TFmode && mode != HFmode)
14603 return -1;
14604
14605 if (*modep == VOIDmode)
14606 *modep = mode;
14607
14608 if (*modep == mode)
14609 return 1;
14610
14611 break;
14612
14613 case COMPLEX_TYPE:
14614 mode = TYPE_MODE (TREE_TYPE (type));
14615 if (mode != DFmode && mode != SFmode
14616 && mode != TFmode && mode != HFmode)
14617 return -1;
14618
14619 if (*modep == VOIDmode)
14620 *modep = mode;
14621
14622 if (*modep == mode)
14623 return 2;
14624
14625 break;
14626
14627 case VECTOR_TYPE:
14628 /* Use V2SImode and V4SImode as representatives of all 64-bit
14629 and 128-bit vector types. */
14630 size = int_size_in_bytes (type);
14631 switch (size)
14632 {
14633 case 8:
14634 mode = V2SImode;
14635 break;
14636 case 16:
14637 mode = V4SImode;
14638 break;
14639 default:
14640 return -1;
14641 }
14642
14643 if (*modep == VOIDmode)
14644 *modep = mode;
14645
14646 /* Vector modes are considered to be opaque: two vectors are
14647 equivalent for the purposes of being homogeneous aggregates
14648 if they are the same size. */
14649 if (*modep == mode)
14650 return 1;
14651
14652 break;
14653
14654 case ARRAY_TYPE:
14655 {
14656 int count;
14657 tree index = TYPE_DOMAIN (type);
14658
14659 /* Can't handle incomplete types nor sizes that are not
14660 fixed. */
14661 if (!COMPLETE_TYPE_P (type)
14662 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14663 return -1;
14664
14665 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14666 if (count == -1
14667 || !index
14668 || !TYPE_MAX_VALUE (index)
14669 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14670 || !TYPE_MIN_VALUE (index)
14671 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14672 || count < 0)
14673 return -1;
14674
14675 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14676 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14677
14678 /* There must be no padding. */
14679 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14680 count * GET_MODE_BITSIZE (*modep)))
14681 return -1;
14682
14683 return count;
14684 }
14685
14686 case RECORD_TYPE:
14687 {
14688 int count = 0;
14689 int sub_count;
14690 tree field;
14691
14692 /* Can't handle incomplete types nor sizes that are not
14693 fixed. */
14694 if (!COMPLETE_TYPE_P (type)
14695 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14696 return -1;
14697
14698 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14699 {
14700 if (TREE_CODE (field) != FIELD_DECL)
14701 continue;
14702
14703 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14704 if (sub_count < 0)
14705 return -1;
14706 count += sub_count;
14707 }
14708
14709 /* There must be no padding. */
14710 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14711 count * GET_MODE_BITSIZE (*modep)))
14712 return -1;
14713
14714 return count;
14715 }
14716
14717 case UNION_TYPE:
14718 case QUAL_UNION_TYPE:
14719 {
14720 /* These aren't very interesting except in a degenerate case. */
14721 int count = 0;
14722 int sub_count;
14723 tree field;
14724
14725 /* Can't handle incomplete types nor sizes that are not
14726 fixed. */
14727 if (!COMPLETE_TYPE_P (type)
14728 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14729 return -1;
14730
14731 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14732 {
14733 if (TREE_CODE (field) != FIELD_DECL)
14734 continue;
14735
14736 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14737 if (sub_count < 0)
14738 return -1;
14739 count = count > sub_count ? count : sub_count;
14740 }
14741
14742 /* There must be no padding. */
14743 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14744 count * GET_MODE_BITSIZE (*modep)))
14745 return -1;
14746
14747 return count;
14748 }
14749
14750 default:
14751 break;
14752 }
14753
14754 return -1;
14755 }
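/* Illustrative examples (the types are hypothetical, not from the sources):
   "struct { float x, y, z; }" returns 3 with *MODEP set to SFmode;
   "_Complex double" returns 2 with *MODEP set to DFmode;
   "struct { float f; double d; }" returns -1, because the second element's
   mode does not match the SFmode recorded for the first.  */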
14756
14757 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14758 type as described in AAPCS64 \S 4.1.2.
14759
14760 See the comment above aarch64_composite_type_p for the notes on MODE. */
14761
14762 static bool
14763 aarch64_short_vector_p (const_tree type,
14764 machine_mode mode)
14765 {
14766 poly_int64 size = -1;
14767
14768 if (type && TREE_CODE (type) == VECTOR_TYPE)
14769 size = int_size_in_bytes (type);
14770 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14771 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14772 size = GET_MODE_SIZE (mode);
14773
14774 return known_eq (size, 8) || known_eq (size, 16);
14775 }
14776
14777 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14778 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14779 array types. The C99 floating-point complex types are also considered
14780 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14781 types, which are GCC extensions and out of the scope of AAPCS64, are
14782 treated as composite types here as well.
14783
14784 Note that MODE itself is not sufficient in determining whether a type
14785 is such a composite type or not. This is because
14786 stor-layout.c:compute_record_mode may have already changed the MODE
14787 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14788 structure with only one field may have its MODE set to the mode of the
14789 field. Also an integer mode whose size matches the size of the
14790 RECORD_TYPE type may be used to substitute the original mode
14791 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14792 solely relied on. */
14793
14794 static bool
14795 aarch64_composite_type_p (const_tree type,
14796 machine_mode mode)
14797 {
14798 if (aarch64_short_vector_p (type, mode))
14799 return false;
14800
14801 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14802 return true;
14803
14804 if (mode == BLKmode
14805 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14806 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14807 return true;
14808
14809 return false;
14810 }
14811
14812 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14813 shall be passed or returned in simd/fp register(s) (providing these
14814 parameter passing registers are available).
14815
14816 Upon successful return, *COUNT returns the number of needed registers,
14817 *BASE_MODE returns the mode of the individual register, and when IS_HA
14818 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14819 floating-point aggregate or a homogeneous short-vector aggregate. */
14820
14821 static bool
14822 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14823 const_tree type,
14824 machine_mode *base_mode,
14825 int *count,
14826 bool *is_ha)
14827 {
14828 machine_mode new_mode = VOIDmode;
14829 bool composite_p = aarch64_composite_type_p (type, mode);
14830
14831 if (is_ha != NULL) *is_ha = false;
14832
14833 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14834 || aarch64_short_vector_p (type, mode))
14835 {
14836 *count = 1;
14837 new_mode = mode;
14838 }
14839 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14840 {
14841 if (is_ha != NULL) *is_ha = true;
14842 *count = 2;
14843 new_mode = GET_MODE_INNER (mode);
14844 }
14845 else if (type && composite_p)
14846 {
14847 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14848
14849 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14850 {
14851 if (is_ha != NULL) *is_ha = true;
14852 *count = ag_count;
14853 }
14854 else
14855 return false;
14856 }
14857 else
14858 return false;
14859
14860 *base_mode = new_mode;
14861 return true;
14862 }
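/* For instance (hypothetical argument type): "struct { double x, y; }" is a
   homogeneous floating-point aggregate, so this returns true with
   *COUNT = 2, *BASE_MODE = DFmode and *IS_HA = true, i.e. the argument is
   passed in two consecutive FP/SIMD registers when enough of them are
   available.  An aggregate with more than HA_MAX_NUM_FLDS (4) elements is
   rejected.  */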
14863
14864 /* Implement TARGET_STRUCT_VALUE_RTX. */
14865
14866 static rtx
14867 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14868 int incoming ATTRIBUTE_UNUSED)
14869 {
14870 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14871 }
14872
14873 /* Implements target hook vector_mode_supported_p. */
14874 static bool
14875 aarch64_vector_mode_supported_p (machine_mode mode)
14876 {
14877 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14878 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14879 }
14880
14881 /* Return the full-width SVE vector mode for element mode MODE, if one
14882 exists. */
14883 opt_machine_mode
14884 aarch64_full_sve_mode (scalar_mode mode)
14885 {
14886 switch (mode)
14887 {
14888 case E_DFmode:
14889 return VNx2DFmode;
14890 case E_SFmode:
14891 return VNx4SFmode;
14892 case E_HFmode:
14893 return VNx8HFmode;
14894 case E_DImode:
14895 return VNx2DImode;
14896 case E_SImode:
14897 return VNx4SImode;
14898 case E_HImode:
14899 return VNx8HImode;
14900 case E_QImode:
14901 return VNx16QImode;
14902 default:
14903 return opt_machine_mode ();
14904 }
14905 }
14906
14907 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14908 if it exists. */
14909 opt_machine_mode
14910 aarch64_vq_mode (scalar_mode mode)
14911 {
14912 switch (mode)
14913 {
14914 case E_DFmode:
14915 return V2DFmode;
14916 case E_SFmode:
14917 return V4SFmode;
14918 case E_HFmode:
14919 return V8HFmode;
14920 case E_SImode:
14921 return V4SImode;
14922 case E_HImode:
14923 return V8HImode;
14924 case E_QImode:
14925 return V16QImode;
14926 case E_DImode:
14927 return V2DImode;
14928 default:
14929 return opt_machine_mode ();
14930 }
14931 }
14932
14933 /* Return the appropriate SIMD container mode
14934 for MODE within a vector of WIDTH bits. */
14935 static machine_mode
14936 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14937 {
14938 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14939 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14940
14941 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14942 if (TARGET_SIMD)
14943 {
14944 if (known_eq (width, 128))
14945 return aarch64_vq_mode (mode).else_mode (word_mode);
14946 else
14947 switch (mode)
14948 {
14949 case E_SFmode:
14950 return V2SFmode;
14951 case E_HFmode:
14952 return V4HFmode;
14953 case E_SImode:
14954 return V2SImode;
14955 case E_HImode:
14956 return V4HImode;
14957 case E_QImode:
14958 return V8QImode;
14959 default:
14960 break;
14961 }
14962 }
14963 return word_mode;
14964 }
14965
14966 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14967 static machine_mode
14968 aarch64_preferred_simd_mode (scalar_mode mode)
14969 {
14970 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14971 return aarch64_simd_container_mode (mode, bits);
14972 }
14973
14974 /* Return a list of possible vector sizes for the vectorizer
14975 to iterate over. */
14976 static void
14977 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14978 {
14979 if (TARGET_SVE)
14980 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14981 sizes->safe_push (16);
14982 sizes->safe_push (8);
14983 }
14984
14985 /* Implement TARGET_MANGLE_TYPE. */
14986
14987 static const char *
14988 aarch64_mangle_type (const_tree type)
14989 {
14990 /* The AArch64 ABI documents say that "__va_list" has to be
14991 mangled as if it is in the "std" namespace. */
14992 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14993 return "St9__va_list";
14994
14995 /* Half-precision float. */
14996 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14997 return "Dh";
14998
14999 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15000 builtin types. */
15001 if (TYPE_NAME (type) != NULL)
15002 return aarch64_mangle_builtin_type (type);
15003
15004 /* Use the default mangling. */
15005 return NULL;
15006 }
15007
15008 /* Find the first rtx_insn before insn that will generate an assembly
15009 instruction. */
15010
15011 static rtx_insn *
15012 aarch64_prev_real_insn (rtx_insn *insn)
15013 {
15014 if (!insn)
15015 return NULL;
15016
15017 do
15018 {
15019 insn = prev_real_insn (insn);
15020 }
15021 while (insn && recog_memoized (insn) < 0);
15022
15023 return insn;
15024 }
15025
15026 static bool
15027 is_madd_op (enum attr_type t1)
15028 {
15029 unsigned int i;
15030 /* A number of these may be AArch32 only. */
15031 enum attr_type mlatypes[] = {
15032 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15033 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15034 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15035 };
15036
15037 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15038 {
15039 if (t1 == mlatypes[i])
15040 return true;
15041 }
15042
15043 return false;
15044 }
15045
15046 /* Check if there is a register dependency between a load and the insn
15047 for which we hold recog_data. */
15048
15049 static bool
15050 dep_between_memop_and_curr (rtx memop)
15051 {
15052 rtx load_reg;
15053 int opno;
15054
15055 gcc_assert (GET_CODE (memop) == SET);
15056
15057 if (!REG_P (SET_DEST (memop)))
15058 return false;
15059
15060 load_reg = SET_DEST (memop);
15061 for (opno = 1; opno < recog_data.n_operands; opno++)
15062 {
15063 rtx operand = recog_data.operand[opno];
15064 if (REG_P (operand)
15065 && reg_overlap_mentioned_p (load_reg, operand))
15066 return true;
15067
15068 }
15069 return false;
15070 }
15071
15072
15073 /* When working around the Cortex-A53 erratum 835769,
15074 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15075 instruction and has a preceding memory instruction such that a NOP
15076 should be inserted between them. */
15077
15078 bool
15079 aarch64_madd_needs_nop (rtx_insn* insn)
15080 {
15081 enum attr_type attr_type;
15082 rtx_insn *prev;
15083 rtx body;
15084
15085 if (!TARGET_FIX_ERR_A53_835769)
15086 return false;
15087
15088 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15089 return false;
15090
15091 attr_type = get_attr_type (insn);
15092 if (!is_madd_op (attr_type))
15093 return false;
15094
15095 prev = aarch64_prev_real_insn (insn);
15096 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15097 Restore recog state to INSN to avoid state corruption. */
15098 extract_constrain_insn_cached (insn);
15099
15100 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15101 return false;
15102
15103 body = single_set (prev);
15104
15105 /* If the previous insn is a memory op and there is no dependency between
15106 it and the DImode madd, emit a NOP between them. If body is NULL then we
15107 have a complex memory operation, probably a load/store pair.
15108 Be conservative for now and emit a NOP. */
15109 if (GET_MODE (recog_data.operand[0]) == DImode
15110 && (!body || !dep_between_memop_and_curr (body)))
15111 return true;
15112
15113 return false;
15114
15115 }
15116
15117
15118 /* Implement FINAL_PRESCAN_INSN. */
15119
15120 void
15121 aarch64_final_prescan_insn (rtx_insn *insn)
15122 {
15123 if (aarch64_madd_needs_nop (insn))
15124 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15125 }
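/* As an illustration (assembly sketched by hand, not taken from a real
   compilation), with the workaround enabled the output around an affected
   sequence would look like:

       ldr     x2, [x0]
       nop     // between mem op and mult-accumulate
       madd    x1, x3, x4, x1

   The NOP is emitted only when the multiply-accumulate is 64-bit and the
   preceding memory operation either cannot be analyzed (no single_set) or
   has no register dependency on the multiply-accumulate.  */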
15126
15127
15128 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15129 instruction. */
15130
15131 bool
15132 aarch64_sve_index_immediate_p (rtx base_or_step)
15133 {
15134 return (CONST_INT_P (base_or_step)
15135 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15136 }
15137
15138 /* Return true if X is a valid immediate for the SVE ADD and SUB
15139 instructions. Negate X first if NEGATE_P is true. */
15140
15141 bool
15142 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15143 {
15144 rtx elt;
15145
15146 if (!const_vec_duplicate_p (x, &elt)
15147 || !CONST_INT_P (elt))
15148 return false;
15149
15150 HOST_WIDE_INT val = INTVAL (elt);
15151 if (negate_p)
15152 val = -val;
15153 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15154
15155 if (val & 0xff)
15156 return IN_RANGE (val, 0, 0xff);
15157 return IN_RANGE (val, 0, 0xff00);
15158 }
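/* Restated with example values (illustrative only): the replicated element,
   after optional negation, must be either an unsigned byte (0..0xff) or an
   unsigned byte shifted left by 8 (0x100, 0x200, ..., 0xff00).  So #255 and
   #0x1200 are accepted, while #257 (0x101) is rejected.  */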
15159
15160 /* Return true if X is a valid immediate operand for an SVE logical
15161 instruction such as AND. */
15162
15163 bool
15164 aarch64_sve_bitmask_immediate_p (rtx x)
15165 {
15166 rtx elt;
15167
15168 return (const_vec_duplicate_p (x, &elt)
15169 && CONST_INT_P (elt)
15170 && aarch64_bitmask_imm (INTVAL (elt),
15171 GET_MODE_INNER (GET_MODE (x))));
15172 }
15173
15174 /* Return true if X is a valid immediate for the SVE DUP and CPY
15175 instructions. */
15176
15177 bool
15178 aarch64_sve_dup_immediate_p (rtx x)
15179 {
15180 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15181 if (!CONST_INT_P (x))
15182 return false;
15183
15184 HOST_WIDE_INT val = INTVAL (x);
15185 if (val & 0xff)
15186 return IN_RANGE (val, -0x80, 0x7f);
15187 return IN_RANGE (val, -0x8000, 0x7f00);
15188 }
15189
15190 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15191 SIGNED_P says whether the operand is signed rather than unsigned. */
15192
15193 bool
15194 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15195 {
15196 rtx elt;
15197
15198 return (const_vec_duplicate_p (x, &elt)
15199 && CONST_INT_P (elt)
15200 && (signed_p
15201 ? IN_RANGE (INTVAL (elt), -16, 15)
15202 : IN_RANGE (INTVAL (elt), 0, 127)));
15203 }
15204
15205 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15206 instruction. Negate X first if NEGATE_P is true. */
15207
15208 bool
15209 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15210 {
15211 rtx elt;
15212 REAL_VALUE_TYPE r;
15213
15214 if (!const_vec_duplicate_p (x, &elt)
15215 || GET_CODE (elt) != CONST_DOUBLE)
15216 return false;
15217
15218 r = *CONST_DOUBLE_REAL_VALUE (elt);
15219
15220 if (negate_p)
15221 r = real_value_negate (&r);
15222
15223 if (real_equal (&r, &dconst1))
15224 return true;
15225 if (real_equal (&r, &dconsthalf))
15226 return true;
15227 return false;
15228 }
15229
15230 /* Return true if X is a valid immediate operand for an SVE FMUL
15231 instruction. */
15232
15233 bool
15234 aarch64_sve_float_mul_immediate_p (rtx x)
15235 {
15236 rtx elt;
15237
15238 return (const_vec_duplicate_p (x, &elt)
15239 && GET_CODE (elt) == CONST_DOUBLE
15240 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15241 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15242 }
15243
15244 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15245 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15246 is nonnull, use it to describe valid immediates. */
15247 static bool
15248 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15249 simd_immediate_info *info,
15250 enum simd_immediate_check which,
15251 simd_immediate_info::insn_type insn)
15252 {
15253 /* Try a 4-byte immediate with LSL. */
15254 for (unsigned int shift = 0; shift < 32; shift += 8)
15255 if ((val32 & (0xff << shift)) == val32)
15256 {
15257 if (info)
15258 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15259 simd_immediate_info::LSL, shift);
15260 return true;
15261 }
15262
15263 /* Try a 2-byte immediate with LSL. */
15264 unsigned int imm16 = val32 & 0xffff;
15265 if (imm16 == (val32 >> 16))
15266 for (unsigned int shift = 0; shift < 16; shift += 8)
15267 if ((imm16 & (0xff << shift)) == imm16)
15268 {
15269 if (info)
15270 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15271 simd_immediate_info::LSL, shift);
15272 return true;
15273 }
15274
15275 /* Try a 4-byte immediate with MSL, except for cases that MVN
15276 can handle. */
15277 if (which == AARCH64_CHECK_MOV)
15278 for (unsigned int shift = 8; shift < 24; shift += 8)
15279 {
15280 unsigned int low = (1 << shift) - 1;
15281 if (((val32 & (0xff << shift)) | low) == val32)
15282 {
15283 if (info)
15284 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15285 simd_immediate_info::MSL, shift);
15286 return true;
15287 }
15288 }
15289
15290 return false;
15291 }
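/* Example encodings (values chosen for illustration): 0x00ab0000 is matched
   as a 4-byte immediate #0xab, LSL #16; 0xab00ab00 is matched as a 2-byte
   immediate #0xab, LSL #8; and for MOV checks 0x0000abff is matched as
   #0xab, MSL #8, because all bits below the shifted byte are ones.  */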
15292
15293 /* Return true if replicating VAL64 is a valid immediate for the
15294 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15295 use it to describe valid immediates. */
15296 static bool
15297 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15298 simd_immediate_info *info,
15299 enum simd_immediate_check which)
15300 {
15301 unsigned int val32 = val64 & 0xffffffff;
15302 unsigned int val16 = val64 & 0xffff;
15303 unsigned int val8 = val64 & 0xff;
15304
15305 if (val32 == (val64 >> 32))
15306 {
15307 if ((which & AARCH64_CHECK_ORR) != 0
15308 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15309 simd_immediate_info::MOV))
15310 return true;
15311
15312 if ((which & AARCH64_CHECK_BIC) != 0
15313 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15314 simd_immediate_info::MVN))
15315 return true;
15316
15317 /* Try using a replicated byte. */
15318 if (which == AARCH64_CHECK_MOV
15319 && val16 == (val32 >> 16)
15320 && val8 == (val16 >> 8))
15321 {
15322 if (info)
15323 *info = simd_immediate_info (QImode, val8);
15324 return true;
15325 }
15326 }
15327
15328 /* Try using a bit-to-bytemask. */
15329 if (which == AARCH64_CHECK_MOV)
15330 {
15331 unsigned int i;
15332 for (i = 0; i < 64; i += 8)
15333 {
15334 unsigned char byte = (val64 >> i) & 0xff;
15335 if (byte != 0 && byte != 0xff)
15336 break;
15337 }
15338 if (i == 64)
15339 {
15340 if (info)
15341 *info = simd_immediate_info (DImode, val64);
15342 return true;
15343 }
15344 }
15345 return false;
15346 }
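/* Two further illustrative cases (example values only):
   0x2a2a2a2a2a2a2a2a repeats a single byte and is therefore accepted as a
   QImode MOVI immediate; 0xff0000ff00ffff00 contains only 0x00 and 0xff
   bytes and is therefore accepted by the bit-to-bytemask check as a DImode
   MOVI immediate.  */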
15347
15348 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15349 instruction. If INFO is nonnull, use it to describe valid immediates. */
15350
15351 static bool
15352 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15353 simd_immediate_info *info)
15354 {
15355 scalar_int_mode mode = DImode;
15356 unsigned int val32 = val64 & 0xffffffff;
15357 if (val32 == (val64 >> 32))
15358 {
15359 mode = SImode;
15360 unsigned int val16 = val32 & 0xffff;
15361 if (val16 == (val32 >> 16))
15362 {
15363 mode = HImode;
15364 unsigned int val8 = val16 & 0xff;
15365 if (val8 == (val16 >> 8))
15366 mode = QImode;
15367 }
15368 }
15369 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15370 if (IN_RANGE (val, -0x80, 0x7f))
15371 {
15372 /* DUP with no shift. */
15373 if (info)
15374 *info = simd_immediate_info (mode, val);
15375 return true;
15376 }
15377 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15378 {
15379 /* DUP with LSL #8. */
15380 if (info)
15381 *info = simd_immediate_info (mode, val);
15382 return true;
15383 }
15384 if (aarch64_bitmask_imm (val64, mode))
15385 {
15386 /* DUPM. */
15387 if (info)
15388 *info = simd_immediate_info (mode, val);
15389 return true;
15390 }
15391 return false;
15392 }
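/* Examples (illustrative values only): replicating 0x01 in every byte gives
   QImode value 1, matched by the plain DUP case; replicating 0x1200 in
   every halfword gives HImode value 0x1200, matched by DUP with LSL #8; a
   pattern such as 0x00ff repeated in every halfword is outside the DUP
   ranges and falls through to the bitmask-immediate (DUPM) check.  */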
15393
15394 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15395 it to describe valid immediates. */
15396
15397 static bool
15398 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15399 {
15400 if (x == CONST0_RTX (GET_MODE (x)))
15401 {
15402 if (info)
15403 *info = simd_immediate_info (DImode, 0);
15404 return true;
15405 }
15406
15407 /* Analyze the value as a VNx16BImode constant. This should be relatively
15408 efficient, since rtx_vector_builder has enough built-in capacity
15409 to store all VLA predicate constants without needing the heap. */
15410 rtx_vector_builder builder;
15411 if (!aarch64_get_sve_pred_bits (builder, x))
15412 return false;
15413
15414 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15415 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15416 {
15417 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15418 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15419 if (pattern != AARCH64_NUM_SVPATTERNS)
15420 {
15421 if (info)
15422 {
15423 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15424 *info = simd_immediate_info (int_mode, pattern);
15425 }
15426 return true;
15427 }
15428 }
15429 return false;
15430 }
15431
15432 /* Return true if OP is a valid SIMD immediate for the operation
15433 described by WHICH. If INFO is nonnull, use it to describe valid
15434 immediates. */
15435 bool
15436 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15437 enum simd_immediate_check which)
15438 {
15439 machine_mode mode = GET_MODE (op);
15440 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15441 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15442 return false;
15443
15444 if (vec_flags & VEC_SVE_PRED)
15445 return aarch64_sve_pred_valid_immediate (op, info);
15446
15447 scalar_mode elt_mode = GET_MODE_INNER (mode);
15448 rtx base, step;
15449 unsigned int n_elts;
15450 if (GET_CODE (op) == CONST_VECTOR
15451 && CONST_VECTOR_DUPLICATE_P (op))
15452 n_elts = CONST_VECTOR_NPATTERNS (op);
15453 else if ((vec_flags & VEC_SVE_DATA)
15454 && const_vec_series_p (op, &base, &step))
15455 {
15456 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15457 if (!aarch64_sve_index_immediate_p (base)
15458 || !aarch64_sve_index_immediate_p (step))
15459 return false;
15460
15461 if (info)
15462 *info = simd_immediate_info (elt_mode, base, step);
15463 return true;
15464 }
15465 else if (GET_CODE (op) == CONST_VECTOR
15466 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15467 /* N_ELTS set above. */;
15468 else
15469 return false;
15470
15471 scalar_float_mode elt_float_mode;
15472 if (n_elts == 1
15473 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15474 {
15475 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15476 if (aarch64_float_const_zero_rtx_p (elt)
15477 || aarch64_float_const_representable_p (elt))
15478 {
15479 if (info)
15480 *info = simd_immediate_info (elt_float_mode, elt);
15481 return true;
15482 }
15483 }
15484
15485 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15486 if (elt_size > 8)
15487 return false;
15488
15489 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15490
15491 /* Expand the vector constant out into a byte vector, with the least
15492 significant byte of the register first. */
15493 auto_vec<unsigned char, 16> bytes;
15494 bytes.reserve (n_elts * elt_size);
15495 for (unsigned int i = 0; i < n_elts; i++)
15496 {
15497 /* The vector is provided in gcc endian-neutral fashion.
15498 For aarch64_be Advanced SIMD, it must be laid out in the vector
15499 register in reverse order. */
15500 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15501 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15502
15503 if (elt_mode != elt_int_mode)
15504 elt = gen_lowpart (elt_int_mode, elt);
15505
15506 if (!CONST_INT_P (elt))
15507 return false;
15508
15509 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15510 for (unsigned int byte = 0; byte < elt_size; byte++)
15511 {
15512 bytes.quick_push (elt_val & 0xff);
15513 elt_val >>= BITS_PER_UNIT;
15514 }
15515 }
15516
15517 /* The immediate must repeat every eight bytes. */
15518 unsigned int nbytes = bytes.length ();
15519 for (unsigned i = 8; i < nbytes; ++i)
15520 if (bytes[i] != bytes[i - 8])
15521 return false;
15522
15523 /* Get the repeating 8-byte value as an integer. No endian correction
15524 is needed here because bytes is already in lsb-first order. */
15525 unsigned HOST_WIDE_INT val64 = 0;
15526 for (unsigned int i = 0; i < 8; i++)
15527 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15528 << (i * BITS_PER_UNIT));
15529
15530 if (vec_flags & VEC_SVE_DATA)
15531 return aarch64_sve_valid_immediate (val64, info);
15532 else
15533 return aarch64_advsimd_valid_immediate (val64, info, which);
15534 }
15535
15536 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15537 has a step in the range of INDEX. Return the index expression if so,
15538 otherwise return null. */
15539 rtx
15540 aarch64_check_zero_based_sve_index_immediate (rtx x)
15541 {
15542 rtx base, step;
15543 if (const_vec_series_p (x, &base, &step)
15544 && base == const0_rtx
15545 && aarch64_sve_index_immediate_p (step))
15546 return step;
15547 return NULL_RTX;
15548 }
15549
15550 /* Check if immediate shift constants are within range. */
15551 bool
15552 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15553 {
15554 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15555 if (left)
15556 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15557 else
15558 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15559 }
15560
15561 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15562 operation of width WIDTH at bit position POS. */
15563
15564 rtx
15565 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15566 {
15567 gcc_assert (CONST_INT_P (width));
15568 gcc_assert (CONST_INT_P (pos));
15569
15570 unsigned HOST_WIDE_INT mask
15571 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15572 return GEN_INT (mask << UINTVAL (pos));
15573 }
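/* For example (values chosen for illustration): WIDTH = 8 and POS = 16 give
   ((1 << 8) - 1) << 16 = 0xff0000, i.e. the mask selecting bits 16..23.  */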
15574
15575 bool
15576 aarch64_mov_operand_p (rtx x, machine_mode mode)
15577 {
15578 if (GET_CODE (x) == HIGH
15579 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15580 return true;
15581
15582 if (CONST_INT_P (x))
15583 return true;
15584
15585 if (VECTOR_MODE_P (GET_MODE (x)))
15586 {
15587 /* Require predicate constants to be VNx16BI before RA, so that we
15588 force everything to have a canonical form. */
15589 if (!lra_in_progress
15590 && !reload_completed
15591 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15592 && GET_MODE (x) != VNx16BImode)
15593 return false;
15594
15595 return aarch64_simd_valid_immediate (x, NULL);
15596 }
15597
15598 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15599 return true;
15600
15601 if (aarch64_sve_cnt_immediate_p (x))
15602 return true;
15603
15604 return aarch64_classify_symbolic_expression (x)
15605 == SYMBOL_TINY_ABSOLUTE;
15606 }
15607
15608 /* Return a const_int vector of VAL. */
15609 rtx
15610 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15611 {
15612 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15613 return gen_const_vec_duplicate (mode, c);
15614 }
15615
15616 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15617
15618 bool
15619 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15620 {
15621 machine_mode vmode;
15622
15623 vmode = aarch64_simd_container_mode (mode, 64);
15624 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15625 return aarch64_simd_valid_immediate (op_v, NULL);
15626 }
15627
15628 /* Construct and return a PARALLEL RTX vector with elements numbering the
15629 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15630 the vector - from the perspective of the architecture. This does not
15631 line up with GCC's perspective on lane numbers, so we end up with
15632 different masks depending on our target endian-ness. The diagram
15633 below may help. We must draw the distinction when building masks
15634 which select one half of the vector. An instruction selecting
15635 architectural low-lanes for a big-endian target must be described using
15636 a mask selecting GCC high-lanes.
15637
15638 Big-Endian Little-Endian
15639
15640 GCC 0 1 2 3 3 2 1 0
15641 | x | x | x | x | | x | x | x | x |
15642 Architecture 3 2 1 0 3 2 1 0
15643
15644 Low Mask: { 2, 3 } { 0, 1 }
15645 High Mask: { 0, 1 } { 2, 3 }
15646
15647 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15648
15649 rtx
15650 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15651 {
15652 rtvec v = rtvec_alloc (nunits / 2);
15653 int high_base = nunits / 2;
15654 int low_base = 0;
15655 int base;
15656 rtx t1;
15657 int i;
15658
15659 if (BYTES_BIG_ENDIAN)
15660 base = high ? low_base : high_base;
15661 else
15662 base = high ? high_base : low_base;
15663
15664 for (i = 0; i < nunits / 2; i++)
15665 RTVEC_ELT (v, i) = GEN_INT (base + i);
15666
15667 t1 = gen_rtx_PARALLEL (mode, v);
15668 return t1;
15669 }
15670
15671 /* Check OP for validity as a PARALLEL RTX vector with elements
15672 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15673 from the perspective of the architecture. See the diagram above
15674 aarch64_simd_vect_par_cnst_half for more details. */
15675
15676 bool
15677 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15678 bool high)
15679 {
15680 int nelts;
15681 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15682 return false;
15683
15684 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15685 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15686 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15687 int i = 0;
15688
15689 if (count_op != count_ideal)
15690 return false;
15691
15692 for (i = 0; i < count_ideal; i++)
15693 {
15694 rtx elt_op = XVECEXP (op, 0, i);
15695 rtx elt_ideal = XVECEXP (ideal, 0, i);
15696
15697 if (!CONST_INT_P (elt_op)
15698 || INTVAL (elt_ideal) != INTVAL (elt_op))
15699 return false;
15700 }
15701 return true;
15702 }
15703
15704 /* Return a PARALLEL containing NELTS elements, with element I equal
15705 to BASE + I * STEP. */
15706
15707 rtx
15708 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15709 {
15710 rtvec vec = rtvec_alloc (nelts);
15711 for (unsigned int i = 0; i < nelts; ++i)
15712 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15713 return gen_rtx_PARALLEL (VOIDmode, vec);
15714 }
15715
15716 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15717 series with step STEP. */
15718
15719 bool
15720 aarch64_stepped_int_parallel_p (rtx op, int step)
15721 {
15722 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15723 return false;
15724
15725 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15726 for (int i = 1; i < XVECLEN (op, 0); ++i)
15727 if (!CONST_INT_P (XVECEXP (op, 0, i))
15728 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15729 return false;
15730
15731 return true;
15732 }
15733
15734 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15735 HIGH (exclusive). */
15736 void
15737 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15738 const_tree exp)
15739 {
15740 HOST_WIDE_INT lane;
15741 gcc_assert (CONST_INT_P (operand));
15742 lane = INTVAL (operand);
15743
15744 if (lane < low || lane >= high)
15745 {
15746 if (exp)
15747 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15748 else
15749 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15750 }
15751 }
15752
15753 /* Perform endian correction on lane number N, which indexes a vector
15754 of mode MODE, and return the result as an SImode rtx. */
15755
15756 rtx
15757 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15758 {
15759 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15760 }
15761
15762 /* Return TRUE if OP is a valid vector addressing mode. */
15763
15764 bool
15765 aarch64_simd_mem_operand_p (rtx op)
15766 {
15767 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15768 || REG_P (XEXP (op, 0)));
15769 }
15770
15771 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15772
15773 bool
15774 aarch64_sve_ld1r_operand_p (rtx op)
15775 {
15776 struct aarch64_address_info addr;
15777 scalar_mode mode;
15778
15779 return (MEM_P (op)
15780 && is_a <scalar_mode> (GET_MODE (op), &mode)
15781 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15782 && addr.type == ADDRESS_REG_IMM
15783 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15784 }
15785
15786 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15787 bool
15788 aarch64_sve_ld1rq_operand_p (rtx op)
15789 {
15790 struct aarch64_address_info addr;
15791 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15792 if (!MEM_P (op)
15793 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15794 return false;
15795
15796 if (addr.type == ADDRESS_REG_IMM)
15797 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15798
15799 if (addr.type == ADDRESS_REG_REG)
15800 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15801
15802 return false;
15803 }
15804
15805 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15806 The conditions for STR are the same. */
15807 bool
15808 aarch64_sve_ldr_operand_p (rtx op)
15809 {
15810 struct aarch64_address_info addr;
15811
15812 return (MEM_P (op)
15813 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15814 false, ADDR_QUERY_ANY)
15815 && addr.type == ADDRESS_REG_IMM);
15816 }
15817
15818 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15819 We need to be able to access the individual pieces, so the range
15820 is different from LD[234] and ST[234]. */
15821 bool
15822 aarch64_sve_struct_memory_operand_p (rtx op)
15823 {
15824 if (!MEM_P (op))
15825 return false;
15826
15827 machine_mode mode = GET_MODE (op);
15828 struct aarch64_address_info addr;
15829 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15830 ADDR_QUERY_ANY)
15831 || addr.type != ADDRESS_REG_IMM)
15832 return false;
15833
15834 poly_int64 first = addr.const_offset;
15835 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15836 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15837 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15838 }
15839
15840 /* Emit a register copy from operand to operand, taking care not to
15841 early-clobber source registers in the process.
15842
15843 COUNT is the number of components into which the copy needs to be
15844 decomposed. */
15845 void
15846 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15847 unsigned int count)
15848 {
15849 unsigned int i;
15850 int rdest = REGNO (operands[0]);
15851 int rsrc = REGNO (operands[1]);
15852
15853 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15854 || rdest < rsrc)
15855 for (i = 0; i < count; i++)
15856 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15857 gen_rtx_REG (mode, rsrc + i));
15858 else
15859 for (i = 0; i < count; i++)
15860 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15861 gen_rtx_REG (mode, rsrc + count - i - 1));
15862 }
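/* For instance (an illustrative case, not from the sources): moving a
   two-register group from {v1, v2} to {v2, v3} overlaps with rdest > rsrc,
   so the loop above copies v2->v3 first and then v1->v2, avoiding an early
   clobber of v2.  Non-overlapping moves, or moves with rdest < rsrc, copy
   in ascending order.  */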
15863
15864 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15865 one of the VSTRUCT modes: OI, CI, or XI. */
15866 int
15867 aarch64_simd_attr_length_rglist (machine_mode mode)
15868 {
15869 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15870 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15871 }
15872
15873 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15874 alignment of a vector to 128 bits. SVE predicates have an alignment of
15875 16 bits. */
15876 static HOST_WIDE_INT
15877 aarch64_simd_vector_alignment (const_tree type)
15878 {
15879 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15880 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15881 be set for non-predicate vectors of booleans. Modes are the most
15882 direct way we have of identifying real SVE predicate types. */
15883 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15884 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15885 }
15886
15887 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15888 static poly_uint64
15889 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15890 {
15891 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15892 {
15893 /* If the length of the vector is fixed, try to align to that length,
15894 otherwise don't try to align at all. */
15895 HOST_WIDE_INT result;
15896 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15897 result = TYPE_ALIGN (TREE_TYPE (type));
15898 return result;
15899 }
15900 return TYPE_ALIGN (type);
15901 }
15902
15903 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15904 static bool
15905 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15906 {
15907 if (is_packed)
15908 return false;
15909
15910 /* For fixed-length vectors, check that the vectorizer will aim for
15911 full-vector alignment. This isn't true for generic GCC vectors
15912 that are wider than the ABI maximum of 128 bits. */
15913 poly_uint64 preferred_alignment =
15914 aarch64_vectorize_preferred_vector_alignment (type);
15915 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15916 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15917 preferred_alignment))
15918 return false;
15919
15920 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15921 return true;
15922 }
15923
15924 /* Return true if the vector misalignment factor is supported by the
15925 target. */
15926 static bool
15927 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15928 const_tree type, int misalignment,
15929 bool is_packed)
15930 {
15931 if (TARGET_SIMD && STRICT_ALIGNMENT)
15932 {
15933 /* Return false if the movmisalign pattern is not supported for this mode. */
15934 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15935 return false;
15936
15937 /* Misalignment factor is unknown at compile time. */
15938 if (misalignment == -1)
15939 return false;
15940 }
15941 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15942 is_packed);
15943 }
15944
15945 /* If VALS is a vector constant that can be loaded into a register
15946 using DUP, generate instructions to do so and return an RTX to
15947 assign to the register. Otherwise return NULL_RTX. */
15948 static rtx
15949 aarch64_simd_dup_constant (rtx vals)
15950 {
15951 machine_mode mode = GET_MODE (vals);
15952 machine_mode inner_mode = GET_MODE_INNER (mode);
15953 rtx x;
15954
15955 if (!const_vec_duplicate_p (vals, &x))
15956 return NULL_RTX;
15957
15958 /* We can load this constant by using DUP and a constant in a
15959 single general-purpose register. This will be cheaper than a vector
15960 load. */
15961 x = copy_to_mode_reg (inner_mode, x);
15962 return gen_vec_duplicate (mode, x);
15963 }
15964
15965
15966 /* Generate code to load VALS, which is a PARALLEL containing only
15967 constants (for vec_init) or CONST_VECTOR, efficiently into a
15968 register. Returns an RTX to copy into the register, or NULL_RTX
15969 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15970 static rtx
15971 aarch64_simd_make_constant (rtx vals)
15972 {
15973 machine_mode mode = GET_MODE (vals);
15974 rtx const_dup;
15975 rtx const_vec = NULL_RTX;
15976 int n_const = 0;
15977 int i;
15978
15979 if (GET_CODE (vals) == CONST_VECTOR)
15980 const_vec = vals;
15981 else if (GET_CODE (vals) == PARALLEL)
15982 {
15983 /* A CONST_VECTOR must contain only CONST_INTs and
15984 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15985 Only store valid constants in a CONST_VECTOR. */
15986 int n_elts = XVECLEN (vals, 0);
15987 for (i = 0; i < n_elts; ++i)
15988 {
15989 rtx x = XVECEXP (vals, 0, i);
15990 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15991 n_const++;
15992 }
15993 if (n_const == n_elts)
15994 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15995 }
15996 else
15997 gcc_unreachable ();
15998
15999 if (const_vec != NULL_RTX
16000 && aarch64_simd_valid_immediate (const_vec, NULL))
16001 /* Load using MOVI/MVNI. */
16002 return const_vec;
16003 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16004 /* Loaded using DUP. */
16005 return const_dup;
16006 else if (const_vec != NULL_RTX)
16007 /* Load from constant pool. We cannot take advantage of single-cycle
16008 LD1 because we need a PC-relative addressing mode. */
16009 return const_vec;
16010 else
16011 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16012 We cannot construct an initializer. */
16013 return NULL_RTX;
16014 }
16015
16016 /* Expand a vector initialisation sequence, such that TARGET is
16017 initialised to contain VALS. */
16018
16019 void
16020 aarch64_expand_vector_init (rtx target, rtx vals)
16021 {
16022 machine_mode mode = GET_MODE (target);
16023 scalar_mode inner_mode = GET_MODE_INNER (mode);
16024 /* The number of vector elements. */
16025 int n_elts = XVECLEN (vals, 0);
16026 /* The number of vector elements which are not constant. */
16027 int n_var = 0;
16028 rtx any_const = NULL_RTX;
16029 /* The first element of vals. */
16030 rtx v0 = XVECEXP (vals, 0, 0);
16031 bool all_same = true;
16032
16033 /* This is a special vec_init<M><N> where N is not an element mode but a
16034 vector mode with half the elements of M. We expect to find two entries
16035 of mode N in VALS and we must put their concatenation into TARGET. */
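/* For example (illustrative, assuming little-endian lane numbering):
a V4SImode TARGET can be built from two V2SImode entries LO and HI,
with LO filling lanes 0-1 and HI filling lanes 2-3:
TARGET = { LO[0], LO[1], HI[0], HI[1] }. */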
16036 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16037 {
16038 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16039 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16040 rtx lo = XVECEXP (vals, 0, 0);
16041 rtx hi = XVECEXP (vals, 0, 1);
16042 machine_mode narrow_mode = GET_MODE (lo);
16043 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16044 gcc_assert (narrow_mode == GET_MODE (hi));
16045
16046 /* When we want to concatenate a half-width vector with zeroes we can
16047 use the aarch64_combinez[_be] patterns. Just make sure that the
16048 zeroes are in the right half. */
16049 if (BYTES_BIG_ENDIAN
16050 && aarch64_simd_imm_zero (lo, narrow_mode)
16051 && general_operand (hi, narrow_mode))
16052 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16053 else if (!BYTES_BIG_ENDIAN
16054 && aarch64_simd_imm_zero (hi, narrow_mode)
16055 && general_operand (lo, narrow_mode))
16056 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16057 else
16058 {
16059 /* Else create the two half-width registers and combine them. */
16060 if (!REG_P (lo))
16061 lo = force_reg (GET_MODE (lo), lo);
16062 if (!REG_P (hi))
16063 hi = force_reg (GET_MODE (hi), hi);
16064
16065 if (BYTES_BIG_ENDIAN)
16066 std::swap (lo, hi);
16067 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16068 }
16069 return;
16070 }
16071
16072 /* Count the number of variable elements to initialise. */
16073 for (int i = 0; i < n_elts; ++i)
16074 {
16075 rtx x = XVECEXP (vals, 0, i);
16076 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16077 ++n_var;
16078 else
16079 any_const = x;
16080
16081 all_same &= rtx_equal_p (x, v0);
16082 }
16083
16084 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16085 how best to handle this. */
16086 if (n_var == 0)
16087 {
16088 rtx constant = aarch64_simd_make_constant (vals);
16089 if (constant != NULL_RTX)
16090 {
16091 emit_move_insn (target, constant);
16092 return;
16093 }
16094 }
16095
16096 /* Splat a single non-constant element if we can. */
16097 if (all_same)
16098 {
16099 rtx x = copy_to_mode_reg (inner_mode, v0);
16100 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16101 return;
16102 }
16103
16104 enum insn_code icode = optab_handler (vec_set_optab, mode);
16105 gcc_assert (icode != CODE_FOR_nothing);
16106
16107 /* If there are only variable elements, try to optimize
16108 the insertion using dup for the most common element
16109 followed by insertions. */
16110
16111 /* The algorithm will fill matches[*][0] with the earliest matching element,
16112 and matches[X][1] with the count of duplicate elements (if X is the
16113 earliest element which has duplicates). */
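/* A rough worked example: for VALS = { x, y, x, x } the loops below give
matches[0] = { 0, 3 }, matches[1] = { 1, 1 } and matches[2] = matches[3]
= { 0, 0 }, so x is duplicated first and only y needs a separate
insertion. */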
16114
16115 if (n_var == n_elts && n_elts <= 16)
16116 {
16117 int matches[16][2] = {0};
16118 for (int i = 0; i < n_elts; i++)
16119 {
16120 for (int j = 0; j <= i; j++)
16121 {
16122 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16123 {
16124 matches[i][0] = j;
16125 matches[j][1]++;
16126 break;
16127 }
16128 }
16129 }
16130 int maxelement = 0;
16131 int maxv = 0;
16132 for (int i = 0; i < n_elts; i++)
16133 if (matches[i][1] > maxv)
16134 {
16135 maxelement = i;
16136 maxv = matches[i][1];
16137 }
16138
16139 /* Create a duplicate of the most common element, unless all elements
16140 are equally useless to us, in which case just immediately set the
16141 vector register using the first element. */
16142
16143 if (maxv == 1)
16144 {
16145 /* For vectors of two 64-bit elements, we can do even better. */
16146 if (n_elts == 2
16147 && (inner_mode == E_DImode
16148 || inner_mode == E_DFmode))
16149
16150 {
16151 rtx x0 = XVECEXP (vals, 0, 0);
16152 rtx x1 = XVECEXP (vals, 0, 1);
16153 /* Combine can pick up this case, but handling it directly
16154 here leaves clearer RTL.
16155
16156 This is load_pair_lanes<mode>, and also gives us a clean-up
16157 for store_pair_lanes<mode>. */
16158 if (memory_operand (x0, inner_mode)
16159 && memory_operand (x1, inner_mode)
16160 && !STRICT_ALIGNMENT
16161 && rtx_equal_p (XEXP (x1, 0),
16162 plus_constant (Pmode,
16163 XEXP (x0, 0),
16164 GET_MODE_SIZE (inner_mode))))
16165 {
16166 rtx t;
16167 if (inner_mode == DFmode)
16168 t = gen_load_pair_lanesdf (target, x0, x1);
16169 else
16170 t = gen_load_pair_lanesdi (target, x0, x1);
16171 emit_insn (t);
16172 return;
16173 }
16174 }
16175 /* The subreg-move sequence below will move into lane zero of the
16176 vector register. For big-endian we want that position to hold
16177 the last element of VALS. */
16178 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16179 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16180 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16181 }
16182 else
16183 {
16184 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16185 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16186 }
16187
16188 /* Insert the rest. */
16189 for (int i = 0; i < n_elts; i++)
16190 {
16191 rtx x = XVECEXP (vals, 0, i);
16192 if (matches[i][0] == maxelement)
16193 continue;
16194 x = copy_to_mode_reg (inner_mode, x);
16195 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16196 }
16197 return;
16198 }
16199
16200 /* Initialise a vector which is part-variable. We want to first try
16201 to build those lanes which are constant in the most efficient way we
16202 can. */
16203 if (n_var != n_elts)
16204 {
16205 rtx copy = copy_rtx (vals);
16206
16207 /* Load constant part of vector. We really don't care what goes into the
16208 parts we will overwrite, but we're more likely to be able to load the
16209 constant efficiently if it has fewer, larger, repeating parts
16210 (see aarch64_simd_valid_immediate). */
16211 for (int i = 0; i < n_elts; i++)
16212 {
16213 rtx x = XVECEXP (vals, 0, i);
16214 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16215 continue;
16216 rtx subst = any_const;
16217 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16218 {
16219 /* Look in the copied vector, as more elements are const. */
16220 rtx test = XVECEXP (copy, 0, i ^ bit);
16221 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16222 {
16223 subst = test;
16224 break;
16225 }
16226 }
16227 XVECEXP (copy, 0, i) = subst;
16228 }
16229 aarch64_expand_vector_init (target, copy);
16230 }
16231
16232 /* Insert the variable lanes directly. */
16233 for (int i = 0; i < n_elts; i++)
16234 {
16235 rtx x = XVECEXP (vals, 0, i);
16236 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16237 continue;
16238 x = copy_to_mode_reg (inner_mode, x);
16239 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16240 }
16241 }
16242
16243 /* Emit RTL corresponding to:
16244 insr TARGET, ELEM. */
16245
16246 static void
16247 emit_insr (rtx target, rtx elem)
16248 {
16249 machine_mode mode = GET_MODE (target);
16250 scalar_mode elem_mode = GET_MODE_INNER (mode);
16251 elem = force_reg (elem_mode, elem);
16252
16253 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16254 gcc_assert (icode != CODE_FOR_nothing);
16255 emit_insn (GEN_FCN (icode) (target, target, elem));
16256 }
16257
16258 /* Subroutine of aarch64_sve_expand_vector_init for handling
16259 trailing constants.
16260 This function works as follows:
16261 (a) Create a new vector consisting of trailing constants.
16262 (b) Initialize TARGET with the constant vector using emit_move_insn.
16263 (c) Insert remaining elements in TARGET using insr.
16264 NELTS is the total number of elements in the original vector, while
16265 NELTS_REQD is the number of elements that are actually
16266 significant.
16267
16268 ??? The heuristic used is to do the above only if the number of constants
16269 is at least half the total number of elements. May need fine-tuning. */
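/* A rough worked example: for { a, b, 1, 2 } (NELTS == NELTS_REQD == 4)
there are two trailing constants, which meets the threshold. TARGET is
first loaded with a constant vector whose low lanes are { 1, 2 }, then:
insr TARGET, b
insr TARGET, a
leaves TARGET = { a, b, 1, 2 }. */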
16270
16271 static bool
16272 aarch64_sve_expand_vector_init_handle_trailing_constants
16273 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16274 {
16275 machine_mode mode = GET_MODE (target);
16276 scalar_mode elem_mode = GET_MODE_INNER (mode);
16277 int n_trailing_constants = 0;
16278
16279 for (int i = nelts_reqd - 1;
16280 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16281 i--)
16282 n_trailing_constants++;
16283
16284 if (n_trailing_constants >= nelts_reqd / 2)
16285 {
16286 rtx_vector_builder v (mode, 1, nelts);
16287 for (int i = 0; i < nelts; i++)
16288 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16289 rtx const_vec = v.build ();
16290 emit_move_insn (target, const_vec);
16291
16292 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16293 emit_insr (target, builder.elt (i));
16294
16295 return true;
16296 }
16297
16298 return false;
16299 }
16300
16301 /* Subroutine of aarch64_sve_expand_vector_init.
16302 Works as follows:
16303 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16304 (b) Skip trailing elements from BUILDER, which are the same as
16305 element NELTS_REQD - 1.
16306 (c) Insert earlier elements in reverse order in TARGET using insr. */
16307
16308 static void
16309 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16310 const rtx_vector_builder &builder,
16311 int nelts_reqd)
16312 {
16313 machine_mode mode = GET_MODE (target);
16314 scalar_mode elem_mode = GET_MODE_INNER (mode);
16315
16316 struct expand_operand ops[2];
16317 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16318 gcc_assert (icode != CODE_FOR_nothing);
16319
16320 create_output_operand (&ops[0], target, mode);
16321 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16322 expand_insn (icode, 2, ops);
16323
16324 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16325 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16326 emit_insr (target, builder.elt (i));
16327 }
16328
16329 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16330 in which all trailing elements of BUILDER are the same.
16331 This works as follows:
16332 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16333 (b) Insert remaining elements in TARGET using insr.
16334
16335 ??? The heuristic used is to do the above if the number of identical
16336 trailing elements is at least 3/4 of the total number of elements,
16337 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
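/* A rough worked example: for { a, b, x, x, x, x, x, x } (NELTS_REQD == 8)
the trailing element is repeated 6 times, which meets the 3/4 threshold,
so the emitted sequence is roughly:
TARGET = dup (x)
insr TARGET, b
insr TARGET, a
giving { a, b, x, x, x, x, x, x }. */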
16338
16339 static bool
16340 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16341 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16342 {
16343 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16344 if (ndups >= (3 * nelts_reqd) / 4)
16345 {
16346 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16347 nelts_reqd - ndups + 1);
16348 return true;
16349 }
16350
16351 return false;
16352 }
16353
16354 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16355 of elements in BUILDER.
16356
16357 The function tries to initialize TARGET from BUILDER if it fits one
16358 of the special cases outlined below.
16359
16360 Failing that, the function divides BUILDER into two sub-vectors:
16361 v_even = even elements of BUILDER;
16362 v_odd = odd elements of BUILDER;
16363
16364 and recursively calls itself with v_even and v_odd.
16365
16366 if (recursive call succeeded for v_even or v_odd)
16367 TARGET = zip (v_even, v_odd)
16368
16369 The function returns true if it managed to build TARGET from BUILDER
16370 with one of the special cases, false otherwise.
16371
16372 Example: {a, 1, b, 2, c, 3, d, 4}
16373
16374 The vector gets divided into:
16375 v_even = {a, b, c, d}
16376 v_odd = {1, 2, 3, 4}
16377
16378 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16379 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16380
16381 aarch64_sve_expand_vector_init(v_even) fails since v_even contains no
16382 constant or repeated trailing elements, so we construct tmp1 using insr:
16383 tmp1 = dup(d)
16384 insr tmp1, c
16385 insr tmp1, b
16386 insr tmp1, a
16387
16388 And finally:
16389 TARGET = zip (tmp1, tmp2)
16390 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16391
16392 static bool
16393 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16394 int nelts, int nelts_reqd)
16395 {
16396 machine_mode mode = GET_MODE (target);
16397
16398 /* Case 1: Vector contains trailing constants. */
16399
16400 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16401 (target, builder, nelts, nelts_reqd))
16402 return true;
16403
16404 /* Case 2: Vector contains leading constants. */
16405
16406 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16407 for (int i = 0; i < nelts_reqd; i++)
16408 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16409 rev_builder.finalize ();
16410
16411 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16412 (target, rev_builder, nelts, nelts_reqd))
16413 {
16414 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16415 return true;
16416 }
16417
16418 /* Case 3: Vector contains trailing same element. */
16419
16420 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16421 (target, builder, nelts_reqd))
16422 return true;
16423
16424 /* Case 4: Vector contains leading same element. */
16425
16426 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16427 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16428 {
16429 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16430 return true;
16431 }
16432
16433 /* Avoid recursing below 4 elements.
16434 ??? The threshold 4 may need fine-tuning. */
16435
16436 if (nelts_reqd <= 4)
16437 return false;
16438
16439 rtx_vector_builder v_even (mode, 1, nelts);
16440 rtx_vector_builder v_odd (mode, 1, nelts);
16441
16442 for (int i = 0; i < nelts * 2; i += 2)
16443 {
16444 v_even.quick_push (builder.elt (i));
16445 v_odd.quick_push (builder.elt (i + 1));
16446 }
16447
16448 v_even.finalize ();
16449 v_odd.finalize ();
16450
16451 rtx tmp1 = gen_reg_rtx (mode);
16452 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16453 nelts, nelts_reqd / 2);
16454
16455 rtx tmp2 = gen_reg_rtx (mode);
16456 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16457 nelts, nelts_reqd / 2);
16458
16459 if (!did_even_p && !did_odd_p)
16460 return false;
16461
16462 /* Initialize whichever of v_even and v_odd did not match a special case
16463 using INSR, then zip v_even and v_odd. */
16464
16465 if (!did_even_p)
16466 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16467
16468 if (!did_odd_p)
16469 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16470
16471 rtvec v = gen_rtvec (2, tmp1, tmp2);
16472 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16473 return true;
16474 }
16475
16476 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16477
16478 void
16479 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16480 {
16481 machine_mode mode = GET_MODE (target);
16482 int nelts = XVECLEN (vals, 0);
16483
16484 rtx_vector_builder v (mode, 1, nelts);
16485 for (int i = 0; i < nelts; i++)
16486 v.quick_push (XVECEXP (vals, 0, i));
16487 v.finalize ();
16488
16489 /* If neither sub-vector of v could be initialized specially,
16490 then use INSR to insert all elements from v into TARGET.
16491 ??? This might not be optimal for vectors with large
16492 initializers of 16 elements or more.
16493 For nelts < 4, it probably isn't useful to handle specially. */
16494
16495 if (nelts < 4
16496 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16497 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16498 }
16499
16500 /* Check whether VALUE is a vector constant in which every element
16501 is either a power of 2 or a negated power of 2. If so, return
16502 a constant vector of log2s, and flip CODE between PLUS and MINUS
16503 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
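/* For example (illustrative): { 8, 8, 8, 8 } becomes { 3, 3, 3, 3 } with
CODE left unchanged, while { -4, -4, -4, -4 } becomes { 2, 2, 2, 2 } and
CODE is flipped between PLUS and MINUS. */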
16504
16505 static rtx
16506 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16507 {
16508 if (GET_CODE (value) != CONST_VECTOR)
16509 return NULL_RTX;
16510
16511 rtx_vector_builder builder;
16512 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16513 return NULL_RTX;
16514
16515 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16516 /* 1 if the result of the multiplication must be negated,
16517 0 if it mustn't, or -1 if we don't yet care. */
16518 int negate = -1;
16519 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16520 for (unsigned int i = 0; i < encoded_nelts; ++i)
16521 {
16522 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16523 if (!CONST_SCALAR_INT_P (elt))
16524 return NULL_RTX;
16525 rtx_mode_t val (elt, int_mode);
16526 wide_int pow2 = wi::neg (val);
16527 if (val != pow2)
16528 {
16529 /* It matters whether we negate or not. Make that choice,
16530 and make sure that it's consistent with previous elements. */
16531 if (negate == !wi::neg_p (val))
16532 return NULL_RTX;
16533 negate = wi::neg_p (val);
16534 if (!negate)
16535 pow2 = val;
16536 }
16537 /* POW2 is now the value that we want to be a power of 2. */
16538 int shift = wi::exact_log2 (pow2);
16539 if (shift < 0)
16540 return NULL_RTX;
16541 builder.quick_push (gen_int_mode (shift, int_mode));
16542 }
16543 if (negate == -1)
16544 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16545 code = PLUS;
16546 else if (negate == 1)
16547 code = code == PLUS ? MINUS : PLUS;
16548 return builder.build ();
16549 }
16550
16551 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16552 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16553 operands array, in the same order as for fma_optab. Return true if
16554 the function emitted all the necessary instructions, false if the caller
16555 should generate the pattern normally with the new OPERANDS array. */
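/* For example (illustrative): with CODE == PLUS and OPERANDS[2] a constant
vector of 4s, the multiply-add OPERANDS[1] * 4 + OPERANDS[3] is emitted
here as a shift followed by an add:
product = OPERANDS[1] << 2
OPERANDS[0] = OPERANDS[3] + product
rather than by keeping the multiplication. */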
16556
16557 bool
16558 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16559 {
16560 machine_mode mode = GET_MODE (operands[0]);
16561 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16562 {
16563 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16564 NULL_RTX, true, OPTAB_DIRECT);
16565 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16566 operands[3], product, operands[0], true,
16567 OPTAB_DIRECT);
16568 return true;
16569 }
16570 operands[2] = force_reg (mode, operands[2]);
16571 return false;
16572 }
16573
16574 /* Likewise, but for a conditional pattern. */
16575
16576 bool
16577 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16578 {
16579 machine_mode mode = GET_MODE (operands[0]);
16580 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16581 {
16582 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16583 NULL_RTX, true, OPTAB_DIRECT);
16584 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16585 operands[4], product, operands[5]));
16586 return true;
16587 }
16588 operands[3] = force_reg (mode, operands[3]);
16589 return false;
16590 }
16591
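/* Implement TARGET_SHIFT_TRUNCATION_MASK. Return 0 (no implicit
truncation of shift counts) for vector data modes or when
SHIFT_COUNT_TRUNCATED is false, otherwise the element width minus 1. */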
16592 static unsigned HOST_WIDE_INT
16593 aarch64_shift_truncation_mask (machine_mode mode)
16594 {
16595 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16596 return 0;
16597 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16598 }
16599
16600 /* Select a format to encode pointers in exception handling data. */
16601 int
16602 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16603 {
16604 int type;
16605 switch (aarch64_cmodel)
16606 {
16607 case AARCH64_CMODEL_TINY:
16608 case AARCH64_CMODEL_TINY_PIC:
16609 case AARCH64_CMODEL_SMALL:
16610 case AARCH64_CMODEL_SMALL_PIC:
16611 case AARCH64_CMODEL_SMALL_SPIC:
16612 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16613 for everything. */
16614 type = DW_EH_PE_sdata4;
16615 break;
16616 default:
16617 /* No assumptions here. 8-byte relocs required. */
16618 type = DW_EH_PE_sdata8;
16619 break;
16620 }
16621 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16622 }
16623
16624 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16625
16626 static void
16627 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16628 {
16629 if (aarch64_simd_decl_p (decl))
16630 {
16631 fprintf (stream, "\t.variant_pcs\t");
16632 assemble_name (stream, name);
16633 fprintf (stream, "\n");
16634 }
16635 }
16636
16637 /* The last .arch and .tune assembly strings that we printed. */
16638 static std::string aarch64_last_printed_arch_string;
16639 static std::string aarch64_last_printed_tune_string;
16640
16641 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16642 by the function fndecl. */
16643
16644 void
16645 aarch64_declare_function_name (FILE *stream, const char* name,
16646 tree fndecl)
16647 {
16648 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16649
16650 struct cl_target_option *targ_options;
16651 if (target_parts)
16652 targ_options = TREE_TARGET_OPTION (target_parts);
16653 else
16654 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16655 gcc_assert (targ_options);
16656
16657 const struct processor *this_arch
16658 = aarch64_get_arch (targ_options->x_explicit_arch);
16659
16660 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16661 std::string extension
16662 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16663 this_arch->flags);
16664 /* Only update the assembler .arch string if it is distinct from the last
16665 such string we printed. */
16666 std::string to_print = this_arch->name + extension;
16667 if (to_print != aarch64_last_printed_arch_string)
16668 {
16669 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16670 aarch64_last_printed_arch_string = to_print;
16671 }
16672
16673 /* Print the cpu name we're tuning for in the comments; it might be
16674 useful to readers of the generated asm. Do it only when it changes
16675 from function to function and verbose assembly is requested. */
16676 const struct processor *this_tune
16677 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16678
16679 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16680 {
16681 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16682 this_tune->name);
16683 aarch64_last_printed_tune_string = this_tune->name;
16684 }
16685
16686 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16687
16688 /* Don't forget the type directive for ELF. */
16689 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16690 ASM_OUTPUT_LABEL (stream, name);
16691 }
16692
16693 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16694
16695 void
16696 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16697 {
16698 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16699 const char *value = IDENTIFIER_POINTER (target);
16700 aarch64_asm_output_variant_pcs (stream, decl, name);
16701 ASM_OUTPUT_DEF (stream, name, value);
16702 }
16703
16704 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16705 function symbol references. */
16706
16707 void
16708 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16709 {
16710 default_elf_asm_output_external (stream, decl, name);
16711 aarch64_asm_output_variant_pcs (stream, decl, name);
16712 }
16713
16714 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16715 Used to output the .cfi_b_key_frame directive when signing the current
16716 function with the B key. */
16717
16718 void
16719 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16720 {
16721 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16722 && aarch64_ra_sign_key == AARCH64_KEY_B)
16723 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16724 }
16725
16726 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16727
16728 static void
16729 aarch64_start_file (void)
16730 {
16731 struct cl_target_option *default_options
16732 = TREE_TARGET_OPTION (target_option_default_node);
16733
16734 const struct processor *default_arch
16735 = aarch64_get_arch (default_options->x_explicit_arch);
16736 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16737 std::string extension
16738 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16739 default_arch->flags);
16740
16741 aarch64_last_printed_arch_string = default_arch->name + extension;
16742 aarch64_last_printed_tune_string = "";
16743 asm_fprintf (asm_out_file, "\t.arch %s\n",
16744 aarch64_last_printed_arch_string.c_str ());
16745
16746 default_file_start ();
16747 }
16748
16749 /* Emit load exclusive. */
16750
16751 static void
16752 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16753 rtx mem, rtx model_rtx)
16754 {
16755 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16756 }
16757
16758 /* Emit store exclusive. */
16759
16760 static void
16761 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16762 rtx mem, rtx rval, rtx model_rtx)
16763 {
16764 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16765 }
16766
16767 /* Mark the previous jump instruction as unlikely. */
16768
16769 static void
16770 aarch64_emit_unlikely_jump (rtx insn)
16771 {
16772 rtx_insn *jump = emit_jump_insn (insn);
16773 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16774 }
16775
16776 /* Expand a compare and swap pattern. */
16777
16778 void
16779 aarch64_expand_compare_and_swap (rtx operands[])
16780 {
16781 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16782 machine_mode mode, r_mode;
16783
16784 bval = operands[0];
16785 rval = operands[1];
16786 mem = operands[2];
16787 oldval = operands[3];
16788 newval = operands[4];
16789 is_weak = operands[5];
16790 mod_s = operands[6];
16791 mod_f = operands[7];
16792 mode = GET_MODE (mem);
16793
16794 /* Normally the succ memory model must be stronger than fail, but in the
16795 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16796 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16797 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16798 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16799 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16800
16801 r_mode = mode;
16802 if (mode == QImode || mode == HImode)
16803 {
16804 r_mode = SImode;
16805 rval = gen_reg_rtx (r_mode);
16806 }
16807
16808 if (TARGET_LSE)
16809 {
16810 /* The CAS insn requires oldval and rval overlap, but we need to
16811 have a copy of oldval saved across the operation to tell if
16812 the operation is successful. */
16813 if (reg_overlap_mentioned_p (rval, oldval))
16814 rval = copy_to_mode_reg (r_mode, oldval);
16815 else
16816 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16817
16818 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16819 newval, mod_s));
16820 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16821 }
16822 else
16823 {
16824 /* The oldval predicate varies by mode. Test it and force to reg. */
16825 insn_code code = code_for_aarch64_compare_and_swap (mode);
16826 if (!insn_data[code].operand[2].predicate (oldval, mode))
16827 oldval = force_reg (mode, oldval);
16828
16829 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16830 is_weak, mod_s, mod_f));
16831 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16832 }
16833
16834 if (r_mode != mode)
16835 rval = gen_lowpart (mode, rval);
16836 emit_move_insn (operands[1], rval);
16837
16838 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16839 emit_insn (gen_rtx_SET (bval, x));
16840 }
16841
16842 /* Emit a barrier appropriate for memory model MODEL at the end of a
16843 sequence implementing an atomic operation. */
16844
16845 static void
16846 aarch64_emit_post_barrier (enum memmodel model)
16847 {
16848 const enum memmodel base_model = memmodel_base (model);
16849
16850 if (is_mm_sync (model)
16851 && (base_model == MEMMODEL_ACQUIRE
16852 || base_model == MEMMODEL_ACQ_REL
16853 || base_model == MEMMODEL_SEQ_CST))
16854 {
16855 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16856 }
16857 }
16858
16859 /* Split a compare and swap pattern. */
16860
16861 void
16862 aarch64_split_compare_and_swap (rtx operands[])
16863 {
16864 rtx rval, mem, oldval, newval, scratch;
16865 machine_mode mode;
16866 bool is_weak;
16867 rtx_code_label *label1, *label2;
16868 rtx x, cond;
16869 enum memmodel model;
16870 rtx model_rtx;
16871
16872 rval = operands[0];
16873 mem = operands[1];
16874 oldval = operands[2];
16875 newval = operands[3];
16876 is_weak = (operands[4] != const0_rtx);
16877 model_rtx = operands[5];
16878 scratch = operands[7];
16879 mode = GET_MODE (mem);
16880 model = memmodel_from_int (INTVAL (model_rtx));
16881
16882 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16883 loop:
16884 .label1:
16885 LD[A]XR rval, [mem]
16886 CBNZ rval, .label2
16887 ST[L]XR scratch, newval, [mem]
16888 CBNZ scratch, .label1
16889 .label2:
16890 CMP rval, 0. */
16891 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16892
16893 label1 = NULL;
16894 if (!is_weak)
16895 {
16896 label1 = gen_label_rtx ();
16897 emit_label (label1);
16898 }
16899 label2 = gen_label_rtx ();
16900
16901 /* The initial load can be relaxed for a __sync operation since a final
16902 barrier will be emitted to stop code hoisting. */
16903 if (is_mm_sync (model))
16904 aarch64_emit_load_exclusive (mode, rval, mem,
16905 GEN_INT (MEMMODEL_RELAXED));
16906 else
16907 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16908
16909 if (strong_zero_p)
16910 {
16911 if (aarch64_track_speculation)
16912 {
16913 /* Emit an explicit compare instruction, so that we can correctly
16914 track the condition codes. */
16915 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16916 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16917 }
16918 else
16919 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16920
16921 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16922 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16923 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16924 }
16925 else
16926 {
16927 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16928 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16929 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16930 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16931 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16932 }
16933
16934 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16935
16936 if (!is_weak)
16937 {
16938 if (aarch64_track_speculation)
16939 {
16940 /* Emit an explicit compare instruction, so that we can correctly
16941 track the condition codes. */
16942 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16943 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16944 }
16945 else
16946 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16947
16948 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16949 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16950 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16951 }
16952 else
16953 {
16954 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16955 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16956 emit_insn (gen_rtx_SET (cond, x));
16957 }
16958
16959 emit_label (label2);
16960 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16961 to set the condition flags. If this is not used it will be removed by
16962 later passes. */
16963 if (strong_zero_p)
16964 {
16965 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16966 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16967 emit_insn (gen_rtx_SET (cond, x));
16968 }
16969 /* Emit any final barrier needed for a __sync operation. */
16970 if (is_mm_sync (model))
16971 aarch64_emit_post_barrier (model);
16972 }
16973
16974 /* Split an atomic operation. */
16975
16976 void
16977 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16978 rtx value, rtx model_rtx, rtx cond)
16979 {
16980 machine_mode mode = GET_MODE (mem);
16981 machine_mode wmode = (mode == DImode ? DImode : SImode);
16982 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16983 const bool is_sync = is_mm_sync (model);
16984 rtx_code_label *label;
16985 rtx x;
16986
16987 /* Split the atomic operation into a sequence. */
16988 label = gen_label_rtx ();
16989 emit_label (label);
16990
16991 if (new_out)
16992 new_out = gen_lowpart (wmode, new_out);
16993 if (old_out)
16994 old_out = gen_lowpart (wmode, old_out);
16995 else
16996 old_out = new_out;
16997 value = simplify_gen_subreg (wmode, value, mode, 0);
16998
16999 /* The initial load can be relaxed for a __sync operation since a final
17000 barrier will be emitted to stop code hoisting. */
17001 if (is_sync)
17002 aarch64_emit_load_exclusive (mode, old_out, mem,
17003 GEN_INT (MEMMODEL_RELAXED));
17004 else
17005 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17006
17007 switch (code)
17008 {
17009 case SET:
17010 new_out = value;
17011 break;
17012
17013 case NOT:
17014 x = gen_rtx_AND (wmode, old_out, value);
17015 emit_insn (gen_rtx_SET (new_out, x));
17016 x = gen_rtx_NOT (wmode, new_out);
17017 emit_insn (gen_rtx_SET (new_out, x));
17018 break;
17019
17020 case MINUS:
17021 if (CONST_INT_P (value))
17022 {
17023 value = GEN_INT (-INTVAL (value));
17024 code = PLUS;
17025 }
17026 /* Fall through. */
17027
17028 default:
17029 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17030 emit_insn (gen_rtx_SET (new_out, x));
17031 break;
17032 }
17033
17034 aarch64_emit_store_exclusive (mode, cond, mem,
17035 gen_lowpart (mode, new_out), model_rtx);
17036
17037 if (aarch64_track_speculation)
17038 {
17039 /* Emit an explicit compare instruction, so that we can correctly
17040 track the condition codes. */
17041 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17042 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17043 }
17044 else
17045 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17046
17047 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17048 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17049 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17050
17051 /* Emit any final barrier needed for a __sync operation. */
17052 if (is_sync)
17053 aarch64_emit_post_barrier (model);
17054 }
17055
17056 static void
17057 aarch64_init_libfuncs (void)
17058 {
17059 /* Half-precision float operations. The compiler handles all operations
17060 with NULL libfuncs by converting to SFmode. */
17061
17062 /* Conversions. */
17063 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17064 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17065
17066 /* Arithmetic. */
17067 set_optab_libfunc (add_optab, HFmode, NULL);
17068 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17069 set_optab_libfunc (smul_optab, HFmode, NULL);
17070 set_optab_libfunc (neg_optab, HFmode, NULL);
17071 set_optab_libfunc (sub_optab, HFmode, NULL);
17072
17073 /* Comparisons. */
17074 set_optab_libfunc (eq_optab, HFmode, NULL);
17075 set_optab_libfunc (ne_optab, HFmode, NULL);
17076 set_optab_libfunc (lt_optab, HFmode, NULL);
17077 set_optab_libfunc (le_optab, HFmode, NULL);
17078 set_optab_libfunc (ge_optab, HFmode, NULL);
17079 set_optab_libfunc (gt_optab, HFmode, NULL);
17080 set_optab_libfunc (unord_optab, HFmode, NULL);
17081 }
17082
17083 /* Target hook for c_mode_for_suffix. */
17084 static machine_mode
17085 aarch64_c_mode_for_suffix (char suffix)
17086 {
17087 if (suffix == 'q')
17088 return TFmode;
17089
17090 return VOIDmode;
17091 }
17092
17093 /* We can only represent floating point constants which will fit in
17094 "quarter-precision" values. These values are characterised by
17095 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17096 by:
17097
17098 (-1)^s * (n/16) * 2^r
17099
17100 Where:
17101 's' is the sign bit.
17102 'n' is an integer in the range 16 <= n <= 31.
17103 'r' is an integer in the range -3 <= r <= 4. */
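/* Some representable examples (illustrative):
1.0 = (-1)^0 * (16/16) * 2^0
-0.625 = (-1)^1 * (20/16) * 2^-1
31.0 = (-1)^0 * (31/16) * 2^4 (the largest magnitude)
0.125 = (-1)^0 * (16/16) * 2^-3 (the smallest nonzero magnitude). */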
17104
17105 /* Return true iff X can be represented as a quarter-precision
17106 floating point immediate operand. Note, we cannot represent 0.0. */
17107 bool
17108 aarch64_float_const_representable_p (rtx x)
17109 {
17110 /* This represents our current view of how many bits
17111 make up the mantissa. */
17112 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17113 int exponent;
17114 unsigned HOST_WIDE_INT mantissa, mask;
17115 REAL_VALUE_TYPE r, m;
17116 bool fail;
17117
17118 x = unwrap_const_vec_duplicate (x);
17119 if (!CONST_DOUBLE_P (x))
17120 return false;
17121
17122 if (GET_MODE (x) == VOIDmode
17123 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17124 return false;
17125
17126 r = *CONST_DOUBLE_REAL_VALUE (x);
17127
17128 /* We cannot represent infinities, NaNs or +/-zero. We won't
17129 know if we have +zero until we analyse the mantissa, but we
17130 can reject the other invalid values. */
17131 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17132 || REAL_VALUE_MINUS_ZERO (r))
17133 return false;
17134
17135 /* Extract exponent. */
17136 r = real_value_abs (&r);
17137 exponent = REAL_EXP (&r);
17138
17139 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17140 highest (sign) bit, with a fixed binary point at bit point_pos.
17141 Element 0 of w holds the low part of the mantissa, element 1 the high part.
17142 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17143 bits for the mantissa, this can fail (low bits will be lost). */
17144 real_ldexp (&m, &r, point_pos - exponent);
17145 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17146
17147 /* If the low part of the mantissa has bits set we cannot represent
17148 the value. */
17149 if (w.ulow () != 0)
17150 return false;
17151 /* We have rejected the lower HOST_WIDE_INT, so update our
17152 understanding of how many bits lie in the mantissa and
17153 look only at the high HOST_WIDE_INT. */
17154 mantissa = w.elt (1);
17155 point_pos -= HOST_BITS_PER_WIDE_INT;
17156
17157 /* We can only represent values with a mantissa of the form 1.xxxx. */
17158 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17159 if ((mantissa & mask) != 0)
17160 return false;
17161
17162 /* Having filtered unrepresentable values, we may now remove all
17163 but the highest 5 bits. */
17164 mantissa >>= point_pos - 5;
17165
17166 /* We cannot represent the value 0.0, so reject it. This is handled
17167 elsewhere. */
17168 if (mantissa == 0)
17169 return false;
17170
17171 /* Then, as bit 4 is always set, we can mask it off, leaving
17172 the mantissa in the range [0, 15]. */
17173 mantissa &= ~(1 << 4);
17174 gcc_assert (mantissa <= 15);
17175
17176 /* GCC internally does not use IEEE754-like encoding (where normalized
17177 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
17178 Our mantissa values are shifted 4 places to the left relative to
17179 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17180 by 5 places to correct for GCC's representation. */
17181 exponent = 5 - exponent;
17182
17183 return (exponent >= 0 && exponent <= 7);
17184 }
17185
17186 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17187 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17188 output MOVI/MVNI, ORR or BIC immediate. */
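/* For example (illustrative): a V4SImode vector with every element equal
to 256 would typically be output for AARCH64_CHECK_MOV as
movi %0.4s, 0x1, lsl 8
before operand substitution. */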
17189 char*
17190 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17191 enum simd_immediate_check which)
17192 {
17193 bool is_valid;
17194 static char templ[40];
17195 const char *mnemonic;
17196 const char *shift_op;
17197 unsigned int lane_count = 0;
17198 char element_char;
17199
17200 struct simd_immediate_info info;
17201
17202 /* This will return true to show const_vector is legal for use as either
17203 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17204 It will also update INFO to show how the immediate should be generated.
17205 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17206 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17207 gcc_assert (is_valid);
17208
17209 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17210 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17211
17212 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17213 {
17214 gcc_assert (info.insn == simd_immediate_info::MOV
17215 && info.u.mov.shift == 0);
17216 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17217 move immediate path. */
17218 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17219 info.u.mov.value = GEN_INT (0);
17220 else
17221 {
17222 const unsigned int buf_size = 20;
17223 char float_buf[buf_size] = {'\0'};
17224 real_to_decimal_for_mode (float_buf,
17225 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17226 buf_size, buf_size, 1, info.elt_mode);
17227
17228 if (lane_count == 1)
17229 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17230 else
17231 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17232 lane_count, element_char, float_buf);
17233 return templ;
17234 }
17235 }
17236
17237 gcc_assert (CONST_INT_P (info.u.mov.value));
17238
17239 if (which == AARCH64_CHECK_MOV)
17240 {
17241 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17242 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17243 ? "msl" : "lsl");
17244 if (lane_count == 1)
17245 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17246 mnemonic, UINTVAL (info.u.mov.value));
17247 else if (info.u.mov.shift)
17248 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17249 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17250 element_char, UINTVAL (info.u.mov.value), shift_op,
17251 info.u.mov.shift);
17252 else
17253 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17254 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17255 element_char, UINTVAL (info.u.mov.value));
17256 }
17257 else
17258 {
17259 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17260 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17261 if (info.u.mov.shift)
17262 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17263 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17264 element_char, UINTVAL (info.u.mov.value), "lsl",
17265 info.u.mov.shift);
17266 else
17267 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17268 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17269 element_char, UINTVAL (info.u.mov.value));
17270 }
17271 return templ;
17272 }
17273
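/* Return the asm template for moving the scalar immediate IMMEDIATE, of
scalar integer mode MODE, into an Advanced SIMD register, by treating it
as a vector duplicate of the value. */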
17274 char*
17275 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17276 {
17277
17278 /* If a floating point number was passed and we desire to use it in an
17279 integer mode, do the conversion to integer. */
17280 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17281 {
17282 unsigned HOST_WIDE_INT ival;
17283 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17284 gcc_unreachable ();
17285 immediate = gen_int_mode (ival, mode);
17286 }
17287
17288 machine_mode vmode;
17289 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
17290 a 128-bit vector mode. */
17291 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17292
17293 vmode = aarch64_simd_container_mode (mode, width);
17294 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17295 return aarch64_output_simd_mov_immediate (v_op, width);
17296 }
17297
17298 /* Return the output string to use for moving immediate CONST_VECTOR
17299 into an SVE register. */
17300
17301 char *
17302 aarch64_output_sve_mov_immediate (rtx const_vector)
17303 {
17304 static char templ[40];
17305 struct simd_immediate_info info;
17306 char element_char;
17307
17308 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17309 gcc_assert (is_valid);
17310
17311 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17312
17313 machine_mode vec_mode = GET_MODE (const_vector);
17314 if (aarch64_sve_pred_mode_p (vec_mode))
17315 {
17316 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17317 if (info.insn == simd_immediate_info::MOV)
17318 {
17319 gcc_assert (info.u.mov.value == const0_rtx);
17320 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17321 }
17322 else
17323 {
17324 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17325 unsigned int total_bytes;
17326 if (info.u.pattern == AARCH64_SV_ALL
17327 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17328 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17329 total_bytes / GET_MODE_SIZE (info.elt_mode));
17330 else
17331 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17332 svpattern_token (info.u.pattern));
17333 }
17334 return buf;
17335 }
17336
17337 if (info.insn == simd_immediate_info::INDEX)
17338 {
17339 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17340 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17341 element_char, INTVAL (info.u.index.base),
17342 INTVAL (info.u.index.step));
17343 return templ;
17344 }
17345
17346 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17347 {
17348 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17349 info.u.mov.value = GEN_INT (0);
17350 else
17351 {
17352 const int buf_size = 20;
17353 char float_buf[buf_size] = {};
17354 real_to_decimal_for_mode (float_buf,
17355 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17356 buf_size, buf_size, 1, info.elt_mode);
17357
17358 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17359 element_char, float_buf);
17360 return templ;
17361 }
17362 }
17363
17364 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17365 element_char, INTVAL (info.u.mov.value));
17366 return templ;
17367 }
17368
17369 /* Split operands into moves from op[1] + op[2] into op[0]. */
17370
17371 void
17372 aarch64_split_combinev16qi (rtx operands[3])
17373 {
17374 unsigned int dest = REGNO (operands[0]);
17375 unsigned int src1 = REGNO (operands[1]);
17376 unsigned int src2 = REGNO (operands[2]);
17377 machine_mode halfmode = GET_MODE (operands[1]);
17378 unsigned int halfregs = REG_NREGS (operands[1]);
17379 rtx destlo, desthi;
17380
17381 gcc_assert (halfmode == V16QImode);
17382
17383 if (src1 == dest && src2 == dest + halfregs)
17384 {
17385 /* No-op move. Can't split to nothing; emit something. */
17386 emit_note (NOTE_INSN_DELETED);
17387 return;
17388 }
17389
17390 /* Preserve register attributes for variable tracking. */
17391 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17392 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17393 GET_MODE_SIZE (halfmode));
17394
17395 /* Special case of reversed high/low parts. */
17396 if (reg_overlap_mentioned_p (operands[2], destlo)
17397 && reg_overlap_mentioned_p (operands[1], desthi))
17398 {
17399 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17400 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17401 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17402 }
17403 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17404 {
17405 /* Try to avoid unnecessary moves if part of the result
17406 is in the right place already. */
17407 if (src1 != dest)
17408 emit_move_insn (destlo, operands[1]);
17409 if (src2 != dest + halfregs)
17410 emit_move_insn (desthi, operands[2]);
17411 }
17412 else
17413 {
17414 if (src2 != dest + halfregs)
17415 emit_move_insn (desthi, operands[2]);
17416 if (src1 != dest)
17417 emit_move_insn (destlo, operands[1]);
17418 }
17419 }
17420
17421 /* vec_perm support. */
17422
17423 struct expand_vec_perm_d
17424 {
17425 rtx target, op0, op1;
17426 vec_perm_indices perm;
17427 machine_mode vmode;
17428 unsigned int vec_flags;
17429 bool one_vector_p;
17430 bool testing_p;
17431 };
17432
17433 /* Generate a variable permutation. */
17434
17435 static void
17436 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17437 {
17438 machine_mode vmode = GET_MODE (target);
17439 bool one_vector_p = rtx_equal_p (op0, op1);
17440
17441 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17442 gcc_checking_assert (GET_MODE (op0) == vmode);
17443 gcc_checking_assert (GET_MODE (op1) == vmode);
17444 gcc_checking_assert (GET_MODE (sel) == vmode);
17445 gcc_checking_assert (TARGET_SIMD);
17446
17447 if (one_vector_p)
17448 {
17449 if (vmode == V8QImode)
17450 {
17451 /* Expand the argument to a V16QI mode by duplicating it. */
17452 rtx pair = gen_reg_rtx (V16QImode);
17453 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17454 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17455 }
17456 else
17457 {
17458 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17459 }
17460 }
17461 else
17462 {
17463 rtx pair;
17464
17465 if (vmode == V8QImode)
17466 {
17467 pair = gen_reg_rtx (V16QImode);
17468 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17469 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17470 }
17471 else
17472 {
17473 pair = gen_reg_rtx (OImode);
17474 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17475 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17476 }
17477 }
17478 }
17479
17480 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17481 NELT is the number of elements in the vector. */
17482
17483 void
17484 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17485 unsigned int nelt)
17486 {
17487 machine_mode vmode = GET_MODE (target);
17488 bool one_vector_p = rtx_equal_p (op0, op1);
17489 rtx mask;
17490
17491 /* The TBL instruction does not use a modulo index, so we must take care
17492 of that ourselves. */
17493 mask = aarch64_simd_gen_const_vector_dup (vmode,
17494 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17495 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17496
17497 /* For big-endian, we also need to reverse the index within the vector
17498 (but not which vector). */
17499 if (BYTES_BIG_ENDIAN)
17500 {
17501 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17502 if (!one_vector_p)
17503 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17504 sel = expand_simple_binop (vmode, XOR, sel, mask,
17505 NULL, 0, OPTAB_LIB_WIDEN);
17506 }
17507 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17508 }
17509
17510 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17511
17512 static void
17513 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17514 {
17515 emit_insn (gen_rtx_SET (target,
17516 gen_rtx_UNSPEC (GET_MODE (target),
17517 gen_rtvec (2, op0, op1), code)));
17518 }
17519
17520 /* Expand an SVE vec_perm with the given operands. */
17521
17522 void
17523 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17524 {
17525 machine_mode data_mode = GET_MODE (target);
17526 machine_mode sel_mode = GET_MODE (sel);
17527 /* Enforced by the pattern condition. */
17528 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17529
17530 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17531 size of the two value vectors, i.e. the upper bits of the indices
17532 are effectively ignored. SVE TBL instead produces 0 for any
17533 out-of-range indices, so we need to wrap all the vec_perm indices
17534 ourselves to ensure they are all in range. */
17535 rtx sel_reg = force_reg (sel_mode, sel);
17536
17537 /* Check if the sel only references the first values vector. */
17538 if (GET_CODE (sel) == CONST_VECTOR
17539 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17540 {
17541 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17542 return;
17543 }
17544
17545 /* Check if the two values vectors are the same. */
17546 if (rtx_equal_p (op0, op1))
17547 {
17548 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17549 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17550 NULL, 0, OPTAB_DIRECT);
17551 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17552 return;
17553 }
17554
17555 /* Run TBL on each value vector and combine the results. */
17556
17557 rtx res0 = gen_reg_rtx (data_mode);
17558 rtx res1 = gen_reg_rtx (data_mode);
17559 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17560 if (GET_CODE (sel) != CONST_VECTOR
17561 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17562 {
17563 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17564 2 * nunits - 1);
17565 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17566 NULL, 0, OPTAB_DIRECT);
17567 }
17568 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17569 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17570 NULL, 0, OPTAB_DIRECT);
17571 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17572 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17573 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17574 else
17575 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17576 }
17577
17578 /* Recognize patterns suitable for the TRN instructions. */
17579 static bool
17580 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17581 {
17582 HOST_WIDE_INT odd;
17583 poly_uint64 nelt = d->perm.length ();
17584 rtx out, in0, in1, x;
17585 machine_mode vmode = d->vmode;
17586
17587 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17588 return false;
17589
17590 /* Note that these are little-endian tests.
17591 We correct for big-endian later. */
17592 if (!d->perm[0].is_constant (&odd)
17593 || (odd != 0 && odd != 1)
17594 || !d->perm.series_p (0, 2, odd, 2)
17595 || !d->perm.series_p (1, 2, nelt + odd, 2))
17596 return false;
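/* For example, with nelt == 4, odd == 0 matches the permutation
   { 0, 4, 2, 6 } (TRN1) and odd == 1 matches { 1, 5, 3, 7 } (TRN2),
   before any big-endian correction.  */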
17597
17598 /* Success! */
17599 if (d->testing_p)
17600 return true;
17601
17602 in0 = d->op0;
17603 in1 = d->op1;
17604 /* We don't need a big-endian lane correction for SVE; see the comment
17605 at the head of aarch64-sve.md for details. */
17606 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17607 {
17608 x = in0, in0 = in1, in1 = x;
17609 odd = !odd;
17610 }
17611 out = d->target;
17612
17613 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17614 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17615 return true;
17616 }
17617
17618 /* Recognize patterns suitable for the UZP instructions. */
17619 static bool
17620 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17621 {
17622 HOST_WIDE_INT odd;
17623 rtx out, in0, in1, x;
17624 machine_mode vmode = d->vmode;
17625
17626 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17627 return false;
17628
17629 /* Note that these are little-endian tests.
17630 We correct for big-endian later. */
17631 if (!d->perm[0].is_constant (&odd)
17632 || (odd != 0 && odd != 1)
17633 || !d->perm.series_p (0, 1, odd, 2))
17634 return false;
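/* For example, with nelt == 4, odd == 0 matches { 0, 2, 4, 6 } (UZP1)
   and odd == 1 matches { 1, 3, 5, 7 } (UZP2), before any big-endian
   correction.  */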
17635
17636 /* Success! */
17637 if (d->testing_p)
17638 return true;
17639
17640 in0 = d->op0;
17641 in1 = d->op1;
17642 /* We don't need a big-endian lane correction for SVE; see the comment
17643 at the head of aarch64-sve.md for details. */
17644 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17645 {
17646 x = in0, in0 = in1, in1 = x;
17647 odd = !odd;
17648 }
17649 out = d->target;
17650
17651 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17652 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17653 return true;
17654 }
17655
17656 /* Recognize patterns suitable for the ZIP instructions. */
17657 static bool
17658 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17659 {
17660 unsigned int high;
17661 poly_uint64 nelt = d->perm.length ();
17662 rtx out, in0, in1, x;
17663 machine_mode vmode = d->vmode;
17664
17665 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17666 return false;
17667
17668 /* Note that these are little-endian tests.
17669 We correct for big-endian later. */
17670 poly_uint64 first = d->perm[0];
17671 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17672 || !d->perm.series_p (0, 2, first, 1)
17673 || !d->perm.series_p (1, 2, first + nelt, 1))
17674 return false;
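/* For example, with nelt == 4, first == 0 matches { 0, 4, 1, 5 } (ZIP1)
   and first == 2 matches { 2, 6, 3, 7 } (ZIP2), before any big-endian
   correction.  */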
17675 high = maybe_ne (first, 0U);
17676
17677 /* Success! */
17678 if (d->testing_p)
17679 return true;
17680
17681 in0 = d->op0;
17682 in1 = d->op1;
17683 /* We don't need a big-endian lane correction for SVE; see the comment
17684 at the head of aarch64-sve.md for details. */
17685 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17686 {
17687 x = in0, in0 = in1, in1 = x;
17688 high = !high;
17689 }
17690 out = d->target;
17691
17692 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17693 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17694 return true;
17695 }
17696
17697 /* Recognize patterns for the EXT insn. */
17698
17699 static bool
17700 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17701 {
17702 HOST_WIDE_INT location;
17703 rtx offset;
17704
17705 /* The first element always refers to the first vector.
17706 Check if the extracted indices are increasing by one. */
17707 if (d->vec_flags == VEC_SVE_PRED
17708 || !d->perm[0].is_constant (&location)
17709 || !d->perm.series_p (0, 1, location, 1))
17710 return false;
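/* For example, with nelt == 4 the permutation { 1, 2, 3, 4 } matches with
   LOCATION == 1: the last three elements of OP0 followed by the first
   element of OP1.  */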
17711
17712 /* Success! */
17713 if (d->testing_p)
17714 return true;
17715
17716 /* The case where (location == 0) is a no-op for both big- and little-endian,
17717 and is removed by the mid-end at optimization levels -O1 and higher.
17718
17719 We don't need a big-endian lane correction for SVE; see the comment
17720 at the head of aarch64-sve.md for details. */
17721 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17722 {
17723 /* After setup, we want the high elements of the first vector (stored
17724 at the LSB end of the register), and the low elements of the second
17725 vector (stored at the MSB end of the register). So swap. */
17726 std::swap (d->op0, d->op1);
17727 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17728 to_constant () is safe since this is restricted to Advanced SIMD
17729 vectors. */
17730 location = d->perm.length ().to_constant () - location;
17731 }
17732
17733 offset = GEN_INT (location);
17734 emit_set_insn (d->target,
17735 gen_rtx_UNSPEC (d->vmode,
17736 gen_rtvec (3, d->op0, d->op1, offset),
17737 UNSPEC_EXT));
17738 return true;
17739 }
17740
17741 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17742 within each 64-bit, 32-bit or 16-bit granule. */
17743
17744 static bool
17745 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17746 {
17747 HOST_WIDE_INT diff;
17748 unsigned int i, size, unspec;
17749 machine_mode pred_mode;
17750
17751 if (d->vec_flags == VEC_SVE_PRED
17752 || !d->one_vector_p
17753 || !d->perm[0].is_constant (&diff))
17754 return false;
17755
17756 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17757 if (size == 8)
17758 {
17759 unspec = UNSPEC_REV64;
17760 pred_mode = VNx2BImode;
17761 }
17762 else if (size == 4)
17763 {
17764 unspec = UNSPEC_REV32;
17765 pred_mode = VNx4BImode;
17766 }
17767 else if (size == 2)
17768 {
17769 unspec = UNSPEC_REV16;
17770 pred_mode = VNx8BImode;
17771 }
17772 else
17773 return false;
17774
17775 unsigned int step = diff + 1;
17776 for (i = 0; i < step; ++i)
17777 if (!d->perm.series_p (i, step, diff - i, step))
17778 return false;
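/* For example, for a vector of 4-byte elements, diff == 1 gives size == 8
   and hence REV64, matching permutations of the form { 1, 0, 3, 2, ... };
   for 1-byte elements, diff == 1 gives size == 2 and hence REV16.  */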
17779
17780 /* Success! */
17781 if (d->testing_p)
17782 return true;
17783
17784 if (d->vec_flags == VEC_SVE_DATA)
17785 {
17786 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17787 rtx target = gen_reg_rtx (int_mode);
17788 if (BYTES_BIG_ENDIAN)
17789 /* The act of taking a subreg between INT_MODE and d->vmode
17790 is itself a reversing operation on big-endian targets;
17791 see the comment at the head of aarch64-sve.md for details.
17792 First reinterpret OP0 as INT_MODE without using a subreg
17793 and without changing the contents. */
17794 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17795 else
17796 {
17797 /* For SVE we use REV[BHW] unspecs derived from the element size
17798 of d->vmode and vector modes whose elements have SIZE bytes.
17799 This ensures that the vector modes match the predicate modes. */
17800 int unspec = aarch64_sve_rev_unspec (d->vmode);
17801 rtx pred = aarch64_ptrue_reg (pred_mode);
17802 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17803 gen_lowpart (int_mode, d->op0)));
17804 }
17805 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17806 return true;
17807 }
17808 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17809 emit_set_insn (d->target, src);
17810 return true;
17811 }
17812
17813 /* Recognize patterns for the REV insn, which reverses elements within
17814 a full vector. */
17815
17816 static bool
17817 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17818 {
17819 poly_uint64 nelt = d->perm.length ();
17820
17821 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17822 return false;
17823
17824 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17825 return false;
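/* For example, with 8 elements this matches { 7, 6, 5, 4, 3, 2, 1, 0 }.  */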
17826
17827 /* Success! */
17828 if (d->testing_p)
17829 return true;
17830
17831 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17832 emit_set_insn (d->target, src);
17833 return true;
17834 }
17835
17836 static bool
17837 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17838 {
17839 rtx out = d->target;
17840 rtx in0;
17841 HOST_WIDE_INT elt;
17842 machine_mode vmode = d->vmode;
17843 rtx lane;
17844
17845 if (d->vec_flags == VEC_SVE_PRED
17846 || d->perm.encoding ().encoded_nelts () != 1
17847 || !d->perm[0].is_constant (&elt))
17848 return false;
17849
17850 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17851 return false;
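/* For example, the constant permutation { 2, 2, 2, ... } (a single
   repeated index) becomes a duplicate of lane 2 of OP0.  */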
17852
17853 /* Success! */
17854 if (d->testing_p)
17855 return true;
17856
17857 /* The generic preparation in aarch64_expand_vec_perm_const_1
17858 swaps the operand order and the permute indices if it finds
17859 d->perm[0] to be in the second operand. Thus, we can always
17860 use d->op0 and need not do any extra arithmetic to get the
17861 correct lane number. */
17862 in0 = d->op0;
17863 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17864
17865 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17866 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17867 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17868 return true;
17869 }
17870
17871 static bool
17872 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17873 {
17874 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17875 machine_mode vmode = d->vmode;
17876
17877 /* Make sure that the indices are constant. */
17878 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17879 for (unsigned int i = 0; i < encoded_nelts; ++i)
17880 if (!d->perm[i].is_constant ())
17881 return false;
17882
17883 if (d->testing_p)
17884 return true;
17885
17886 /* Generic code will try constant permutation twice. Once with the
17887 original mode and again with the elements lowered to QImode.
17888 So wait and don't do the selector expansion ourselves. */
17889 if (vmode != V8QImode && vmode != V16QImode)
17890 return false;
17891
17892 /* to_constant is safe since this routine is specific to Advanced SIMD
17893 vectors. */
17894 unsigned int nelt = d->perm.length ().to_constant ();
17895 for (unsigned int i = 0; i < nelt; ++i)
17896 /* If big-endian and two vectors we end up with a weird mixed-endian
17897 mode on NEON. Reverse the index within each word but not the word
17898 itself. to_constant is safe because we checked is_constant above. */
17899 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17900 ? d->perm[i].to_constant () ^ (nelt - 1)
17901 : d->perm[i].to_constant ());
17902
17903 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17904 sel = force_reg (vmode, sel);
17905
17906 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17907 return true;
17908 }
17909
17910 /* Try to implement D using an SVE TBL instruction. */
17911
17912 static bool
17913 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17914 {
17915 unsigned HOST_WIDE_INT nelt;
17916
17917 /* Permuting two variable-length vectors could overflow the
17918 index range. */
17919 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17920 return false;
17921
17922 if (d->testing_p)
17923 return true;
17924
17925 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17926 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17927 if (d->one_vector_p)
17928 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17929 else
17930 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17931 return true;
17932 }
17933
17934 static bool
17935 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17936 {
17937 /* The pattern matching functions above are written to look for a small
17938 number to begin the sequence (0, 1, N/2). If we begin with an index
17939 from the second operand, we can swap the operands. */
17940 poly_int64 nelt = d->perm.length ();
17941 if (known_ge (d->perm[0], nelt))
17942 {
17943 d->perm.rotate_inputs (1);
17944 std::swap (d->op0, d->op1);
17945 }
17946
17947 if ((d->vec_flags == VEC_ADVSIMD
17948 || d->vec_flags == VEC_SVE_DATA
17949 || d->vec_flags == VEC_SVE_PRED)
17950 && known_gt (nelt, 1))
17951 {
17952 if (aarch64_evpc_rev_local (d))
17953 return true;
17954 else if (aarch64_evpc_rev_global (d))
17955 return true;
17956 else if (aarch64_evpc_ext (d))
17957 return true;
17958 else if (aarch64_evpc_dup (d))
17959 return true;
17960 else if (aarch64_evpc_zip (d))
17961 return true;
17962 else if (aarch64_evpc_uzp (d))
17963 return true;
17964 else if (aarch64_evpc_trn (d))
17965 return true;
17966 if (d->vec_flags == VEC_SVE_DATA)
17967 return aarch64_evpc_sve_tbl (d);
17968 else if (d->vec_flags == VEC_ADVSIMD)
17969 return aarch64_evpc_tbl (d);
17970 }
17971 return false;
17972 }
17973
17974 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17975
17976 static bool
17977 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17978 rtx op1, const vec_perm_indices &sel)
17979 {
17980 struct expand_vec_perm_d d;
17981
17982 /* Check whether the mask can be applied to a single vector. */
17983 if (sel.ninputs () == 1
17984 || (op0 && rtx_equal_p (op0, op1)))
17985 d.one_vector_p = true;
17986 else if (sel.all_from_input_p (0))
17987 {
17988 d.one_vector_p = true;
17989 op1 = op0;
17990 }
17991 else if (sel.all_from_input_p (1))
17992 {
17993 d.one_vector_p = true;
17994 op0 = op1;
17995 }
17996 else
17997 d.one_vector_p = false;
17998
17999 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18000 sel.nelts_per_input ());
18001 d.vmode = vmode;
18002 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18003 d.target = target;
18004 d.op0 = op0;
18005 d.op1 = op1;
18006 d.testing_p = !target;
18007
18008 if (!d.testing_p)
18009 return aarch64_expand_vec_perm_const_1 (&d);
18010
18011 rtx_insn *last = get_last_insn ();
18012 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18013 gcc_assert (last == get_last_insn ());
18014
18015 return ret;
18016 }
18017
18018 /* Generate a byte permute mask for a register of mode MODE,
18019 which has NUNITS units. */
18020
18021 rtx
18022 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18023 {
18024 /* We have to reverse each vector because we don't have
18025 a permuted load that can reverse-load according to ABI rules. */
18026 rtx mask;
18027 rtvec v = rtvec_alloc (16);
18028 unsigned int i, j;
18029 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18030
18031 gcc_assert (BYTES_BIG_ENDIAN);
18032 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18033
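/* For example, for V8HImode (NUNITS == 8, USIZE == 2) this builds the
   byte mask { 1, 0, 3, 2, 5, 4, ... }, reversing the bytes within each
   element while keeping the element order.  */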
18034 for (i = 0; i < nunits; i++)
18035 for (j = 0; j < usize; j++)
18036 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18037 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18038 return force_reg (V16QImode, mask);
18039 }
18040
18041 /* Expand an SVE integer comparison using the SVE equivalent of:
18042
18043 (set TARGET (CODE OP0 OP1)). */
18044
18045 void
18046 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18047 {
18048 machine_mode pred_mode = GET_MODE (target);
18049 machine_mode data_mode = GET_MODE (op0);
18050 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18051 op0, op1);
18052 if (!rtx_equal_p (target, res))
18053 emit_move_insn (target, res);
18054 }
18055
18056 /* Return the UNSPEC_COND_* code for comparison CODE. */
18057
18058 static unsigned int
18059 aarch64_unspec_cond_code (rtx_code code)
18060 {
18061 switch (code)
18062 {
18063 case NE:
18064 return UNSPEC_COND_FCMNE;
18065 case EQ:
18066 return UNSPEC_COND_FCMEQ;
18067 case LT:
18068 return UNSPEC_COND_FCMLT;
18069 case GT:
18070 return UNSPEC_COND_FCMGT;
18071 case LE:
18072 return UNSPEC_COND_FCMLE;
18073 case GE:
18074 return UNSPEC_COND_FCMGE;
18075 case UNORDERED:
18076 return UNSPEC_COND_FCMUO;
18077 default:
18078 gcc_unreachable ();
18079 }
18080 }
18081
18082 /* Emit:
18083
18084 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18085
18086 where <X> is the operation associated with comparison CODE.
18087 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18088
18089 static void
18090 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18091 bool known_ptrue_p, rtx op0, rtx op1)
18092 {
18093 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18094 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18095 gen_rtvec (4, pred, flag, op0, op1),
18096 aarch64_unspec_cond_code (code));
18097 emit_set_insn (target, unspec);
18098 }
18099
18100 /* Emit the SVE equivalent of:
18101
18102 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18103 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18104 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18105
18106 where <Xi> is the operation associated with comparison CODEi.
18107 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18108
18109 static void
18110 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18111 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18112 {
18113 machine_mode pred_mode = GET_MODE (pred);
18114 rtx tmp1 = gen_reg_rtx (pred_mode);
18115 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18116 rtx tmp2 = gen_reg_rtx (pred_mode);
18117 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18118 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18119 }
18120
18121 /* Emit the SVE equivalent of:
18122
18123 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18124 (set TARGET (not TMP))
18125
18126 where <X> is the operation associated with comparison CODE.
18127 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18128
18129 static void
18130 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18131 bool known_ptrue_p, rtx op0, rtx op1)
18132 {
18133 machine_mode pred_mode = GET_MODE (pred);
18134 rtx tmp = gen_reg_rtx (pred_mode);
18135 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18136 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18137 }
18138
18139 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18140
18141 (set TARGET (CODE OP0 OP1))
18142
18143 If CAN_INVERT_P is true, the caller can also handle inverted results;
18144 return true if the result is in fact inverted. */
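/* For example, with !flag_trapping_math an UNGE comparison falls through
   to the code at the bottom of the function: the reverse comparison LT is
   emitted and the result is either reported as inverted (when CAN_INVERT_P)
   or explicitly inverted.  */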
18145
18146 bool
18147 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18148 rtx op0, rtx op1, bool can_invert_p)
18149 {
18150 machine_mode pred_mode = GET_MODE (target);
18151 machine_mode data_mode = GET_MODE (op0);
18152
18153 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18154 switch (code)
18155 {
18156 case UNORDERED:
18157 /* UNORDERED has no immediate form. */
18158 op1 = force_reg (data_mode, op1);
18159 /* fall through */
18160 case LT:
18161 case LE:
18162 case GT:
18163 case GE:
18164 case EQ:
18165 case NE:
18166 {
18167 /* There is native support for the comparison. */
18168 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18169 return false;
18170 }
18171
18172 case LTGT:
18173 /* This is a trapping operation (LT or GT). */
18174 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18175 return false;
18176
18177 case UNEQ:
18178 if (!flag_trapping_math)
18179 {
18180 /* This would trap for signaling NaNs. */
18181 op1 = force_reg (data_mode, op1);
18182 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18183 ptrue, true, op0, op1);
18184 return false;
18185 }
18186 /* fall through */
18187 case UNLT:
18188 case UNLE:
18189 case UNGT:
18190 case UNGE:
18191 if (flag_trapping_math)
18192 {
18193 /* Work out which elements are ordered. */
18194 rtx ordered = gen_reg_rtx (pred_mode);
18195 op1 = force_reg (data_mode, op1);
18196 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18197 ptrue, true, op0, op1);
18198
18199 /* Test the opposite condition for the ordered elements,
18200 then invert the result. */
18201 if (code == UNEQ)
18202 code = NE;
18203 else
18204 code = reverse_condition_maybe_unordered (code);
18205 if (can_invert_p)
18206 {
18207 aarch64_emit_sve_fp_cond (target, code,
18208 ordered, false, op0, op1);
18209 return true;
18210 }
18211 aarch64_emit_sve_invert_fp_cond (target, code,
18212 ordered, false, op0, op1);
18213 return false;
18214 }
18215 break;
18216
18217 case ORDERED:
18218 /* ORDERED has no immediate form. */
18219 op1 = force_reg (data_mode, op1);
18220 break;
18221
18222 default:
18223 gcc_unreachable ();
18224 }
18225
18226 /* There is native support for the inverse comparison. */
18227 code = reverse_condition_maybe_unordered (code);
18228 if (can_invert_p)
18229 {
18230 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18231 return true;
18232 }
18233 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18234 return false;
18235 }
18236
18237 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18238 of the data being selected and CMP_MODE is the mode of the values being
18239 compared. */
18240
18241 void
18242 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18243 rtx *ops)
18244 {
18245 machine_mode pred_mode
18246 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18247 GET_MODE_SIZE (cmp_mode)).require ();
18248 rtx pred = gen_reg_rtx (pred_mode);
18249 if (FLOAT_MODE_P (cmp_mode))
18250 {
18251 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18252 ops[4], ops[5], true))
18253 std::swap (ops[1], ops[2]);
18254 }
18255 else
18256 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18257
18258 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18259 ops[1] = force_reg (data_mode, ops[1]);
18260 /* The "false" value can only be zero if the "true" value is a constant. */
18261 if (register_operand (ops[1], data_mode)
18262 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18263 ops[2] = force_reg (data_mode, ops[2]);
18264
18265 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18266 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18267 }
18268
18269 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18270 true. However, due to issues with register allocation it is preferable
18271 to avoid tying integer scalar and FP scalar modes. Executing integer
18272 operations in general registers is better than treating them as scalar
18273 vector operations. This reduces latency and avoids redundant int<->FP
18274 moves. So tie modes if they are either the same class, or vector modes
18275 with other vector modes, vector structs or any scalar mode. */
18276
18277 static bool
18278 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18279 {
18280 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18281 return true;
18282
18283 /* We specifically want to allow elements of "structure" modes to
18284 be tieable to the structure. This more general condition allows
18285 other rarer situations too. The reason we don't extend this to
18286 predicate modes is that there are no predicate structure modes
18287 nor any specific instructions for extracting part of a predicate
18288 register. */
18289 if (aarch64_vector_data_mode_p (mode1)
18290 && aarch64_vector_data_mode_p (mode2))
18291 return true;
18292
18293 /* Also allow any scalar modes with vectors. */
18294 if (aarch64_vector_mode_supported_p (mode1)
18295 || aarch64_vector_mode_supported_p (mode2))
18296 return true;
18297
18298 return false;
18299 }
18300
18301 /* Return a new RTX holding the result of moving POINTER forward by
18302 AMOUNT bytes. */
18303
18304 static rtx
18305 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18306 {
18307 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18308
18309 return adjust_automodify_address (pointer, GET_MODE (pointer),
18310 next, amount);
18311 }
18312
18313 /* Return a new RTX holding the result of moving POINTER forward by the
18314 size of the mode it points to. */
18315
18316 static rtx
18317 aarch64_progress_pointer (rtx pointer)
18318 {
18319 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18320 }
18321
18322 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18323 MODE bytes. */
18324
18325 static void
18326 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18327 machine_mode mode)
18328 {
18329 rtx reg = gen_reg_rtx (mode);
18330
18331 /* "Cast" the pointers to the correct mode. */
18332 *src = adjust_address (*src, mode, 0);
18333 *dst = adjust_address (*dst, mode, 0);
18334 /* Emit the memcpy. */
18335 emit_move_insn (reg, *src);
18336 emit_move_insn (*dst, reg);
18337 /* Move the pointers forward. */
18338 *src = aarch64_progress_pointer (*src);
18339 *dst = aarch64_progress_pointer (*dst);
18340 }
18341
18342 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18343 we succeed, otherwise return false. */
18344
18345 bool
18346 aarch64_expand_cpymem (rtx *operands)
18347 {
18348 int n, mode_bits;
18349 rtx dst = operands[0];
18350 rtx src = operands[1];
18351 rtx base;
18352 machine_mode cur_mode = BLKmode, next_mode;
18353 bool speed_p = !optimize_function_for_size_p (cfun);
18354
18355 /* When optimizing for size, give a better estimate of the length of a
18356 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18357 will now always require an even number of instructions. And each
18358 operation requires both a load and a store, so divide the max number by 2.
18359 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18360
18361 /* We can't do anything smart if the amount to copy is not constant. */
18362 if (!CONST_INT_P (operands[2]))
18363 return false;
18364
18365 n = INTVAL (operands[2]);
18366
18367 /* Try to keep the number of instructions low. For all cases we will do at
18368 most two moves for the residual amount, since we'll always overlap the
18369 remainder. */
18370 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18371 return false;
18372
18373 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18374 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18375
18376 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18377 src = adjust_automodify_address (src, VOIDmode, base, 0);
18378
18379 /* Convert n to bits to make the rest of the code simpler. */
18380 n = n * BITS_PER_UNIT;
18381
18382 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18383 larger than TImode, but we should not use them for loads/stores here. */
18384 const int copy_limit = GET_MODE_BITSIZE (TImode);
18385
18386 while (n > 0)
18387 {
18388 /* Find the largest mode in which to do the copy without over-reading
18389 or over-writing. */
18390 opt_scalar_int_mode mode_iter;
18391 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18392 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18393 cur_mode = mode_iter.require ();
18394
18395 gcc_assert (cur_mode != BLKmode);
18396
18397 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18398 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18399
18400 n -= mode_bits;
18401
18402 /* Do certain trailing copies as overlapping if it's going to be
18403 cheaper, i.e. fewer instructions. For instance, for a 15 byte copy
18404 it's more efficient to do two overlapping 8 byte copies than
18405 8 + 6 + 1. */
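/* Concretely, for a 15 byte copy this loop emits a DImode copy at offset 0,
   then moves both pointers back by one byte and emits a second DImode copy
   covering bytes 7-14.  */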
18406 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18407 {
18408 next_mode = smallest_mode_for_size (n, MODE_INT);
18409 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18410 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18411 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18412 n = n_bits;
18413 }
18414 }
18415
18416 return true;
18417 }
18418
18419 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18420 SImode stores. Handle the case when the constant has identical
18421 bottom and top halves. This is beneficial when the two stores can be
18422 merged into an STP and we avoid synthesising potentially expensive
18423 immediates twice. Return true if such a split is possible. */
18424
18425 bool
18426 aarch64_split_dimode_const_store (rtx dst, rtx src)
18427 {
18428 rtx lo = gen_lowpart (SImode, src);
18429 rtx hi = gen_highpart_mode (SImode, DImode, src);
18430
18431 bool size_p = optimize_function_for_size_p (cfun);
18432
18433 if (!rtx_equal_p (lo, hi))
18434 return false;
18435
18436 unsigned int orig_cost
18437 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18438 unsigned int lo_cost
18439 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18440
18441 /* We want to transform:
18442 MOV x1, 49370
18443 MOVK x1, 0x140, lsl 16
18444 MOVK x1, 0xc0da, lsl 32
18445 MOVK x1, 0x140, lsl 48
18446 STR x1, [x0]
18447 into:
18448 MOV w1, 49370
18449 MOVK w1, 0x140, lsl 16
18450 STP w1, w1, [x0]
18451 So we want to perform this only when we save two instructions
18452 or more. When optimizing for size, however, accept any code size
18453 savings we can. */
18454 if (size_p && orig_cost <= lo_cost)
18455 return false;
18456
18457 if (!size_p
18458 && (orig_cost <= lo_cost + 1))
18459 return false;
18460
18461 rtx mem_lo = adjust_address (dst, SImode, 0);
18462 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18463 return false;
18464
18465 rtx tmp_reg = gen_reg_rtx (SImode);
18466 aarch64_expand_mov_immediate (tmp_reg, lo);
18467 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18468 /* Don't emit an explicit store pair as this may not always be profitable.
18469 Let the sched-fusion logic decide whether to merge them. */
18470 emit_move_insn (mem_lo, tmp_reg);
18471 emit_move_insn (mem_hi, tmp_reg);
18472
18473 return true;
18474 }
18475
18476 /* Generate RTL for a conditional branch with rtx comparison CODE in
18477 mode CC_MODE. The destination of the unlikely conditional branch
18478 is LABEL_REF. */
18479
18480 void
18481 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18482 rtx label_ref)
18483 {
18484 rtx x;
18485 x = gen_rtx_fmt_ee (code, VOIDmode,
18486 gen_rtx_REG (cc_mode, CC_REGNUM),
18487 const0_rtx);
18488
18489 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18490 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18491 pc_rtx);
18492 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18493 }
18494
18495 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18496
18497 OP1 represents the TImode destination operand 1
18498 OP2 represents the TImode destination operand 2
18499 LOW_DEST represents the low half (DImode) of TImode operand 0
18500 LOW_IN1 represents the low half (DImode) of TImode operand 1
18501 LOW_IN2 represents the low half (DImode) of TImode operand 2
18502 HIGH_DEST represents the high half (DImode) of TImode operand 0
18503 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18504 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18505
18506 void
18507 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18508 rtx *low_in1, rtx *low_in2,
18509 rtx *high_dest, rtx *high_in1,
18510 rtx *high_in2)
18511 {
18512 *low_dest = gen_reg_rtx (DImode);
18513 *low_in1 = gen_lowpart (DImode, op1);
18514 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18515 subreg_lowpart_offset (DImode, TImode));
18516 *high_dest = gen_reg_rtx (DImode);
18517 *high_in1 = gen_highpart (DImode, op1);
18518 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18519 subreg_highpart_offset (DImode, TImode));
18520 }
18521
18522 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18523
18524 This function differs from 'aarch64_addti_scratch_regs' in that
18525 OP1 can be an immediate constant (zero). We must call
18526 subreg_highpart_offset with DImode and TImode arguments, otherwise
18527 VOIDmode will be used for the const_int, which generates an internal
18528 error from subreg_size_highpart_offset, which does not expect a size of zero.
18529
18530 OP1 represents the TImode destination operand 1
18531 OP2 represents the TImode destination operand 2
18532 LOW_DEST represents the low half (DImode) of TImode operand 0
18533 LOW_IN1 represents the low half (DImode) of TImode operand 1
18534 LOW_IN2 represents the low half (DImode) of TImode operand 2
18535 HIGH_DEST represents the high half (DImode) of TImode operand 0
18536 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18537 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18538
18539
18540 void
18541 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18542 rtx *low_in1, rtx *low_in2,
18543 rtx *high_dest, rtx *high_in1,
18544 rtx *high_in2)
18545 {
18546 *low_dest = gen_reg_rtx (DImode);
18547 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18548 subreg_lowpart_offset (DImode, TImode));
18549
18550 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18551 subreg_lowpart_offset (DImode, TImode));
18552 *high_dest = gen_reg_rtx (DImode);
18553
18554 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18555 subreg_highpart_offset (DImode, TImode));
18556 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18557 subreg_highpart_offset (DImode, TImode));
18558 }
18559
18560 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18561
18562 OP0 represents the TImode destination operand 0
18563 LOW_DEST represents the low half (DImode) of TImode operand 0
18564 LOW_IN1 represents the low half (DImode) of TImode operand 1
18565 LOW_IN2 represents the low half (DImode) of TImode operand 2
18566 HIGH_DEST represents the high half (DImode) of TImode operand 0
18567 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18568 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18569 UNSIGNED_P is true if the operation is being performed on unsigned
18570 values. */
18571 void
18572 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18573 rtx low_in2, rtx high_dest, rtx high_in1,
18574 rtx high_in2, bool unsigned_p)
18575 {
18576 if (low_in2 == const0_rtx)
18577 {
18578 low_dest = low_in1;
18579 high_in2 = force_reg (DImode, high_in2);
18580 if (unsigned_p)
18581 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18582 else
18583 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18584 }
18585 else
18586 {
18587 if (CONST_INT_P (low_in2))
18588 {
18589 high_in2 = force_reg (DImode, high_in2);
18590 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18591 GEN_INT (-INTVAL (low_in2))));
18592 }
18593 else
18594 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18595
18596 if (unsigned_p)
18597 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18598 else
18599 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18600 }
18601
18602 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18603 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18604
18605 }
18606
18607 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18608
18609 static unsigned HOST_WIDE_INT
18610 aarch64_asan_shadow_offset (void)
18611 {
18612 if (TARGET_ILP32)
18613 return (HOST_WIDE_INT_1 << 29);
18614 else
18615 return (HOST_WIDE_INT_1 << 36);
18616 }
18617
18618 static rtx
18619 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18620 int code, tree treeop0, tree treeop1)
18621 {
18622 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18623 rtx op0, op1;
18624 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18625 insn_code icode;
18626 struct expand_operand ops[4];
18627
18628 start_sequence ();
18629 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18630
18631 op_mode = GET_MODE (op0);
18632 if (op_mode == VOIDmode)
18633 op_mode = GET_MODE (op1);
18634
18635 switch (op_mode)
18636 {
18637 case E_QImode:
18638 case E_HImode:
18639 case E_SImode:
18640 cmp_mode = SImode;
18641 icode = CODE_FOR_cmpsi;
18642 break;
18643
18644 case E_DImode:
18645 cmp_mode = DImode;
18646 icode = CODE_FOR_cmpdi;
18647 break;
18648
18649 case E_SFmode:
18650 cmp_mode = SFmode;
18651 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18652 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18653 break;
18654
18655 case E_DFmode:
18656 cmp_mode = DFmode;
18657 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18658 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18659 break;
18660
18661 default:
18662 end_sequence ();
18663 return NULL_RTX;
18664 }
18665
18666 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18667 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18668 if (!op0 || !op1)
18669 {
18670 end_sequence ();
18671 return NULL_RTX;
18672 }
18673 *prep_seq = get_insns ();
18674 end_sequence ();
18675
18676 create_fixed_operand (&ops[0], op0);
18677 create_fixed_operand (&ops[1], op1);
18678
18679 start_sequence ();
18680 if (!maybe_expand_insn (icode, 2, ops))
18681 {
18682 end_sequence ();
18683 return NULL_RTX;
18684 }
18685 *gen_seq = get_insns ();
18686 end_sequence ();
18687
18688 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18689 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18690 }
18691
18692 static rtx
18693 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18694 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18695 {
18696 rtx op0, op1, target;
18697 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18698 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18699 insn_code icode;
18700 struct expand_operand ops[6];
18701 int aarch64_cond;
18702
18703 push_to_sequence (*prep_seq);
18704 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18705
18706 op_mode = GET_MODE (op0);
18707 if (op_mode == VOIDmode)
18708 op_mode = GET_MODE (op1);
18709
18710 switch (op_mode)
18711 {
18712 case E_QImode:
18713 case E_HImode:
18714 case E_SImode:
18715 cmp_mode = SImode;
18716 icode = CODE_FOR_ccmpsi;
18717 break;
18718
18719 case E_DImode:
18720 cmp_mode = DImode;
18721 icode = CODE_FOR_ccmpdi;
18722 break;
18723
18724 case E_SFmode:
18725 cmp_mode = SFmode;
18726 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18727 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18728 break;
18729
18730 case E_DFmode:
18731 cmp_mode = DFmode;
18732 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18733 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18734 break;
18735
18736 default:
18737 end_sequence ();
18738 return NULL_RTX;
18739 }
18740
18741 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18742 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18743 if (!op0 || !op1)
18744 {
18745 end_sequence ();
18746 return NULL_RTX;
18747 }
18748 *prep_seq = get_insns ();
18749 end_sequence ();
18750
18751 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18752 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18753
18754 if (bit_code != AND)
18755 {
18756 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18757 GET_MODE (XEXP (prev, 0))),
18758 VOIDmode, XEXP (prev, 0), const0_rtx);
18759 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18760 }
18761
18762 create_fixed_operand (&ops[0], XEXP (prev, 0));
18763 create_fixed_operand (&ops[1], target);
18764 create_fixed_operand (&ops[2], op0);
18765 create_fixed_operand (&ops[3], op1);
18766 create_fixed_operand (&ops[4], prev);
18767 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18768
18769 push_to_sequence (*gen_seq);
18770 if (!maybe_expand_insn (icode, 6, ops))
18771 {
18772 end_sequence ();
18773 return NULL_RTX;
18774 }
18775
18776 *gen_seq = get_insns ();
18777 end_sequence ();
18778
18779 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18780 }
18781
18782 #undef TARGET_GEN_CCMP_FIRST
18783 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18784
18785 #undef TARGET_GEN_CCMP_NEXT
18786 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18787
18788 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18789 instruction fusion of some sort. */
18790
18791 static bool
18792 aarch64_macro_fusion_p (void)
18793 {
18794 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18795 }
18796
18797
18798 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18799 should be kept together during scheduling. */
18800
18801 static bool
18802 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18803 {
18804 rtx set_dest;
18805 rtx prev_set = single_set (prev);
18806 rtx curr_set = single_set (curr);
18807 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18808 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18809
18810 if (!aarch64_macro_fusion_p ())
18811 return false;
18812
18813 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18814 {
18815 /* We are trying to match:
18816 prev (mov) == (set (reg r0) (const_int imm16))
18817 curr (movk) == (set (zero_extract (reg r0)
18818 (const_int 16)
18819 (const_int 16))
18820 (const_int imm16_1)) */
18821
18822 set_dest = SET_DEST (curr_set);
18823
18824 if (GET_CODE (set_dest) == ZERO_EXTRACT
18825 && CONST_INT_P (SET_SRC (curr_set))
18826 && CONST_INT_P (SET_SRC (prev_set))
18827 && CONST_INT_P (XEXP (set_dest, 2))
18828 && INTVAL (XEXP (set_dest, 2)) == 16
18829 && REG_P (XEXP (set_dest, 0))
18830 && REG_P (SET_DEST (prev_set))
18831 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18832 {
18833 return true;
18834 }
18835 }
18836
18837 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18838 {
18839
18840 /* We're trying to match:
18841 prev (adrp) == (set (reg r1)
18842 (high (symbol_ref ("SYM"))))
18843 curr (add) == (set (reg r0)
18844 (lo_sum (reg r1)
18845 (symbol_ref ("SYM"))))
18846 Note that r0 need not necessarily be the same as r1, especially
18847 during pre-regalloc scheduling. */
18848
18849 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18850 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18851 {
18852 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18853 && REG_P (XEXP (SET_SRC (curr_set), 0))
18854 && REGNO (XEXP (SET_SRC (curr_set), 0))
18855 == REGNO (SET_DEST (prev_set))
18856 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18857 XEXP (SET_SRC (curr_set), 1)))
18858 return true;
18859 }
18860 }
18861
18862 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18863 {
18864
18865 /* We're trying to match:
18866 prev (movk) == (set (zero_extract (reg r0)
18867 (const_int 16)
18868 (const_int 32))
18869 (const_int imm16_1))
18870 curr (movk) == (set (zero_extract (reg r0)
18871 (const_int 16)
18872 (const_int 48))
18873 (const_int imm16_2)) */
18874
18875 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18876 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18877 && REG_P (XEXP (SET_DEST (prev_set), 0))
18878 && REG_P (XEXP (SET_DEST (curr_set), 0))
18879 && REGNO (XEXP (SET_DEST (prev_set), 0))
18880 == REGNO (XEXP (SET_DEST (curr_set), 0))
18881 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18882 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18883 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18884 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18885 && CONST_INT_P (SET_SRC (prev_set))
18886 && CONST_INT_P (SET_SRC (curr_set)))
18887 return true;
18888
18889 }
18890 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18891 {
18892 /* We're trying to match:
18893 prev (adrp) == (set (reg r0)
18894 (high (symbol_ref ("SYM"))))
18895 curr (ldr) == (set (reg r1)
18896 (mem (lo_sum (reg r0)
18897 (symbol_ref ("SYM")))))
18898 or
18899 curr (ldr) == (set (reg r1)
18900 (zero_extend (mem
18901 (lo_sum (reg r0)
18902 (symbol_ref ("SYM")))))) */
18903 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18904 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18905 {
18906 rtx curr_src = SET_SRC (curr_set);
18907
18908 if (GET_CODE (curr_src) == ZERO_EXTEND)
18909 curr_src = XEXP (curr_src, 0);
18910
18911 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18912 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18913 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18914 == REGNO (SET_DEST (prev_set))
18915 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18916 XEXP (SET_SRC (prev_set), 0)))
18917 return true;
18918 }
18919 }
18920
18921 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18922 && any_condjump_p (curr))
18923 {
18924 unsigned int condreg1, condreg2;
18925 rtx cc_reg_1;
18926 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18927 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18928
18929 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18930 && prev
18931 && modified_in_p (cc_reg_1, prev))
18932 {
18933 enum attr_type prev_type = get_attr_type (prev);
18934
18935 /* FIXME: this misses some instructions that are considered simple
18936 arithmetic for ThunderX. Simple shifts are missed here. */
18937 if (prev_type == TYPE_ALUS_SREG
18938 || prev_type == TYPE_ALUS_IMM
18939 || prev_type == TYPE_LOGICS_REG
18940 || prev_type == TYPE_LOGICS_IMM)
18941 return true;
18942 }
18943 }
18944
18945 if (prev_set
18946 && curr_set
18947 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18948 && any_condjump_p (curr))
18949 {
18950 /* We're trying to match:
18951 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18952 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18953 (const_int 0))
18954 (label_ref ("SYM"))
18955 (pc)) */
18956 if (SET_DEST (curr_set) == (pc_rtx)
18957 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18958 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18959 && REG_P (SET_DEST (prev_set))
18960 && REGNO (SET_DEST (prev_set))
18961 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18962 {
18963 /* Fuse ALU operations followed by conditional branch instruction. */
18964 switch (get_attr_type (prev))
18965 {
18966 case TYPE_ALU_IMM:
18967 case TYPE_ALU_SREG:
18968 case TYPE_ADC_REG:
18969 case TYPE_ADC_IMM:
18970 case TYPE_ADCS_REG:
18971 case TYPE_ADCS_IMM:
18972 case TYPE_LOGIC_REG:
18973 case TYPE_LOGIC_IMM:
18974 case TYPE_CSEL:
18975 case TYPE_ADR:
18976 case TYPE_MOV_IMM:
18977 case TYPE_SHIFT_REG:
18978 case TYPE_SHIFT_IMM:
18979 case TYPE_BFM:
18980 case TYPE_RBIT:
18981 case TYPE_REV:
18982 case TYPE_EXTEND:
18983 return true;
18984
18985 default:;
18986 }
18987 }
18988 }
18989
18990 return false;
18991 }
18992
18993 /* Return true iff the instruction fusion described by OP is enabled. */
18994
18995 bool
18996 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18997 {
18998 return (aarch64_tune_params.fusible_ops & op) != 0;
18999 }
19000
19001 /* If MEM is in the form of [base+offset], extract the two parts
19002 of the address and store them in BASE and OFFSET; otherwise return false
19003 after clearing BASE and OFFSET. */
19004
19005 bool
19006 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19007 {
19008 rtx addr;
19009
19010 gcc_assert (MEM_P (mem));
19011
19012 addr = XEXP (mem, 0);
19013
19014 if (REG_P (addr))
19015 {
19016 *base = addr;
19017 *offset = const0_rtx;
19018 return true;
19019 }
19020
19021 if (GET_CODE (addr) == PLUS
19022 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19023 {
19024 *base = XEXP (addr, 0);
19025 *offset = XEXP (addr, 1);
19026 return true;
19027 }
19028
19029 *base = NULL_RTX;
19030 *offset = NULL_RTX;
19031
19032 return false;
19033 }
19034
19035 /* Types for scheduling fusion. */
19036 enum sched_fusion_type
19037 {
19038 SCHED_FUSION_NONE = 0,
19039 SCHED_FUSION_LD_SIGN_EXTEND,
19040 SCHED_FUSION_LD_ZERO_EXTEND,
19041 SCHED_FUSION_LD,
19042 SCHED_FUSION_ST,
19043 SCHED_FUSION_NUM
19044 };
19045
19046 /* If INSN is a load or store whose address is in the form of [base+offset],
19047 extract the two parts into BASE and OFFSET. Return the scheduling
19048 fusion type of this INSN. */
19049
19050 static enum sched_fusion_type
19051 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19052 {
19053 rtx x, dest, src;
19054 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19055
19056 gcc_assert (INSN_P (insn));
19057 x = PATTERN (insn);
19058 if (GET_CODE (x) != SET)
19059 return SCHED_FUSION_NONE;
19060
19061 src = SET_SRC (x);
19062 dest = SET_DEST (x);
19063
19064 machine_mode dest_mode = GET_MODE (dest);
19065
19066 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19067 return SCHED_FUSION_NONE;
19068
19069 if (GET_CODE (src) == SIGN_EXTEND)
19070 {
19071 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19072 src = XEXP (src, 0);
19073 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19074 return SCHED_FUSION_NONE;
19075 }
19076 else if (GET_CODE (src) == ZERO_EXTEND)
19077 {
19078 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19079 src = XEXP (src, 0);
19080 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19081 return SCHED_FUSION_NONE;
19082 }
19083
19084 if (GET_CODE (src) == MEM && REG_P (dest))
19085 extract_base_offset_in_addr (src, base, offset);
19086 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19087 {
19088 fusion = SCHED_FUSION_ST;
19089 extract_base_offset_in_addr (dest, base, offset);
19090 }
19091 else
19092 return SCHED_FUSION_NONE;
19093
19094 if (*base == NULL_RTX || *offset == NULL_RTX)
19095 fusion = SCHED_FUSION_NONE;
19096
19097 return fusion;
19098 }
19099
19100 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19101
19102 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19103 and PRI are only calculated for these instructions. For other instructions,
19104 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19105 types of instruction fusion can be added by returning different priorities.
19106
19107 It's important that irrelevant instructions get the largest FUSION_PRI. */
19108
19109 static void
19110 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19111 int *fusion_pri, int *pri)
19112 {
19113 int tmp, off_val;
19114 rtx base, offset;
19115 enum sched_fusion_type fusion;
19116
19117 gcc_assert (INSN_P (insn));
19118
19119 tmp = max_pri - 1;
19120 fusion = fusion_load_store (insn, &base, &offset);
19121 if (fusion == SCHED_FUSION_NONE)
19122 {
19123 *pri = tmp;
19124 *fusion_pri = tmp;
19125 return;
19126 }
19127
19128 /* Set FUSION_PRI according to fusion type and base register. */
19129 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19130
19131 /* Calculate PRI. */
19132 tmp /= 2;
19133
19134 /* INSN with smaller offset goes first. */
19135 off_val = (int)(INTVAL (offset));
19136 if (off_val >= 0)
19137 tmp -= (off_val & 0xfffff);
19138 else
19139 tmp += ((- off_val) & 0xfffff);
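/* For example, for two loads at [base, 8] and [base, 16], the one at
   offset 8 ends up with the larger PRI, so it is scheduled first.  */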
19140
19141 *pri = tmp;
19142 return;
19143 }
19144
19145 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19146 Adjust priority of sha1h instructions so they are scheduled before
19147 other SHA1 instructions. */
19148
19149 static int
19150 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19151 {
19152 rtx x = PATTERN (insn);
19153
19154 if (GET_CODE (x) == SET)
19155 {
19156 x = SET_SRC (x);
19157
19158 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19159 return priority + 10;
19160 }
19161
19162 return priority;
19163 }
19164
19165 /* Given OPERANDS of consecutive load/store, check if we can merge
19166 them into ldp/stp. LOAD is true if they are load instructions.
19167 MODE is the mode of memory operands. */
19168
19169 bool
19170 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19171 machine_mode mode)
19172 {
19173 HOST_WIDE_INT offval_1, offval_2, msize;
19174 enum reg_class rclass_1, rclass_2;
19175 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19176
19177 if (load)
19178 {
19179 mem_1 = operands[1];
19180 mem_2 = operands[3];
19181 reg_1 = operands[0];
19182 reg_2 = operands[2];
19183 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19184 if (REGNO (reg_1) == REGNO (reg_2))
19185 return false;
19186 }
19187 else
19188 {
19189 mem_1 = operands[0];
19190 mem_2 = operands[2];
19191 reg_1 = operands[1];
19192 reg_2 = operands[3];
19193 }
19194
19195 /* The mems cannot be volatile. */
19196 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19197 return false;
19198
19199 /* If we have SImode and slow unaligned ldp,
19200 check that the alignment is at least 8 bytes. */
19201 if (mode == SImode
19202 && (aarch64_tune_params.extra_tuning_flags
19203 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19204 && !optimize_size
19205 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19206 return false;
19207
19208 /* Check if the addresses are in the form of [base+offset]. */
19209 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19210 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19211 return false;
19212 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19213 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19214 return false;
19215
19216 /* Check if the bases are the same. */
19217 if (!rtx_equal_p (base_1, base_2))
19218 return false;
19219
19220 /* The operands must be of the same size. */
19221 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19222 GET_MODE_SIZE (GET_MODE (mem_2))));
19223
19224 offval_1 = INTVAL (offset_1);
19225 offval_2 = INTVAL (offset_2);
19226 /* We should only be trying this for fixed-sized modes. There is no
19227 SVE LDP/STP instruction. */
19228 msize = GET_MODE_SIZE (mode).to_constant ();
19229 /* Check if the offsets are consecutive. */
19230 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19231 return false;
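/* For example, DImode accesses at [base, 16] and [base, 24] pass this
   check (msize == 8), whereas offsets 16 and 32 do not.  */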
19232
19233 /* Check if the addresses are clobbered by load. */
19234 if (load)
19235 {
19236 if (reg_mentioned_p (reg_1, mem_1))
19237 return false;
19238
19239 /* In increasing order, the last load can clobber the address. */
19240 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19241 return false;
19242 }
19243
19244 /* One of the memory accesses must be a mempair operand.
19245 If it is not the first one, they need to be swapped by the
19246 peephole. */
19247 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19248 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19249 return false;
19250
19251 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19252 rclass_1 = FP_REGS;
19253 else
19254 rclass_1 = GENERAL_REGS;
19255
19256 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19257 rclass_2 = FP_REGS;
19258 else
19259 rclass_2 = GENERAL_REGS;
19260
19261 /* Check if the registers are of the same class. */
19262 if (rclass_1 != rclass_2)
19263 return false;
19264
19265 return true;
19266 }
19267
19268 /* Given OPERANDS of consecutive load/store that can be merged,
19269 swap them if they are not in ascending order. */
19270 void
19271 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19272 {
19273 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19274 HOST_WIDE_INT offval_1, offval_2;
19275
19276 if (load)
19277 {
19278 mem_1 = operands[1];
19279 mem_2 = operands[3];
19280 }
19281 else
19282 {
19283 mem_1 = operands[0];
19284 mem_2 = operands[2];
19285 }
19286
19287 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19288 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19289
19290 offval_1 = INTVAL (offset_1);
19291 offval_2 = INTVAL (offset_2);
19292
19293 if (offval_1 > offval_2)
19294 {
19295 /* Irrespective of whether this is a load or a store,
19296 we do the same swap. */
19297 std::swap (operands[0], operands[2]);
19298 std::swap (operands[1], operands[3]);
19299 }
19300 }
19301
19302 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19303 comparison between the two. */
19304 int
19305 aarch64_host_wide_int_compare (const void *x, const void *y)
19306 {
19307 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19308 * ((const HOST_WIDE_INT *) y));
19309 }
19310
19311 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19312 other pointing to a REG rtx containing an offset, compare the offsets
19313 of the two pairs.
19314
19315 Return:
19316
19317 1 iff offset (X) > offset (Y)
19318 0 iff offset (X) == offset (Y)
19319 -1 iff offset (X) < offset (Y) */
19320 int
19321 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19322 {
19323 const rtx * operands_1 = (const rtx *) x;
19324 const rtx * operands_2 = (const rtx *) y;
19325 rtx mem_1, mem_2, base, offset_1, offset_2;
19326
19327 if (MEM_P (operands_1[0]))
19328 mem_1 = operands_1[0];
19329 else
19330 mem_1 = operands_1[1];
19331
19332 if (MEM_P (operands_2[0]))
19333 mem_2 = operands_2[0];
19334 else
19335 mem_2 = operands_2[1];
19336
19337 /* Extract the offsets. */
19338 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19339 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19340
19341 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19342
19343 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19344 }
19345
19346 /* Given OPERANDS of consecutive load/store, check if we can merge
19347 them into ldp/stp by adjusting the offset. LOAD is true if they
19348 are load instructions. MODE is the mode of memory operands.
19349
19350 Given the consecutive stores below:
19351
19352 str w1, [xb, 0x100]
19353 str w1, [xb, 0x104]
19354 str w1, [xb, 0x108]
19355 str w1, [xb, 0x10c]
19356
19357 Though the offsets are out of the range supported by stp, we can
19358 still pair them after adjusting the offset, like:
19359
19360 add scratch, xb, 0x100
19361 stp w1, w1, [scratch]
19362 stp w1, w1, [scratch, 0x8]
19363
19364 The peephole patterns detecting this opportunity should guarantee
19365 the scratch register is available. */
19366
19367 bool
19368 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19369 scalar_mode mode)
19370 {
19371 const int num_insns = 4;
19372 enum reg_class rclass;
19373 HOST_WIDE_INT offvals[num_insns], msize;
19374 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19375
19376 if (load)
19377 {
19378 for (int i = 0; i < num_insns; i++)
19379 {
19380 reg[i] = operands[2 * i];
19381 mem[i] = operands[2 * i + 1];
19382
19383 gcc_assert (REG_P (reg[i]));
19384 }
19385
19386 /* Do not attempt to merge the loads if the loads clobber each other. */
19387 for (int i = 0; i < 8; i += 2)
19388 for (int j = i + 2; j < 8; j += 2)
19389 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19390 return false;
19391 }
19392 else
19393 for (int i = 0; i < num_insns; i++)
19394 {
19395 mem[i] = operands[2 * i];
19396 reg[i] = operands[2 * i + 1];
19397 }
19398
19399 /* Skip if the memory operand is by itself already valid for ldp/stp. */
19400 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19401 return false;
19402
19403 for (int i = 0; i < num_insns; i++)
19404 {
19405 /* The mems cannot be volatile. */
19406 if (MEM_VOLATILE_P (mem[i]))
19407 return false;
19408
19409 /* Check if the addresses are in the form of [base+offset]. */
19410 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19411 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19412 return false;
19413 }
19414
19415 /* Check if the registers are of the same class. */
19416 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19417 ? FP_REGS : GENERAL_REGS;
19418
19419 for (int i = 1; i < num_insns; i++)
19420 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19421 {
19422 if (rclass != FP_REGS)
19423 return false;
19424 }
19425 else
19426 {
19427 if (rclass != GENERAL_REGS)
19428 return false;
19429 }
19430
19431 /* Only the last register in the order in which they occur
19432 may be clobbered by the load. */
19433 if (rclass == GENERAL_REGS && load)
19434 for (int i = 0; i < num_insns - 1; i++)
19435 if (reg_mentioned_p (reg[i], mem[i]))
19436 return false;
19437
19438 /* Check if the bases are the same. */
19439 for (int i = 0; i < num_insns - 1; i++)
19440 if (!rtx_equal_p (base[i], base[i + 1]))
19441 return false;
19442
19443 for (int i = 0; i < num_insns; i++)
19444 offvals[i] = INTVAL (offset[i]);
19445
19446 msize = GET_MODE_SIZE (mode);
19447
19448 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19449 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19450 aarch64_host_wide_int_compare);
19451
19452 if (!(offvals[1] == offvals[0] + msize
19453 && offvals[3] == offvals[2] + msize))
19454 return false;
19455
19456 /* Check that the offsets are within range of each other. The ldp/stp
19457 instructions have 7-bit immediate offsets, so use 0x80. */
19458 if (offvals[2] - offvals[0] >= msize * 0x80)
19459 return false;
19460
19461 /* The offsets must be aligned with respect to each other. */
19462 if (offvals[0] % msize != offvals[2] % msize)
19463 return false;
19464
19465 /* If we have SImode and slow unaligned ldp,
19466 check that the alignment is at least 8 bytes. */
19467 if (mode == SImode
19468 && (aarch64_tune_params.extra_tuning_flags
19469 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19470 && !optimize_size
19471 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19472 return false;
19473
19474 return true;
19475 }
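
/* Illustrative walk-through of the checks above, using the four SImode
   stores from the comment before this function (offsets 0x100, 0x104,
   0x108 and 0x10c, msize == 4): mem[0] is not already a valid mem-pair
   operand (0x100 is outside the SImode stp range), the sorted offsets
   satisfy offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
   offvals[2] - offvals[0] == 8 is well below 4 * 0x80, and
   offvals[0] % 4 == offvals[2] % 4, so the function returns true.  */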
19476
19477 /* Given OPERANDS of consecutive load/store, this function pairs them
19478 into LDP/STP after adjusting the offset. It depends on the fact
19479 that the operands can be sorted so the offsets are correct for STP.
19480 MODE is the mode of the memory operands. CODE is the rtl operator
19481 which should be applied to all memory operands; it is SIGN_EXTEND,
19482 ZERO_EXTEND or UNKNOWN. */
19483
19484 bool
19485 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19486 scalar_mode mode, RTX_CODE code)
19487 {
19488 rtx base, offset_1, offset_3, t1, t2;
19489 rtx mem_1, mem_2, mem_3, mem_4;
19490 rtx temp_operands[8];
19491 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19492 stp_off_upper_limit, stp_off_lower_limit, msize;
19493
19494 /* We make changes on a copy as we may still bail out. */
19495 for (int i = 0; i < 8; i ++)
19496 temp_operands[i] = operands[i];
19497
19498 /* Sort the operands. */
19499 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19500
19501 /* Copy the memory operands so that if we have to bail for some
19502 reason the original addresses are unchanged. */
19503 if (load)
19504 {
19505 mem_1 = copy_rtx (temp_operands[1]);
19506 mem_2 = copy_rtx (temp_operands[3]);
19507 mem_3 = copy_rtx (temp_operands[5]);
19508 mem_4 = copy_rtx (temp_operands[7]);
19509 }
19510 else
19511 {
19512 mem_1 = copy_rtx (temp_operands[0]);
19513 mem_2 = copy_rtx (temp_operands[2]);
19514 mem_3 = copy_rtx (temp_operands[4]);
19515 mem_4 = copy_rtx (temp_operands[6]);
19516 gcc_assert (code == UNKNOWN);
19517 }
19518
19519 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19520 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19521 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19522 && offset_3 != NULL_RTX);
19523
19524 /* Adjust the offset so it can fit in an LDP/STP instruction. */
19525 msize = GET_MODE_SIZE (mode);
19526 stp_off_upper_limit = msize * (0x40 - 1);
19527 stp_off_lower_limit = - msize * 0x40;
19528
19529 off_val_1 = INTVAL (offset_1);
19530 off_val_3 = INTVAL (offset_3);
19531
19532 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19533 if (msize <= 4)
19534 base_off = (off_val_1 + off_val_3) / 2;
19535 else
19536 /* However, due to issues with negative LDP/STP offset generation for
19537 larger modes (DF, DI and vector modes), we must not use negative
19538 addresses smaller than 9 signed unadjusted bits can store. This
19539 provides the most range in this case. */
19540 base_off = off_val_1;
19541
19542 /* Adjust the base so that it is aligned with the addresses but still
19543 optimal. */
19544 if (base_off % msize != off_val_1 % msize)
19545 /* Fix the offset, bearing in mind we want to make it bigger not
19546 smaller. */
19547 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19548 else if (msize <= 4)
19549 /* The negative range of LDP/STP is one larger than the positive range. */
19550 base_off += msize;
19551
19552 /* Check if base offset is too big or too small. We can attempt to resolve
19553 this issue by setting it to the maximum value and seeing if the offsets
19554 still fit. */
19555 if (base_off >= 0x1000)
19556 {
19557 base_off = 0x1000 - 1;
19558 /* We must still make sure that the base offset is aligned with respect
19559 to the address. But it may not be made any bigger. */
19560 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19561 }
19562
19563 /* Likewise for the case where the base is too small. */
19564 if (base_off <= -0x1000)
19565 {
19566 base_off = -0x1000 + 1;
19567 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19568 }
19569
19570 /* Offset of the first STP/LDP. */
19571 new_off_1 = off_val_1 - base_off;
19572
19573 /* Offset of the second STP/LDP. */
19574 new_off_3 = off_val_3 - base_off;
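
/* Continuing the illustrative SImode example (msize == 4, off_val_1 ==
   0x100, off_val_3 == 0x108): base_off starts at the midpoint 0x104, which
   is already aligned with off_val_1, so it is bumped by msize to 0x108.
   That gives new_off_1 == -8 and new_off_3 == 0, both comfortably within
   the [-0x100, 0xfc] range computed above.  */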
19575
19576 /* The offsets must be within the range of the LDP/STP instructions. */
19577 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19578 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19579 return false;
19580
19581 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19582 new_off_1), true);
19583 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19584 new_off_1 + msize), true);
19585 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19586 new_off_3), true);
19587 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19588 new_off_3 + msize), true);
19589
19590 if (!aarch64_mem_pair_operand (mem_1, mode)
19591 || !aarch64_mem_pair_operand (mem_3, mode))
19592 return false;
19593
19594 if (code == ZERO_EXTEND)
19595 {
19596 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19597 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19598 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19599 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19600 }
19601 else if (code == SIGN_EXTEND)
19602 {
19603 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19604 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19605 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19606 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19607 }
19608
19609 if (load)
19610 {
19611 operands[0] = temp_operands[0];
19612 operands[1] = mem_1;
19613 operands[2] = temp_operands[2];
19614 operands[3] = mem_2;
19615 operands[4] = temp_operands[4];
19616 operands[5] = mem_3;
19617 operands[6] = temp_operands[6];
19618 operands[7] = mem_4;
19619 }
19620 else
19621 {
19622 operands[0] = mem_1;
19623 operands[1] = temp_operands[1];
19624 operands[2] = mem_2;
19625 operands[3] = temp_operands[3];
19626 operands[4] = mem_3;
19627 operands[5] = temp_operands[5];
19628 operands[6] = mem_4;
19629 operands[7] = temp_operands[7];
19630 }
19631
19632 /* Emit adjusting instruction. */
19633 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19634 /* Emit ldp/stp instructions. */
19635 t1 = gen_rtx_SET (operands[0], operands[1]);
19636 t2 = gen_rtx_SET (operands[2], operands[3]);
19637 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19638 t1 = gen_rtx_SET (operands[4], operands[5]);
19639 t2 = gen_rtx_SET (operands[6], operands[7]);
19640 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19641 return true;
19642 }
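
/* For the illustrative example above this emits (the scratch register name
   is arbitrary):

     add  scratch, xb, 0x108
     stp  w1, w1, [scratch, -8]
     stp  w1, w1, [scratch]

   which stores to the same four words as the original str instructions.  */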
19643
19644 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19645 it isn't worth branching around empty masked ops (including masked
19646 stores). */
19647
19648 static bool
19649 aarch64_empty_mask_is_expensive (unsigned)
19650 {
19651 return false;
19652 }
19653
19654 /* Return true if a pseudo register should be created and used to hold
19655 the GOT address for PIC code. */
19656
19657 bool
19658 aarch64_use_pseudo_pic_reg (void)
19659 {
19660 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19661 }
19662
19663 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19664
19665 static int
19666 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19667 {
19668 switch (XINT (x, 1))
19669 {
19670 case UNSPEC_GOTSMALLPIC:
19671 case UNSPEC_GOTSMALLPIC28K:
19672 case UNSPEC_GOTTINYPIC:
19673 return 0;
19674 default:
19675 break;
19676 }
19677
19678 return default_unspec_may_trap_p (x, flags);
19679 }
19680
19681
19682 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19683 return the log2 of that value. Otherwise return -1. */
19684
19685 int
19686 aarch64_fpconst_pow_of_2 (rtx x)
19687 {
19688 const REAL_VALUE_TYPE *r;
19689
19690 if (!CONST_DOUBLE_P (x))
19691 return -1;
19692
19693 r = CONST_DOUBLE_REAL_VALUE (x);
19694
19695 if (REAL_VALUE_NEGATIVE (*r)
19696 || REAL_VALUE_ISNAN (*r)
19697 || REAL_VALUE_ISINF (*r)
19698 || !real_isinteger (r, DFmode))
19699 return -1;
19700
19701 return exact_log2 (real_to_integer (r));
19702 }
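
/* For example, a CONST_DOUBLE of 8.0 yields 3, while 3.0 (not a power of
   two), 0.5 (not an integer) and -4.0 (negative) all yield -1.  */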
19703
19704 /* If X is a vector of equal CONST_DOUBLE values and that value is
19705 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19706
19707 int
19708 aarch64_vec_fpconst_pow_of_2 (rtx x)
19709 {
19710 int nelts;
19711 if (GET_CODE (x) != CONST_VECTOR
19712 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19713 return -1;
19714
19715 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19716 return -1;
19717
19718 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19719 if (firstval <= 0)
19720 return -1;
19721
19722 for (int i = 1; i < nelts; i++)
19723 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19724 return -1;
19725
19726 return firstval;
19727 }
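
/* For example, a V2DF constant { 8.0, 8.0 } yields 3, while { 8.0, 4.0 }
   yields -1 because the elements differ.  Note that { 1.0, 1.0 } also
   yields -1, since a first-element log2 of 0 is rejected above.  */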
19728
19729 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19730 to float.
19731
19732 __fp16 always promotes through this hook.
19733 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19734 through the generic excess precision logic rather than here. */
19735
19736 static tree
19737 aarch64_promoted_type (const_tree t)
19738 {
19739 if (SCALAR_FLOAT_TYPE_P (t)
19740 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19741 return float_type_node;
19742
19743 return NULL_TREE;
19744 }
19745
19746 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19747
19748 static bool
19749 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19750 optimization_type opt_type)
19751 {
19752 switch (op)
19753 {
19754 case rsqrt_optab:
19755 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19756
19757 default:
19758 return true;
19759 }
19760 }
19761
19762 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19763
19764 static unsigned int
19765 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19766 int *offset)
19767 {
19768 /* Polynomial invariant 1 == (VG / 2) - 1. */
19769 gcc_assert (i == 1);
19770 *factor = 2;
19771 *offset = 1;
19772 return AARCH64_DWARF_VG;
19773 }
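
/* For instance, with a 256-bit SVE vector length the VG register holds 4
   (four 64-bit granules), so the indeterminate evaluates to 4 / 2 - 1 == 1,
   matching the invariant quoted above.  */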
19774
19775 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19776 if MODE is HFmode, and punt to the generic implementation otherwise. */
19777
19778 static bool
19779 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19780 {
19781 return (mode == HFmode
19782 ? true
19783 : default_libgcc_floating_mode_supported_p (mode));
19784 }
19785
19786 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19787 if MODE is HFmode, and punt to the generic implementation otherwise. */
19788
19789 static bool
19790 aarch64_scalar_mode_supported_p (scalar_mode mode)
19791 {
19792 return (mode == HFmode
19793 ? true
19794 : default_scalar_mode_supported_p (mode));
19795 }
19796
19797 /* Set the value of FLT_EVAL_METHOD.
19798 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19799
19800 0: evaluate all operations and constants, whose semantic type has at
19801 most the range and precision of type float, to the range and
19802 precision of float; evaluate all other operations and constants to
19803 the range and precision of the semantic type;
19804
19805 N, where _FloatN is a supported interchange floating type:
19806 evaluate all operations and constants, whose semantic type has at
19807 most the range and precision of the _FloatN type, to the range and
19808 precision of the _FloatN type; evaluate all other operations and
19809 constants to the range and precision of the semantic type;
19810
19811 If we have the ARMv8.2-A extensions then we support _Float16 in native
19812 precision, so we should set this to 16. Otherwise, we support the type,
19813 but want to evaluate expressions in float precision, so set this to
19814 0. */
19815
19816 static enum flt_eval_method
19817 aarch64_excess_precision (enum excess_precision_type type)
19818 {
19819 switch (type)
19820 {
19821 case EXCESS_PRECISION_TYPE_FAST:
19822 case EXCESS_PRECISION_TYPE_STANDARD:
19823 /* We can calculate either in 16-bit range and precision or
19824 32-bit range and precision. Make that decision based on whether
19825 we have native support for the ARMv8.2-A 16-bit floating-point
19826 instructions or not. */
19827 return (TARGET_FP_F16INST
19828 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19829 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19830 case EXCESS_PRECISION_TYPE_IMPLICIT:
19831 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19832 default:
19833 gcc_unreachable ();
19834 }
19835 return FLT_EVAL_METHOD_UNPREDICTABLE;
19836 }
19837
19838 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19839 scheduled for speculative execution. Reject the long-running division
19840 and square-root instructions. */
19841
19842 static bool
19843 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19844 {
19845 switch (get_attr_type (insn))
19846 {
19847 case TYPE_SDIV:
19848 case TYPE_UDIV:
19849 case TYPE_FDIVS:
19850 case TYPE_FDIVD:
19851 case TYPE_FSQRTS:
19852 case TYPE_FSQRTD:
19853 case TYPE_NEON_FP_SQRT_S:
19854 case TYPE_NEON_FP_SQRT_D:
19855 case TYPE_NEON_FP_SQRT_S_Q:
19856 case TYPE_NEON_FP_SQRT_D_Q:
19857 case TYPE_NEON_FP_DIV_S:
19858 case TYPE_NEON_FP_DIV_D:
19859 case TYPE_NEON_FP_DIV_S_Q:
19860 case TYPE_NEON_FP_DIV_D_Q:
19861 return false;
19862 default:
19863 return true;
19864 }
19865 }
19866
19867 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19868
19869 static int
19870 aarch64_compute_pressure_classes (reg_class *classes)
19871 {
19872 int i = 0;
19873 classes[i++] = GENERAL_REGS;
19874 classes[i++] = FP_REGS;
19875 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19876 registers need to go in PR_LO_REGS at some point during their
19877 lifetime. Splitting it into two halves has the effect of making
19878 all predicates count against PR_LO_REGS, so that we try whenever
19879 possible to restrict the number of live predicates to 8. This
19880 greatly reduces the amount of spilling in certain loops. */
19881 classes[i++] = PR_LO_REGS;
19882 classes[i++] = PR_HI_REGS;
19883 return i;
19884 }
19885
19886 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19887
19888 static bool
19889 aarch64_can_change_mode_class (machine_mode from,
19890 machine_mode to, reg_class_t)
19891 {
19892 if (BYTES_BIG_ENDIAN)
19893 {
19894 bool from_sve_p = aarch64_sve_data_mode_p (from);
19895 bool to_sve_p = aarch64_sve_data_mode_p (to);
19896
19897 /* Don't allow changes between SVE data modes and non-SVE modes.
19898 See the comment at the head of aarch64-sve.md for details. */
19899 if (from_sve_p != to_sve_p)
19900 return false;
19901
19902 /* Don't allow changes in element size: lane 0 of the new vector
19903 would not then be lane 0 of the old vector. See the comment
19904 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19905 description.
19906
19907 In the worst case, this forces a register to be spilled in
19908 one mode and reloaded in the other, which handles the
19909 endianness correctly. */
19910 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19911 return false;
19912 }
19913 return true;
19914 }
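
/* For example, on big-endian targets this rejects a change from VNx4SI to
   VNx8HI (both SVE data modes, but 4-byte versus 2-byte elements), whereas
   VNx4SI to VNx4SF (same 4-byte element size) remains allowed.  */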
19915
19916 /* Implement TARGET_EARLY_REMAT_MODES. */
19917
19918 static void
19919 aarch64_select_early_remat_modes (sbitmap modes)
19920 {
19921 /* SVE values are not normally live across a call, so it should be
19922 worth doing early rematerialization even in VL-specific mode. */
19923 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19924 {
19925 machine_mode mode = (machine_mode) i;
19926 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19927 if (vec_flags & VEC_ANY_SVE)
19928 bitmap_set_bit (modes, i);
19929 }
19930 }
19931
19932 /* Override the default target speculation_safe_value. */
19933 static rtx
19934 aarch64_speculation_safe_value (machine_mode mode,
19935 rtx result, rtx val, rtx failval)
19936 {
19937 /* Maybe we should warn if falling back to hard barriers. They are
19938 likely to be noticeably more expensive than the alternative below. */
19939 if (!aarch64_track_speculation)
19940 return default_speculation_safe_value (mode, result, val, failval);
19941
19942 if (!REG_P (val))
19943 val = copy_to_mode_reg (mode, val);
19944
19945 if (!aarch64_reg_or_zero (failval, mode))
19946 failval = copy_to_mode_reg (mode, failval);
19947
19948 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19949 return result;
19950 }
19951
19952 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19953 Look into the tuning structure for an estimate.
19954 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19955 Advanced SIMD 128 bits. */
19956
19957 static HOST_WIDE_INT
19958 aarch64_estimated_poly_value (poly_int64 val)
19959 {
19960 enum aarch64_sve_vector_bits_enum width_source
19961 = aarch64_tune_params.sve_width;
19962
19963 /* If we still don't have an estimate, use the default. */
19964 if (width_source == SVE_SCALABLE)
19965 return default_estimated_poly_value (val);
19966
19967 HOST_WIDE_INT over_128 = width_source - 128;
19968 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19969 }
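
/* As a worked example, if aarch64_tune_params.sve_width indicates 256 bits,
   a poly_int64 of 16 + 16x (e.g. the byte size of an SVE vector) is
   estimated as 16 + 16 * (256 - 128) / 128 == 32.  */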
19970
19971
19972 /* Return true for types that could be supported as SIMD return or
19973 argument types. */
19974
19975 static bool
19976 supported_simd_type (tree t)
19977 {
19978 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19979 {
19980 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19981 return s == 1 || s == 2 || s == 4 || s == 8;
19982 }
19983 return false;
19984 }
19985
19986 /* Return true for types that currently are supported as SIMD return
19987 or argument types. */
19988
19989 static bool
19990 currently_supported_simd_type (tree t, tree b)
19991 {
19992 if (COMPLEX_FLOAT_TYPE_P (t))
19993 return false;
19994
19995 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19996 return false;
19997
19998 return supported_simd_type (t);
19999 }
20000
20001 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20002
20003 static int
20004 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20005 struct cgraph_simd_clone *clonei,
20006 tree base_type, int num)
20007 {
20008 tree t, ret_type, arg_type;
20009 unsigned int elt_bits, vec_bits, count;
20010
20011 if (!TARGET_SIMD)
20012 return 0;
20013
20014 if (clonei->simdlen
20015 && (clonei->simdlen < 2
20016 || clonei->simdlen > 1024
20017 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20018 {
20019 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20020 "unsupported simdlen %d", clonei->simdlen);
20021 return 0;
20022 }
20023
20024 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20025 if (TREE_CODE (ret_type) != VOID_TYPE
20026 && !currently_supported_simd_type (ret_type, base_type))
20027 {
20028 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20029 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20030 "GCC does not currently support mixed size types "
20031 "for %<simd%> functions");
20032 else if (supported_simd_type (ret_type))
20033 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20034 "GCC does not currently support return type %qT "
20035 "for %<simd%> functions", ret_type);
20036 else
20037 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20038 "unsupported return type %qT for %<simd%> functions",
20039 ret_type);
20040 return 0;
20041 }
20042
20043 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20044 {
20045 arg_type = TREE_TYPE (t);
20046
20047 if (!currently_supported_simd_type (arg_type, base_type))
20048 {
20049 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20050 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20051 "GCC does not currently support mixed size types "
20052 "for %<simd%> functions");
20053 else
20054 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20055 "GCC does not currently support argument type %qT "
20056 "for %<simd%> functions", arg_type);
20057 return 0;
20058 }
20059 }
20060
20061 clonei->vecsize_mangle = 'n';
20062 clonei->mask_mode = VOIDmode;
20063 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20064 if (clonei->simdlen == 0)
20065 {
20066 count = 2;
20067 vec_bits = (num == 0 ? 64 : 128);
20068 clonei->simdlen = vec_bits / elt_bits;
20069 }
20070 else
20071 {
20072 count = 1;
20073 vec_bits = clonei->simdlen * elt_bits;
20074 if (vec_bits != 64 && vec_bits != 128)
20075 {
20076 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20077 "GCC does not currently support simdlen %d for type %qT",
20078 clonei->simdlen, base_type);
20079 return 0;
20080 }
20081 }
20082 clonei->vecsize_int = vec_bits;
20083 clonei->vecsize_float = vec_bits;
20084 return count;
20085 }
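
/* As an illustration, for a function marked for SIMD cloning whose base
   type is float and which has no explicit simdlen, elt_bits is 32 and two
   clones are advertised: num == 0 gives a 64-bit clone with simdlen 2 and
   num == 1 gives a 128-bit clone with simdlen 4.  */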
20086
20087 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20088
20089 static void
20090 aarch64_simd_clone_adjust (struct cgraph_node *node)
20091 {
20092 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20093 use the correct ABI. */
20094
20095 tree t = TREE_TYPE (node->decl);
20096 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20097 TYPE_ATTRIBUTES (t));
20098 }
20099
20100 /* Implement TARGET_SIMD_CLONE_USABLE. */
20101
20102 static int
20103 aarch64_simd_clone_usable (struct cgraph_node *node)
20104 {
20105 switch (node->simdclone->vecsize_mangle)
20106 {
20107 case 'n':
20108 if (!TARGET_SIMD)
20109 return -1;
20110 return 0;
20111 default:
20112 gcc_unreachable ();
20113 }
20114 }
20115
20116 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
20117
20118 static int
20119 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20120 {
20121 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20122 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20123 return 0;
20124 return 1;
20125 }
20126
20127 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
20128
20129 static const char *
20130 aarch64_get_multilib_abi_name (void)
20131 {
20132 if (TARGET_BIG_END)
20133 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20134 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20135 }
20136
20137 /* Implement TARGET_STACK_PROTECT_GUARD. If the guard is based on a
20138 global variable, use the default implementation; otherwise
20139 return a null tree. */
20140 static tree
20141 aarch64_stack_protect_guard (void)
20142 {
20143 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20144 return default_stack_protect_guard ();
20145
20146 return NULL_TREE;
20147 }
20148
20149 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20150 section at the end if needed. */
20151 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20152 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20153 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20154 void
20155 aarch64_file_end_indicate_exec_stack ()
20156 {
20157 file_end_indicate_exec_stack ();
20158
20159 unsigned feature_1_and = 0;
20160 if (aarch64_bti_enabled ())
20161 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20162
20163 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20164 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20165
20166 if (feature_1_and)
20167 {
20168 /* Generate .note.gnu.property section. */
20169 switch_to_section (get_section (".note.gnu.property",
20170 SECTION_NOTYPE, NULL));
20171
20172 /* PT_NOTE header: namesz, descsz, type.
20173 namesz = 4 ("GNU\0")
20174 descsz = 16 (Size of the program property array)
20175 [(12 + padding) * Number of array elements]
20176 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20177 assemble_align (POINTER_SIZE);
20178 assemble_integer (GEN_INT (4), 4, 32, 1);
20179 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20180 assemble_integer (GEN_INT (5), 4, 32, 1);
20181
20182 /* PT_NOTE name. */
20183 assemble_string ("GNU", 4);
20184
20185 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20186 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20187 datasz = 4
20188 data = feature_1_and. */
20189 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20190 assemble_integer (GEN_INT (4), 4, 32, 1);
20191 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20192
20193 /* Pad the size of the note to the required alignment. */
20194 assemble_align (POINTER_SIZE);
20195 }
20196 }
20197 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20198 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20199 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
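
/* For reference, on an LP64 target with both BTI and PAC enabled the note
   emitted above amounts to roughly the following (directives shown only
   approximately):

     .section  .note.gnu.property
     .align    3
     .word     4                  // namesz ("GNU\0")
     .word     16                 // descsz, ROUND_UP (12, 8)
     .word     5                  // NT_GNU_PROPERTY_TYPE_0
     .string   "GNU"
     .word     0xc0000000         // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word     4                  // datasz
     .word     3                  // BTI | PAC
     .align    3  */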
20200
20201 /* Target-specific selftests. */
20202
20203 #if CHECKING_P
20204
20205 namespace selftest {
20206
20207 /* Selftest for the RTL loader.
20208 Verify that the RTL loader copes with a dump from
20209 print_rtx_function. This is essentially just a test that class
20210 function_reader can handle a real dump, but it also verifies
20211 that lookup_reg_by_dump_name correctly handles hard regs.
20212 The presence of hard reg names in the dump means that the test is
20213 target-specific, hence it is in this file. */
20214
20215 static void
20216 aarch64_test_loading_full_dump ()
20217 {
20218 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20219
20220 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20221
20222 rtx_insn *insn_1 = get_insn_by_uid (1);
20223 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20224
20225 rtx_insn *insn_15 = get_insn_by_uid (15);
20226 ASSERT_EQ (INSN, GET_CODE (insn_15));
20227 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20228
20229 /* Verify crtl->return_rtx. */
20230 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20231 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20232 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20233 }
20234
20235 /* Run all target-specific selftests. */
20236
20237 static void
20238 aarch64_run_selftests (void)
20239 {
20240 aarch64_test_loading_full_dump ();
20241 }
20242
20243 } // namespace selftest
20244
20245 #endif /* #if CHECKING_P */
20246
20247 #undef TARGET_STACK_PROTECT_GUARD
20248 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20249
20250 #undef TARGET_ADDRESS_COST
20251 #define TARGET_ADDRESS_COST aarch64_address_cost
20252
20253 /* This hook determines whether unnamed bitfields affect the alignment
20254 of the containing structure. The hook returns true if the structure
20255 should inherit the alignment requirements of an unnamed bitfield's
20256 type. */
20257 #undef TARGET_ALIGN_ANON_BITFIELD
20258 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20259
20260 #undef TARGET_ASM_ALIGNED_DI_OP
20261 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20262
20263 #undef TARGET_ASM_ALIGNED_HI_OP
20264 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20265
20266 #undef TARGET_ASM_ALIGNED_SI_OP
20267 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20268
20269 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20270 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20271 hook_bool_const_tree_hwi_hwi_const_tree_true
20272
20273 #undef TARGET_ASM_FILE_START
20274 #define TARGET_ASM_FILE_START aarch64_start_file
20275
20276 #undef TARGET_ASM_OUTPUT_MI_THUNK
20277 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20278
20279 #undef TARGET_ASM_SELECT_RTX_SECTION
20280 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20281
20282 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20283 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20284
20285 #undef TARGET_BUILD_BUILTIN_VA_LIST
20286 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20287
20288 #undef TARGET_CALLEE_COPIES
20289 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20290
20291 #undef TARGET_CAN_ELIMINATE
20292 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20293
20294 #undef TARGET_CAN_INLINE_P
20295 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20296
20297 #undef TARGET_CANNOT_FORCE_CONST_MEM
20298 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20299
20300 #undef TARGET_CASE_VALUES_THRESHOLD
20301 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20302
20303 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20304 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20305
20306 /* Only the least significant bit is used for initialization guard
20307 variables. */
20308 #undef TARGET_CXX_GUARD_MASK_BIT
20309 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20310
20311 #undef TARGET_C_MODE_FOR_SUFFIX
20312 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20313
20314 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20315 #undef TARGET_DEFAULT_TARGET_FLAGS
20316 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20317 #endif
20318
20319 #undef TARGET_CLASS_MAX_NREGS
20320 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20321
20322 #undef TARGET_BUILTIN_DECL
20323 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20324
20325 #undef TARGET_BUILTIN_RECIPROCAL
20326 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20327
20328 #undef TARGET_C_EXCESS_PRECISION
20329 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20330
20331 #undef TARGET_EXPAND_BUILTIN
20332 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20333
20334 #undef TARGET_EXPAND_BUILTIN_VA_START
20335 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20336
20337 #undef TARGET_FOLD_BUILTIN
20338 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20339
20340 #undef TARGET_FUNCTION_ARG
20341 #define TARGET_FUNCTION_ARG aarch64_function_arg
20342
20343 #undef TARGET_FUNCTION_ARG_ADVANCE
20344 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20345
20346 #undef TARGET_FUNCTION_ARG_BOUNDARY
20347 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20348
20349 #undef TARGET_FUNCTION_ARG_PADDING
20350 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20351
20352 #undef TARGET_GET_RAW_RESULT_MODE
20353 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20354 #undef TARGET_GET_RAW_ARG_MODE
20355 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20356
20357 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20358 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20359
20360 #undef TARGET_FUNCTION_VALUE
20361 #define TARGET_FUNCTION_VALUE aarch64_function_value
20362
20363 #undef TARGET_FUNCTION_VALUE_REGNO_P
20364 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20365
20366 #undef TARGET_GIMPLE_FOLD_BUILTIN
20367 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20368
20369 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20370 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20371
20372 #undef TARGET_INIT_BUILTINS
20373 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20374
20375 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20376 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20377 aarch64_ira_change_pseudo_allocno_class
20378
20379 #undef TARGET_LEGITIMATE_ADDRESS_P
20380 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20381
20382 #undef TARGET_LEGITIMATE_CONSTANT_P
20383 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20384
20385 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20386 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20387 aarch64_legitimize_address_displacement
20388
20389 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20390 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20391
20392 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20393 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20394 aarch64_libgcc_floating_mode_supported_p
20395
20396 #undef TARGET_MANGLE_TYPE
20397 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20398
20399 #undef TARGET_MEMORY_MOVE_COST
20400 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20401
20402 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20403 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20404
20405 #undef TARGET_MUST_PASS_IN_STACK
20406 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20407
20408 /* This target hook should return true if accesses to volatile bitfields
20409 should use the narrowest mode possible. It should return false if these
20410 accesses should use the bitfield container type. */
20411 #undef TARGET_NARROW_VOLATILE_BITFIELD
20412 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20413
20414 #undef TARGET_OPTION_OVERRIDE
20415 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20416
20417 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20418 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20419 aarch64_override_options_after_change
20420
20421 #undef TARGET_OPTION_SAVE
20422 #define TARGET_OPTION_SAVE aarch64_option_save
20423
20424 #undef TARGET_OPTION_RESTORE
20425 #define TARGET_OPTION_RESTORE aarch64_option_restore
20426
20427 #undef TARGET_OPTION_PRINT
20428 #define TARGET_OPTION_PRINT aarch64_option_print
20429
20430 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20431 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20432
20433 #undef TARGET_SET_CURRENT_FUNCTION
20434 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20435
20436 #undef TARGET_PASS_BY_REFERENCE
20437 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20438
20439 #undef TARGET_PREFERRED_RELOAD_CLASS
20440 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20441
20442 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20443 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20444
20445 #undef TARGET_PROMOTED_TYPE
20446 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20447
20448 #undef TARGET_SECONDARY_RELOAD
20449 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20450
20451 #undef TARGET_SHIFT_TRUNCATION_MASK
20452 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20453
20454 #undef TARGET_SETUP_INCOMING_VARARGS
20455 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20456
20457 #undef TARGET_STRUCT_VALUE_RTX
20458 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20459
20460 #undef TARGET_REGISTER_MOVE_COST
20461 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20462
20463 #undef TARGET_RETURN_IN_MEMORY
20464 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20465
20466 #undef TARGET_RETURN_IN_MSB
20467 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20468
20469 #undef TARGET_RTX_COSTS
20470 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20471
20472 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20473 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20474
20475 #undef TARGET_SCHED_ISSUE_RATE
20476 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20477
20478 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20479 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20480 aarch64_sched_first_cycle_multipass_dfa_lookahead
20481
20482 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20483 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20484 aarch64_first_cycle_multipass_dfa_lookahead_guard
20485
20486 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20487 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20488 aarch64_get_separate_components
20489
20490 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20491 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20492 aarch64_components_for_bb
20493
20494 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20495 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20496 aarch64_disqualify_components
20497
20498 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20499 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20500 aarch64_emit_prologue_components
20501
20502 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20503 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20504 aarch64_emit_epilogue_components
20505
20506 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20507 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20508 aarch64_set_handled_components
20509
20510 #undef TARGET_TRAMPOLINE_INIT
20511 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20512
20513 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20514 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20515
20516 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20517 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20518
20519 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20520 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20521 aarch64_builtin_support_vector_misalignment
20522
20523 #undef TARGET_ARRAY_MODE
20524 #define TARGET_ARRAY_MODE aarch64_array_mode
20525
20526 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20527 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20528
20529 #undef TARGET_VECTORIZE_ADD_STMT_COST
20530 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20531
20532 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20533 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20534 aarch64_builtin_vectorization_cost
20535
20536 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20537 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20538
20539 #undef TARGET_VECTORIZE_BUILTINS
20540 #define TARGET_VECTORIZE_BUILTINS
20541
20542 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20543 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20544 aarch64_builtin_vectorized_function
20545
20546 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20547 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20548 aarch64_autovectorize_vector_sizes
20549
20550 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20551 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20552 aarch64_atomic_assign_expand_fenv
20553
20554 /* Section anchor support. */
20555
20556 #undef TARGET_MIN_ANCHOR_OFFSET
20557 #define TARGET_MIN_ANCHOR_OFFSET -256
20558
20559 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20560 byte offset; we can do much more for larger data types, but have no way
20561 to determine the size of the access. We assume accesses are aligned. */
20562 #undef TARGET_MAX_ANCHOR_OFFSET
20563 #define TARGET_MAX_ANCHOR_OFFSET 4095
20564
20565 #undef TARGET_VECTOR_ALIGNMENT
20566 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20567
20568 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20569 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20570 aarch64_vectorize_preferred_vector_alignment
20571 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20572 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20573 aarch64_simd_vector_alignment_reachable
20574
20575 /* vec_perm support. */
20576
20577 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20578 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20579 aarch64_vectorize_vec_perm_const
20580
20581 #undef TARGET_VECTORIZE_GET_MASK_MODE
20582 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20583 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20584 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20585 aarch64_empty_mask_is_expensive
20586 #undef TARGET_PREFERRED_ELSE_VALUE
20587 #define TARGET_PREFERRED_ELSE_VALUE \
20588 aarch64_preferred_else_value
20589
20590 #undef TARGET_INIT_LIBFUNCS
20591 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20592
20593 #undef TARGET_FIXED_CONDITION_CODE_REGS
20594 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20595
20596 #undef TARGET_FLAGS_REGNUM
20597 #define TARGET_FLAGS_REGNUM CC_REGNUM
20598
20599 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20600 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20601
20602 #undef TARGET_ASAN_SHADOW_OFFSET
20603 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20604
20605 #undef TARGET_LEGITIMIZE_ADDRESS
20606 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20607
20608 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20609 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20610
20611 #undef TARGET_CAN_USE_DOLOOP_P
20612 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20613
20614 #undef TARGET_SCHED_ADJUST_PRIORITY
20615 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20616
20617 #undef TARGET_SCHED_MACRO_FUSION_P
20618 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20619
20620 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20621 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20622
20623 #undef TARGET_SCHED_FUSION_PRIORITY
20624 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20625
20626 #undef TARGET_UNSPEC_MAY_TRAP_P
20627 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20628
20629 #undef TARGET_USE_PSEUDO_PIC_REG
20630 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20631
20632 #undef TARGET_PRINT_OPERAND
20633 #define TARGET_PRINT_OPERAND aarch64_print_operand
20634
20635 #undef TARGET_PRINT_OPERAND_ADDRESS
20636 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20637
20638 #undef TARGET_OPTAB_SUPPORTED_P
20639 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20640
20641 #undef TARGET_OMIT_STRUCT_RETURN_REG
20642 #define TARGET_OMIT_STRUCT_RETURN_REG true
20643
20644 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20645 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20646 aarch64_dwarf_poly_indeterminate_value
20647
20648 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20649 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20650 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20651
20652 #undef TARGET_HARD_REGNO_NREGS
20653 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20654 #undef TARGET_HARD_REGNO_MODE_OK
20655 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20656
20657 #undef TARGET_MODES_TIEABLE_P
20658 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20659
20660 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20661 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20662 aarch64_hard_regno_call_part_clobbered
20663
20664 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20665 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20666 aarch64_remove_extra_call_preserved_regs
20667
20668 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20669 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20670 aarch64_return_call_with_max_clobbers
20671
20672 #undef TARGET_CONSTANT_ALIGNMENT
20673 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20674
20675 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20676 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20677 aarch64_stack_clash_protection_alloca_probe_range
20678
20679 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20680 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20681
20682 #undef TARGET_CAN_CHANGE_MODE_CLASS
20683 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20684
20685 #undef TARGET_SELECT_EARLY_REMAT_MODES
20686 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20687
20688 #undef TARGET_SPECULATION_SAFE_VALUE
20689 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20690
20691 #undef TARGET_ESTIMATED_POLY_VALUE
20692 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20693
20694 #undef TARGET_ATTRIBUTE_TABLE
20695 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20696
20697 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20698 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20699 aarch64_simd_clone_compute_vecsize_and_simdlen
20700
20701 #undef TARGET_SIMD_CLONE_ADJUST
20702 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20703
20704 #undef TARGET_SIMD_CLONE_USABLE
20705 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20706
20707 #undef TARGET_COMP_TYPE_ATTRIBUTES
20708 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20709
20710 #undef TARGET_GET_MULTILIB_ABI_NAME
20711 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20712
20713 #if CHECKING_P
20714 #undef TARGET_RUN_TARGET_SELFTESTS
20715 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20716 #endif /* #if CHECKING_P */
20717
20718 #undef TARGET_ASM_POST_CFI_STARTPROC
20719 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20720
20721 struct gcc_target targetm = TARGET_INITIALIZER;
20722
20723 #include "gt-aarch64.h"