1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The instruction to use to move the immediate into a vector. */
100 insn_type insn;
101
102 union
103 {
104 /* For MOV and MVN. */
105 struct
106 {
107 /* The value of each element. */
108 rtx value;
109
110 /* The kind of shift modifier to use, and the number of bits to shift.
111 This is (LSL, 0) if no shift is needed. */
112 modifier_type modifier;
113 unsigned int shift;
114 } mov;
115
116 /* For INDEX. */
117 struct
118 {
119 /* The value of the first element and the step to be added for each
120 subsequent element. */
121 rtx base, step;
122 } index;
123 } u;
124 };
125
126 /* Construct a floating-point immediate in which each element has mode
127 ELT_MODE_IN and value VALUE_IN. */
128 inline simd_immediate_info
129 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
130 : elt_mode (elt_mode_in), insn (MOV)
131 {
132 u.mov.value = value_in;
133 u.mov.modifier = LSL;
134 u.mov.shift = 0;
135 }
136
137 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
138 and value VALUE_IN. The other parameters are as for the structure
139 fields. */
140 inline simd_immediate_info
141 ::simd_immediate_info (scalar_int_mode elt_mode_in,
142 unsigned HOST_WIDE_INT value_in,
143 insn_type insn_in, modifier_type modifier_in,
144 unsigned int shift_in)
145 : elt_mode (elt_mode_in), insn (insn_in)
146 {
147 u.mov.value = gen_int_mode (value_in, elt_mode_in);
148 u.mov.modifier = modifier_in;
149 u.mov.shift = shift_in;
150 }
151
152 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
153 and where element I is equal to BASE_IN + I * STEP_IN. */
154 inline simd_immediate_info
155 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
156 : elt_mode (elt_mode_in), insn (INDEX)
157 {
158 u.index.base = base_in;
159 u.index.step = step_in;
160 }
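/* As a rough illustration (the values here are made up for exposition and
   do not appear in this file), a caller describing the linear series
   { 0, 1, 2, ... } with 32-bit elements could use the INDEX form:

     simd_immediate_info info (SImode, const0_rtx, const1_rtx);

   while a splat of 0x55 into 16-bit elements, shifted left by 8 bits,
   would use the integer form:

     simd_immediate_info info (HImode, 0x55, simd_immediate_info::MOV,
			       simd_immediate_info::LSL, 8);  */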
161
162 /* The current code model. */
163 enum aarch64_code_model aarch64_cmodel;
164
165 /* The number of 64-bit elements in an SVE vector. */
166 poly_uint16 aarch64_sve_vg;
167
168 #ifdef HAVE_AS_TLS
169 #undef TARGET_HAVE_TLS
170 #define TARGET_HAVE_TLS 1
171 #endif
172
173 static bool aarch64_composite_type_p (const_tree, machine_mode);
174 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
175 const_tree,
176 machine_mode *, int *,
177 bool *);
178 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
179 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
180 static void aarch64_override_options_after_change (void);
181 static bool aarch64_vector_mode_supported_p (machine_mode);
182 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
183 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
184 const_tree type,
185 int misalignment,
186 bool is_packed);
187 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
188 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
189 aarch64_addr_query_type);
190 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
191
192 /* Major revision number of the ARM Architecture implemented by the target. */
193 unsigned aarch64_architecture_version;
194
195 /* The processor for which instructions should be scheduled. */
196 enum aarch64_processor aarch64_tune = cortexa53;
197
198 /* Mask to specify which instruction scheduling options should be used. */
199 uint64_t aarch64_tune_flags = 0;
200
201 /* Global flag for PC relative loads. */
202 bool aarch64_pcrelative_literal_loads;
203
204 /* Global flag for whether frame pointer is enabled. */
205 bool aarch64_use_frame_pointer;
206
207 #define BRANCH_PROTECT_STR_MAX 255
208 char *accepted_branch_protection_string = NULL;
209
210 static enum aarch64_parse_opt_result
211 aarch64_parse_branch_protection (const char*, char**);
212
213 /* Support for command line parsing of boolean flags in the tuning
214 structures. */
215 struct aarch64_flag_desc
216 {
217 const char* name;
218 unsigned int flag;
219 };
220
221 #define AARCH64_FUSION_PAIR(name, internal_name) \
222 { name, AARCH64_FUSE_##internal_name },
223 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
224 {
225 { "none", AARCH64_FUSE_NOTHING },
226 #include "aarch64-fusion-pairs.def"
227 { "all", AARCH64_FUSE_ALL },
228 { NULL, AARCH64_FUSE_NOTHING }
229 };
230
231 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
232 { name, AARCH64_EXTRA_TUNE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
234 {
235 { "none", AARCH64_EXTRA_TUNE_NONE },
236 #include "aarch64-tuning-flags.def"
237 { "all", AARCH64_EXTRA_TUNE_ALL },
238 { NULL, AARCH64_EXTRA_TUNE_NONE }
239 };
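/* For reference, each entry in the two .def files included above is an
   X-macro invocation; an entry roughly of the form
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) expands under the macro
   above to the initializer

     { "mov+movk", AARCH64_FUSE_MOV_MOVK },

   so both tables simply map the user-visible option names onto their
   bitmask values, bracketed by the "none" and "all" entries.  */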
240
241 /* Tuning parameters. */
242
243 static const struct cpu_addrcost_table generic_addrcost_table =
244 {
245 {
246 1, /* hi */
247 0, /* si */
248 0, /* di */
249 1, /* ti */
250 },
251 0, /* pre_modify */
252 0, /* post_modify */
253 0, /* register_offset */
254 0, /* register_sextend */
255 0, /* register_zextend */
256 0 /* imm_offset */
257 };
258
259 static const struct cpu_addrcost_table exynosm1_addrcost_table =
260 {
261 {
262 0, /* hi */
263 0, /* si */
264 0, /* di */
265 2, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 1, /* register_offset */
270 1, /* register_sextend */
271 2, /* register_zextend */
272 0, /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table xgene1_addrcost_table =
276 {
277 {
278 1, /* hi */
279 0, /* si */
280 0, /* di */
281 1, /* ti */
282 },
283 1, /* pre_modify */
284 1, /* post_modify */
285 0, /* register_offset */
286 1, /* register_sextend */
287 1, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
292 {
293 {
294 1, /* hi */
295 1, /* si */
296 1, /* di */
297 2, /* ti */
298 },
299 0, /* pre_modify */
300 0, /* post_modify */
301 2, /* register_offset */
302 3, /* register_sextend */
303 3, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table tsv110_addrcost_table =
308 {
309 {
310 1, /* hi */
311 0, /* si */
312 0, /* di */
313 1, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 0, /* register_offset */
318 1, /* register_sextend */
319 1, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
324 {
325 {
326 1, /* hi */
327 1, /* si */
328 1, /* di */
329 2, /* ti */
330 },
331 1, /* pre_modify */
332 1, /* post_modify */
333 3, /* register_offset */
334 3, /* register_sextend */
335 3, /* register_zextend */
336 2, /* imm_offset */
337 };
338
339 static const struct cpu_regmove_cost generic_regmove_cost =
340 {
341 1, /* GP2GP */
342 /* Avoid the use of slow int<->fp moves for spilling by setting
343 their cost higher than memmov_cost. */
344 5, /* GP2FP */
345 5, /* FP2GP */
346 2 /* FP2FP */
347 };
348
349 static const struct cpu_regmove_cost cortexa57_regmove_cost =
350 {
351 1, /* GP2GP */
352 /* Avoid the use of slow int<->fp moves for spilling by setting
353 their cost higher than memmov_cost. */
354 5, /* GP2FP */
355 5, /* FP2GP */
356 2 /* FP2FP */
357 };
358
359 static const struct cpu_regmove_cost cortexa53_regmove_cost =
360 {
361 1, /* GP2GP */
362 /* Avoid the use of slow int<->fp moves for spilling by setting
363 their cost higher than memmov_cost. */
364 5, /* GP2FP */
365 5, /* FP2GP */
366 2 /* FP2FP */
367 };
368
369 static const struct cpu_regmove_cost exynosm1_regmove_cost =
370 {
371 1, /* GP2GP */
372 /* Avoid the use of slow int<->fp moves for spilling by setting
373 their cost higher than memmov_cost (the actual costs are 4 and 9). */
374 9, /* GP2FP */
375 9, /* FP2GP */
376 1 /* FP2FP */
377 };
378
379 static const struct cpu_regmove_cost thunderx_regmove_cost =
380 {
381 2, /* GP2GP */
382 2, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost xgene1_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of slow int<->fp moves for spilling by setting
391 their cost higher than memmov_cost. */
392 8, /* GP2FP */
393 8, /* FP2GP */
394 2 /* FP2FP */
395 };
396
397 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
398 {
399 2, /* GP2GP */
400 /* Avoid the use of int<->fp moves for spilling. */
401 6, /* GP2FP */
402 6, /* FP2GP */
403 4 /* FP2FP */
404 };
405
406 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
407 {
408 1, /* GP2GP */
409 /* Avoid the use of int<->fp moves for spilling. */
410 8, /* GP2FP */
411 8, /* FP2GP */
412 4 /* FP2FP */
413 };
414
415 static const struct cpu_regmove_cost tsv110_regmove_cost =
416 {
417 1, /* GP2GP */
418 /* Avoid the use of slow int<->fp moves for spilling by setting
419 their cost higher than memmov_cost. */
420 2, /* GP2FP */
421 3, /* FP2GP */
422 2 /* FP2FP */
423 };
424
425 /* Generic costs for vector insn classes. */
426 static const struct cpu_vector_cost generic_vector_cost =
427 {
428 1, /* scalar_int_stmt_cost */
429 1, /* scalar_fp_stmt_cost */
430 1, /* scalar_load_cost */
431 1, /* scalar_store_cost */
432 1, /* vec_int_stmt_cost */
433 1, /* vec_fp_stmt_cost */
434 2, /* vec_permute_cost */
435 1, /* vec_to_scalar_cost */
436 1, /* scalar_to_vec_cost */
437 1, /* vec_align_load_cost */
438 1, /* vec_unalign_load_cost */
439 1, /* vec_unalign_store_cost */
440 1, /* vec_store_cost */
441 3, /* cond_taken_branch_cost */
442 1 /* cond_not_taken_branch_cost */
443 };
444
445 /* QDF24XX costs for vector insn classes. */
446 static const struct cpu_vector_cost qdf24xx_vector_cost =
447 {
448 1, /* scalar_int_stmt_cost */
449 1, /* scalar_fp_stmt_cost */
450 1, /* scalar_load_cost */
451 1, /* scalar_store_cost */
452 1, /* vec_int_stmt_cost */
453 3, /* vec_fp_stmt_cost */
454 2, /* vec_permute_cost */
455 1, /* vec_to_scalar_cost */
456 1, /* scalar_to_vec_cost */
457 1, /* vec_align_load_cost */
458 1, /* vec_unalign_load_cost */
459 1, /* vec_unalign_store_cost */
460 1, /* vec_store_cost */
461 3, /* cond_taken_branch_cost */
462 1 /* cond_not_taken_branch_cost */
463 };
464
465 /* ThunderX costs for vector insn classes. */
466 static const struct cpu_vector_cost thunderx_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 3, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 4, /* vec_int_stmt_cost */
473 1, /* vec_fp_stmt_cost */
474 4, /* vec_permute_cost */
475 2, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 3, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 5, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 3, /* cond_taken_branch_cost */
482 3 /* cond_not_taken_branch_cost */
483 };
484
485 static const struct cpu_vector_cost tsv110_vector_cost =
486 {
487 1, /* scalar_int_stmt_cost */
488 1, /* scalar_fp_stmt_cost */
489 5, /* scalar_load_cost */
490 1, /* scalar_store_cost */
491 2, /* vec_int_stmt_cost */
492 2, /* vec_fp_stmt_cost */
493 2, /* vec_permute_cost */
494 3, /* vec_to_scalar_cost */
495 2, /* scalar_to_vec_cost */
496 5, /* vec_align_load_cost */
497 5, /* vec_unalign_load_cost */
498 1, /* vec_unalign_store_cost */
499 1, /* vec_store_cost */
500 1, /* cond_taken_branch_cost */
501 1 /* cond_not_taken_branch_cost */
502 };
503
504 /* Cortex-A57 costs for vector insn classes. */
505 static const struct cpu_vector_cost cortexa57_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 4, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 2, /* vec_int_stmt_cost */
512 2, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 8, /* vec_to_scalar_cost */
515 8, /* scalar_to_vec_cost */
516 4, /* vec_align_load_cost */
517 4, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 static const struct cpu_vector_cost exynosm1_vector_cost =
525 {
526 1, /* scalar_int_stmt_cost */
527 1, /* scalar_fp_stmt_cost */
528 5, /* scalar_load_cost */
529 1, /* scalar_store_cost */
530 3, /* vec_int_stmt_cost */
531 3, /* vec_fp_stmt_cost */
532 3, /* vec_permute_cost */
533 3, /* vec_to_scalar_cost */
534 3, /* scalar_to_vec_cost */
535 5, /* vec_align_load_cost */
536 5, /* vec_unalign_load_cost */
537 1, /* vec_unalign_store_cost */
538 1, /* vec_store_cost */
539 1, /* cond_taken_branch_cost */
540 1 /* cond_not_taken_branch_cost */
541 };
542
543 /* X-Gene 1 costs for vector insn classes. */
544 static const struct cpu_vector_cost xgene1_vector_cost =
545 {
546 1, /* scalar_int_stmt_cost */
547 1, /* scalar_fp_stmt_cost */
548 5, /* scalar_load_cost */
549 1, /* scalar_store_cost */
550 2, /* vec_int_stmt_cost */
551 2, /* vec_fp_stmt_cost */
552 2, /* vec_permute_cost */
553 4, /* vec_to_scalar_cost */
554 4, /* scalar_to_vec_cost */
555 10, /* vec_align_load_cost */
556 10, /* vec_unalign_load_cost */
557 2, /* vec_unalign_store_cost */
558 2, /* vec_store_cost */
559 2, /* cond_taken_branch_cost */
560 1 /* cond_not_taken_branch_cost */
561 };
562
563 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
564 static const struct cpu_vector_cost thunderx2t99_vector_cost =
565 {
566 1, /* scalar_int_stmt_cost */
567 6, /* scalar_fp_stmt_cost */
568 4, /* scalar_load_cost */
569 1, /* scalar_store_cost */
570 5, /* vec_int_stmt_cost */
571 6, /* vec_fp_stmt_cost */
572 3, /* vec_permute_cost */
573 6, /* vec_to_scalar_cost */
574 5, /* scalar_to_vec_cost */
575 8, /* vec_align_load_cost */
576 8, /* vec_unalign_load_cost */
577 4, /* vec_unalign_store_cost */
578 4, /* vec_store_cost */
579 2, /* cond_taken_branch_cost */
580 1 /* cond_not_taken_branch_cost */
581 };
582
583 /* Generic costs for branch instructions. */
584 static const struct cpu_branch_cost generic_branch_cost =
585 {
586 1, /* Predictable. */
587 3 /* Unpredictable. */
588 };
589
590 /* Generic approximation modes. */
591 static const cpu_approx_modes generic_approx_modes =
592 {
593 AARCH64_APPROX_NONE, /* division */
594 AARCH64_APPROX_NONE, /* sqrt */
595 AARCH64_APPROX_NONE /* recip_sqrt */
596 };
597
598 /* Approximation modes for Exynos M1. */
599 static const cpu_approx_modes exynosm1_approx_modes =
600 {
601 AARCH64_APPROX_NONE, /* division */
602 AARCH64_APPROX_ALL, /* sqrt */
603 AARCH64_APPROX_ALL /* recip_sqrt */
604 };
605
606 /* Approximation modes for X-Gene 1. */
607 static const cpu_approx_modes xgene1_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_ALL /* recip_sqrt */
612 };
613
614 /* Generic prefetch settings (which disable prefetch). */
615 static const cpu_prefetch_tune generic_prefetch_tune =
616 {
617 0, /* num_slots */
618 -1, /* l1_cache_size */
619 -1, /* l1_cache_line_size */
620 -1, /* l2_cache_size */
621 true, /* prefetch_dynamic_strides */
622 -1, /* minimum_stride */
623 -1 /* default_opt_level */
624 };
625
626 static const cpu_prefetch_tune exynosm1_prefetch_tune =
627 {
628 0, /* num_slots */
629 -1, /* l1_cache_size */
630 64, /* l1_cache_line_size */
631 -1, /* l2_cache_size */
632 true, /* prefetch_dynamic_strides */
633 -1, /* minimum_stride */
634 -1 /* default_opt_level */
635 };
636
637 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
638 {
639 4, /* num_slots */
640 32, /* l1_cache_size */
641 64, /* l1_cache_line_size */
642 512, /* l2_cache_size */
643 false, /* prefetch_dynamic_strides */
644 2048, /* minimum_stride */
645 3 /* default_opt_level */
646 };
647
648 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
649 {
650 8, /* num_slots */
651 32, /* l1_cache_size */
652 128, /* l1_cache_line_size */
653 16*1024, /* l2_cache_size */
654 true, /* prefetch_dynamic_strides */
655 -1, /* minimum_stride */
656 3 /* default_opt_level */
657 };
658
659 static const cpu_prefetch_tune thunderx_prefetch_tune =
660 {
661 8, /* num_slots */
662 32, /* l1_cache_size */
663 128, /* l1_cache_line_size */
664 -1, /* l2_cache_size */
665 true, /* prefetch_dynamic_strides */
666 -1, /* minimum_stride */
667 -1 /* default_opt_level */
668 };
669
670 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
671 {
672 8, /* num_slots */
673 32, /* l1_cache_size */
674 64, /* l1_cache_line_size */
675 256, /* l2_cache_size */
676 true, /* prefetch_dynamic_strides */
677 -1, /* minimum_stride */
678 -1 /* default_opt_level */
679 };
680
681 static const cpu_prefetch_tune tsv110_prefetch_tune =
682 {
683 0, /* num_slots */
684 64, /* l1_cache_size */
685 64, /* l1_cache_line_size */
686 512, /* l2_cache_size */
687 true, /* prefetch_dynamic_strides */
688 -1, /* minimum_stride */
689 -1 /* default_opt_level */
690 };
691
692 static const cpu_prefetch_tune xgene1_prefetch_tune =
693 {
694 8, /* num_slots */
695 32, /* l1_cache_size */
696 64, /* l1_cache_line_size */
697 256, /* l2_cache_size */
698 true, /* prefetch_dynamic_strides */
699 -1, /* minimum_stride */
700 -1 /* default_opt_level */
701 };
702
703 static const struct tune_params generic_tunings =
704 {
705 &cortexa57_extra_costs,
706 &generic_addrcost_table,
707 &generic_regmove_cost,
708 &generic_vector_cost,
709 &generic_branch_cost,
710 &generic_approx_modes,
711 SVE_NOT_IMPLEMENTED, /* sve_width */
712 4, /* memmov_cost */
713 2, /* issue_rate */
714 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
715 "8", /* function_align. */
716 "4", /* jump_align. */
717 "8", /* loop_align. */
718 2, /* int_reassoc_width. */
719 4, /* fp_reassoc_width. */
720 1, /* vec_reassoc_width. */
721 2, /* min_div_recip_mul_sf. */
722 2, /* min_div_recip_mul_df. */
723 0, /* max_case_values. */
724 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
725 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
726 &generic_prefetch_tune
727 };
728
729 static const struct tune_params cortexa35_tunings =
730 {
731 &cortexa53_extra_costs,
732 &generic_addrcost_table,
733 &cortexa53_regmove_cost,
734 &generic_vector_cost,
735 &generic_branch_cost,
736 &generic_approx_modes,
737 SVE_NOT_IMPLEMENTED, /* sve_width */
738 4, /* memmov_cost */
739 1, /* issue_rate */
740 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
741 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
742 "16", /* function_align. */
743 "4", /* jump_align. */
744 "8", /* loop_align. */
745 2, /* int_reassoc_width. */
746 4, /* fp_reassoc_width. */
747 1, /* vec_reassoc_width. */
748 2, /* min_div_recip_mul_sf. */
749 2, /* min_div_recip_mul_df. */
750 0, /* max_case_values. */
751 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
752 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
753 &generic_prefetch_tune
754 };
755
756 static const struct tune_params cortexa53_tunings =
757 {
758 &cortexa53_extra_costs,
759 &generic_addrcost_table,
760 &cortexa53_regmove_cost,
761 &generic_vector_cost,
762 &generic_branch_cost,
763 &generic_approx_modes,
764 SVE_NOT_IMPLEMENTED, /* sve_width */
765 4, /* memmov_cost */
766 2, /* issue_rate */
767 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
768 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
769 "16", /* function_align. */
770 "4", /* jump_align. */
771 "8", /* loop_align. */
772 2, /* int_reassoc_width. */
773 4, /* fp_reassoc_width. */
774 1, /* vec_reassoc_width. */
775 2, /* min_div_recip_mul_sf. */
776 2, /* min_div_recip_mul_df. */
777 0, /* max_case_values. */
778 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
779 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
780 &generic_prefetch_tune
781 };
782
783 static const struct tune_params cortexa57_tunings =
784 {
785 &cortexa57_extra_costs,
786 &generic_addrcost_table,
787 &cortexa57_regmove_cost,
788 &cortexa57_vector_cost,
789 &generic_branch_cost,
790 &generic_approx_modes,
791 SVE_NOT_IMPLEMENTED, /* sve_width */
792 4, /* memmov_cost */
793 3, /* issue_rate */
794 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
795 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
796 "16", /* function_align. */
797 "4", /* jump_align. */
798 "8", /* loop_align. */
799 2, /* int_reassoc_width. */
800 4, /* fp_reassoc_width. */
801 1, /* vec_reassoc_width. */
802 2, /* min_div_recip_mul_sf. */
803 2, /* min_div_recip_mul_df. */
804 0, /* max_case_values. */
805 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
806 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
807 &generic_prefetch_tune
808 };
809
810 static const struct tune_params cortexa72_tunings =
811 {
812 &cortexa57_extra_costs,
813 &generic_addrcost_table,
814 &cortexa57_regmove_cost,
815 &cortexa57_vector_cost,
816 &generic_branch_cost,
817 &generic_approx_modes,
818 SVE_NOT_IMPLEMENTED, /* sve_width */
819 4, /* memmov_cost */
820 3, /* issue_rate */
821 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
822 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
823 "16", /* function_align. */
824 "4", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
834 &generic_prefetch_tune
835 };
836
837 static const struct tune_params cortexa73_tunings =
838 {
839 &cortexa57_extra_costs,
840 &generic_addrcost_table,
841 &cortexa57_regmove_cost,
842 &cortexa57_vector_cost,
843 &generic_branch_cost,
844 &generic_approx_modes,
845 SVE_NOT_IMPLEMENTED, /* sve_width */
846 4, /* memmov_cost. */
847 2, /* issue_rate. */
848 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
849 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
850 "16", /* function_align. */
851 "4", /* jump_align. */
852 "8", /* loop_align. */
853 2, /* int_reassoc_width. */
854 4, /* fp_reassoc_width. */
855 1, /* vec_reassoc_width. */
856 2, /* min_div_recip_mul_sf. */
857 2, /* min_div_recip_mul_df. */
858 0, /* max_case_values. */
859 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
860 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
861 &generic_prefetch_tune
862 };
863
864
865
866 static const struct tune_params exynosm1_tunings =
867 {
868 &exynosm1_extra_costs,
869 &exynosm1_addrcost_table,
870 &exynosm1_regmove_cost,
871 &exynosm1_vector_cost,
872 &generic_branch_cost,
873 &exynosm1_approx_modes,
874 SVE_NOT_IMPLEMENTED, /* sve_width */
875 4, /* memmov_cost */
876 3, /* issue_rate */
877 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
878 "4", /* function_align. */
879 "4", /* jump_align. */
880 "4", /* loop_align. */
881 2, /* int_reassoc_width. */
882 4, /* fp_reassoc_width. */
883 1, /* vec_reassoc_width. */
884 2, /* min_div_recip_mul_sf. */
885 2, /* min_div_recip_mul_df. */
886 48, /* max_case_values. */
887 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
888 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
889 &exynosm1_prefetch_tune
890 };
891
892 static const struct tune_params thunderxt88_tunings =
893 {
894 &thunderx_extra_costs,
895 &generic_addrcost_table,
896 &thunderx_regmove_cost,
897 &thunderx_vector_cost,
898 &generic_branch_cost,
899 &generic_approx_modes,
900 SVE_NOT_IMPLEMENTED, /* sve_width */
901 6, /* memmov_cost */
902 2, /* issue_rate */
903 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
904 "8", /* function_align. */
905 "8", /* jump_align. */
906 "8", /* loop_align. */
907 2, /* int_reassoc_width. */
908 4, /* fp_reassoc_width. */
909 1, /* vec_reassoc_width. */
910 2, /* min_div_recip_mul_sf. */
911 2, /* min_div_recip_mul_df. */
912 0, /* max_case_values. */
913 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
914 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
915 &thunderxt88_prefetch_tune
916 };
917
918 static const struct tune_params thunderx_tunings =
919 {
920 &thunderx_extra_costs,
921 &generic_addrcost_table,
922 &thunderx_regmove_cost,
923 &thunderx_vector_cost,
924 &generic_branch_cost,
925 &generic_approx_modes,
926 SVE_NOT_IMPLEMENTED, /* sve_width */
927 6, /* memmov_cost */
928 2, /* issue_rate */
929 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
930 "8", /* function_align. */
931 "8", /* jump_align. */
932 "8", /* loop_align. */
933 2, /* int_reassoc_width. */
934 4, /* fp_reassoc_width. */
935 1, /* vec_reassoc_width. */
936 2, /* min_div_recip_mul_sf. */
937 2, /* min_div_recip_mul_df. */
938 0, /* max_case_values. */
939 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
940 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
941 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
942 &thunderx_prefetch_tune
943 };
944
945 static const struct tune_params tsv110_tunings =
946 {
947 &tsv110_extra_costs,
948 &tsv110_addrcost_table,
949 &tsv110_regmove_cost,
950 &tsv110_vector_cost,
951 &generic_branch_cost,
952 &generic_approx_modes,
953 SVE_NOT_IMPLEMENTED, /* sve_width */
954 4, /* memmov_cost */
955 4, /* issue_rate */
956 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
957 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
958 "16", /* function_align. */
959 "4", /* jump_align. */
960 "8", /* loop_align. */
961 2, /* int_reassoc_width. */
962 4, /* fp_reassoc_width. */
963 1, /* vec_reassoc_width. */
964 2, /* min_div_recip_mul_sf. */
965 2, /* min_div_recip_mul_df. */
966 0, /* max_case_values. */
967 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
968 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
969 &tsv110_prefetch_tune
970 };
971
972 static const struct tune_params xgene1_tunings =
973 {
974 &xgene1_extra_costs,
975 &xgene1_addrcost_table,
976 &xgene1_regmove_cost,
977 &xgene1_vector_cost,
978 &generic_branch_cost,
979 &xgene1_approx_modes,
980 SVE_NOT_IMPLEMENTED, /* sve_width */
981 6, /* memmov_cost */
982 4, /* issue_rate */
983 AARCH64_FUSE_NOTHING, /* fusible_ops */
984 "16", /* function_align. */
985 "16", /* jump_align. */
986 "16", /* loop_align. */
987 2, /* int_reassoc_width. */
988 4, /* fp_reassoc_width. */
989 1, /* vec_reassoc_width. */
990 2, /* min_div_recip_mul_sf. */
991 2, /* min_div_recip_mul_df. */
992 17, /* max_case_values. */
993 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
994 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
995 &xgene1_prefetch_tune
996 };
997
998 static const struct tune_params emag_tunings =
999 {
1000 &xgene1_extra_costs,
1001 &xgene1_addrcost_table,
1002 &xgene1_regmove_cost,
1003 &xgene1_vector_cost,
1004 &generic_branch_cost,
1005 &xgene1_approx_modes,
1006 SVE_NOT_IMPLEMENTED,
1007 6, /* memmov_cost */
1008 4, /* issue_rate */
1009 AARCH64_FUSE_NOTHING, /* fusible_ops */
1010 "16", /* function_align. */
1011 "16", /* jump_align. */
1012 "16", /* loop_align. */
1013 2, /* int_reassoc_width. */
1014 4, /* fp_reassoc_width. */
1015 1, /* vec_reassoc_width. */
1016 2, /* min_div_recip_mul_sf. */
1017 2, /* min_div_recip_mul_df. */
1018 17, /* max_case_values. */
1019 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1020 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1021 &xgene1_prefetch_tune
1022 };
1023
1024 static const struct tune_params qdf24xx_tunings =
1025 {
1026 &qdf24xx_extra_costs,
1027 &qdf24xx_addrcost_table,
1028 &qdf24xx_regmove_cost,
1029 &qdf24xx_vector_cost,
1030 &generic_branch_cost,
1031 &generic_approx_modes,
1032 SVE_NOT_IMPLEMENTED, /* sve_width */
1033 4, /* memmov_cost */
1034 4, /* issue_rate */
1035 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1036 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1037 "16", /* function_align. */
1038 "8", /* jump_align. */
1039 "16", /* loop_align. */
1040 2, /* int_reassoc_width. */
1041 4, /* fp_reassoc_width. */
1042 1, /* vec_reassoc_width. */
1043 2, /* min_div_recip_mul_sf. */
1044 2, /* min_div_recip_mul_df. */
1045 0, /* max_case_values. */
1046 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1047 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1048 &qdf24xx_prefetch_tune
1049 };
1050
1051 /* Tuning structure for the Qualcomm Saphira core.  Currently uses the
1052 generic cost tables and tuning parameters. */
1053 static const struct tune_params saphira_tunings =
1054 {
1055 &generic_extra_costs,
1056 &generic_addrcost_table,
1057 &generic_regmove_cost,
1058 &generic_vector_cost,
1059 &generic_branch_cost,
1060 &generic_approx_modes,
1061 SVE_NOT_IMPLEMENTED, /* sve_width */
1062 4, /* memmov_cost */
1063 4, /* issue_rate */
1064 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1065 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1066 "16", /* function_align. */
1067 "8", /* jump_align. */
1068 "16", /* loop_align. */
1069 2, /* int_reassoc_width. */
1070 4, /* fp_reassoc_width. */
1071 1, /* vec_reassoc_width. */
1072 2, /* min_div_recip_mul_sf. */
1073 2, /* min_div_recip_mul_df. */
1074 0, /* max_case_values. */
1075 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1076 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1077 &generic_prefetch_tune
1078 };
1079
1080 static const struct tune_params thunderx2t99_tunings =
1081 {
1082 &thunderx2t99_extra_costs,
1083 &thunderx2t99_addrcost_table,
1084 &thunderx2t99_regmove_cost,
1085 &thunderx2t99_vector_cost,
1086 &generic_branch_cost,
1087 &generic_approx_modes,
1088 SVE_NOT_IMPLEMENTED, /* sve_width */
1089 4, /* memmov_cost. */
1090 4, /* issue_rate. */
1091 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1092 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1093 "16", /* function_align. */
1094 "8", /* jump_align. */
1095 "16", /* loop_align. */
1096 3, /* int_reassoc_width. */
1097 2, /* fp_reassoc_width. */
1098 2, /* vec_reassoc_width. */
1099 2, /* min_div_recip_mul_sf. */
1100 2, /* min_div_recip_mul_df. */
1101 0, /* max_case_values. */
1102 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1103 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1104 &thunderx2t99_prefetch_tune
1105 };
1106
1107 static const struct tune_params neoversen1_tunings =
1108 {
1109 &cortexa57_extra_costs,
1110 &generic_addrcost_table,
1111 &generic_regmove_cost,
1112 &cortexa57_vector_cost,
1113 &generic_branch_cost,
1114 &generic_approx_modes,
1115 SVE_NOT_IMPLEMENTED, /* sve_width */
1116 4, /* memmov_cost */
1117 3, /* issue_rate */
1118 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1119 "32:16", /* function_align. */
1120 "32:16", /* jump_align. */
1121 "32:16", /* loop_align. */
1122 2, /* int_reassoc_width. */
1123 4, /* fp_reassoc_width. */
1124 2, /* vec_reassoc_width. */
1125 2, /* min_div_recip_mul_sf. */
1126 2, /* min_div_recip_mul_df. */
1127 0, /* max_case_values. */
1128 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1129 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1130 &generic_prefetch_tune
1131 };
1132
1133 /* Support for fine-grained override of the tuning structures. */
1134 struct aarch64_tuning_override_function
1135 {
1136 const char* name;
1137 void (*parse_override)(const char*, struct tune_params*);
1138 };
1139
1140 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1141 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1142 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1143
1144 static const struct aarch64_tuning_override_function
1145 aarch64_tuning_override_functions[] =
1146 {
1147 { "fuse", aarch64_parse_fuse_string },
1148 { "tune", aarch64_parse_tune_string },
1149 { "sve_width", aarch64_parse_sve_width_string },
1150 { NULL, NULL }
1151 };
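/* These callbacks implement -moverride.  As a rough example (the exact
   flag spellings come from the .def files included earlier), an option
   such as

     -moverride=tune=no_ldp_stp_qregs,sve_width=256

   is split on ',' and each name=value pair is dispatched to the matching
   parse_override hook in the table above.  */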
1152
1153 /* A processor implementing AArch64. */
1154 struct processor
1155 {
1156 const char *const name;
1157 enum aarch64_processor ident;
1158 enum aarch64_processor sched_core;
1159 enum aarch64_arch arch;
1160 unsigned architecture_version;
1161 const uint64_t flags;
1162 const struct tune_params *const tune;
1163 };
1164
1165 /* Architectures implementing AArch64. */
1166 static const struct processor all_architectures[] =
1167 {
1168 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1169 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1170 #include "aarch64-arches.def"
1171 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1172 };
1173
1174 /* Processor cores implementing AArch64. */
1175 static const struct processor all_cores[] =
1176 {
1177 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1178 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1179 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1180 FLAGS, &COSTS##_tunings},
1181 #include "aarch64-cores.def"
1182 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1183 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1184 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1185 };
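/* As an illustration (the real flag sets live in aarch64-cores.def), an
   entry along the lines of

     AARCH64_CORE ("cortex-a53", cortexa53, cortexa53, 8A,
		   AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53,
		   0x41, 0xd03, -1)

   expands under the macro above to

     { "cortex-a53", cortexa53, cortexa53, AARCH64_ARCH_8A,
       all_architectures[AARCH64_ARCH_8A].architecture_version,
       AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, &cortexa53_tunings },

   i.e. the scheduling core, architecture revision and tuning table all
   come from the .def entry.  */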
1186
1187
1188 /* Target specification. These are populated by the -march, -mtune, -mcpu
1189 handling code or by target attributes. */
1190 static const struct processor *selected_arch;
1191 static const struct processor *selected_cpu;
1192 static const struct processor *selected_tune;
1193
1194 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1195
1196 /* The current tuning set. */
1197 struct tune_params aarch64_tune_params = generic_tunings;
1198
1199 /* Table of machine attributes. */
1200 static const struct attribute_spec aarch64_attribute_table[] =
1201 {
1202 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1203 affects_type_identity, handler, exclude } */
1204 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1205 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1206 };
1207
1208 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1209
1210 /* An ISA extension in the co-processor and main instruction set space. */
1211 struct aarch64_option_extension
1212 {
1213 const char *const name;
1214 const unsigned long flags_on;
1215 const unsigned long flags_off;
1216 };
1217
1218 typedef enum aarch64_cond_code
1219 {
1220 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1221 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1222 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1223 }
1224 aarch64_cc;
1225
1226 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
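/* The enum above deliberately pairs each condition with its inverse in
   adjacent values, so inverting a condition is just a matter of flipping
   bit 0: for example AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is
   AARCH64_NE and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is
   AARCH64_LT.  */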
1227
1228 struct aarch64_branch_protect_type
1229 {
1230 /* The type's name that the user passes to the branch-protection option
1231 string. */
1232 const char* name;
1233 /* Function to handle the protection type and set global variables.
1234 The first argument is the string token corresponding to this type and the
1235 second argument is the next token in the option string.
1236 Return values:
1237 * AARCH64_PARSE_OK: Handling was successful.
1238 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1239 caller should print an error.
1240 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1241 prints its own error.  */
1242 enum aarch64_parse_opt_result (*handler)(char*, char*);
1243 /* A list of types that can follow this type in the option string. */
1244 const aarch64_branch_protect_type* subtypes;
1245 unsigned int num_subtypes;
1246 };
1247
1248 static enum aarch64_parse_opt_result
1249 aarch64_handle_no_branch_protection (char* str, char* rest)
1250 {
1251 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1252 aarch64_enable_bti = 0;
1253 if (rest)
1254 {
1255 error ("unexpected %<%s%> after %<%s%>", rest, str);
1256 return AARCH64_PARSE_INVALID_FEATURE;
1257 }
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_standard_branch_protection (char* str, char* rest)
1263 {
1264 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1265 aarch64_ra_sign_key = AARCH64_KEY_A;
1266 aarch64_enable_bti = 1;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1277 char* rest ATTRIBUTE_UNUSED)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 return AARCH64_PARSE_OK;
1282 }
1283
1284 static enum aarch64_parse_opt_result
1285 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1286 char* rest ATTRIBUTE_UNUSED)
1287 {
1288 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1289 return AARCH64_PARSE_OK;
1290 }
1291
1292 static enum aarch64_parse_opt_result
1293 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1294 char* rest ATTRIBUTE_UNUSED)
1295 {
1296 aarch64_ra_sign_key = AARCH64_KEY_B;
1297 return AARCH64_PARSE_OK;
1298 }
1299
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1302 char* rest ATTRIBUTE_UNUSED)
1303 {
1304 aarch64_enable_bti = 1;
1305 return AARCH64_PARSE_OK;
1306 }
1307
1308 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1309 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1310 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1311 { NULL, NULL, NULL, 0 }
1312 };
1313
1314 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1315 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1316 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1317 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1318 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1319 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1320 { NULL, NULL, NULL, 0 }
1321 };
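/* Taken together these tables describe strings such as
   "-mbranch-protection=standard" or
   "-mbranch-protection=pac-ret+leaf+b-key": the leading token selects an
   entry in aarch64_branch_protect_types and any following '+'-separated
   tokens are matched against that entry's subtypes ("leaf" and "b-key"
   in the case of "pac-ret").  */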
1322
1323 /* The condition codes of the processor, and the inverse function. */
1324 static const char * const aarch64_condition_codes[] =
1325 {
1326 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1327 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1328 };
1329
1330 /* The preferred condition codes for SVE conditions. */
1331 static const char *const aarch64_sve_condition_codes[] =
1332 {
1333 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1334 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1335 };
1336
1337 /* Generate code to enable conditional branches in functions over 1 MiB. */
1338 const char *
1339 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1340 const char * branch_format)
1341 {
1342 rtx_code_label * tmp_label = gen_label_rtx ();
1343 char label_buf[256];
1344 char buffer[128];
1345 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1346 CODE_LABEL_NUMBER (tmp_label));
1347 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1348 rtx dest_label = operands[pos_label];
1349 operands[pos_label] = tmp_label;
1350
1351 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1352 output_asm_insn (buffer, operands);
1353
1354 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1355 operands[pos_label] = dest_label;
1356 output_asm_insn (buffer, operands);
1357 return "";
1358 }
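/* As a sketch of the output (the register, label prefix and counter below
   are illustrative only): for an out-of-range conditional branch the
   caller passes the already-inverted branch text as BRANCH_FORMAT, say
   "cbnz\tx0, ", and this function then emits

	cbnz	x0, .Lcb4
	b	.Ltarget
     .Lcb4:

   i.e. a short inverted branch that skips over an unconditional branch
   capable of reaching the real destination.  */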
1359
1360 void
1361 aarch64_err_no_fpadvsimd (machine_mode mode)
1362 {
1363 if (TARGET_GENERAL_REGS_ONLY)
1364 if (FLOAT_MODE_P (mode))
1365 error ("%qs is incompatible with the use of floating-point types",
1366 "-mgeneral-regs-only");
1367 else
1368 error ("%qs is incompatible with the use of vector types",
1369 "-mgeneral-regs-only");
1370 else
1371 if (FLOAT_MODE_P (mode))
1372 error ("%qs feature modifier is incompatible with the use of"
1373 " floating-point types", "+nofp");
1374 else
1375 error ("%qs feature modifier is incompatible with the use of"
1376 " vector types", "+nofp");
1377 }
1378
1379 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1380 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1381 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1382 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1383 and GENERAL_REGS is lower than the memory cost (in this case the best class
1384 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1385 cost results in bad allocations with many redundant int<->FP moves which
1386 are expensive on various cores.
1387 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1388 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1389 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1390 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1391 The result of this is that it is no longer inefficient to have a higher
1392 memory move cost than the register move cost.
1393 */
1394
1395 static reg_class_t
1396 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1397 reg_class_t best_class)
1398 {
1399 machine_mode mode;
1400
1401 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1402 || !reg_class_subset_p (FP_REGS, allocno_class))
1403 return allocno_class;
1404
1405 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1406 || !reg_class_subset_p (FP_REGS, best_class))
1407 return best_class;
1408
1409 mode = PSEUDO_REGNO_MODE (regno);
1410 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1411 }
1412
1413 static unsigned int
1414 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1415 {
1416 if (GET_MODE_UNIT_SIZE (mode) == 4)
1417 return aarch64_tune_params.min_div_recip_mul_sf;
1418 return aarch64_tune_params.min_div_recip_mul_df;
1419 }
1420
1421 /* Return the reassociation width of treeop OPC with mode MODE. */
1422 static int
1423 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1424 {
1425 if (VECTOR_MODE_P (mode))
1426 return aarch64_tune_params.vec_reassoc_width;
1427 if (INTEGRAL_MODE_P (mode))
1428 return aarch64_tune_params.int_reassoc_width;
1429 /* Avoid reassociating floating point addition so we emit more FMAs. */
1430 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1431 return aarch64_tune_params.fp_reassoc_width;
1432 return 1;
1433 }
1434
1435 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1436 unsigned
1437 aarch64_dbx_register_number (unsigned regno)
1438 {
1439 if (GP_REGNUM_P (regno))
1440 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1441 else if (regno == SP_REGNUM)
1442 return AARCH64_DWARF_SP;
1443 else if (FP_REGNUM_P (regno))
1444 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1445 else if (PR_REGNUM_P (regno))
1446 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1447 else if (regno == VG_REGNUM)
1448 return AARCH64_DWARF_VG;
1449
1450 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1451 equivalent DWARF register. */
1452 return DWARF_FRAME_REGISTERS;
1453 }
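/* For example, under this mapping x0 maps to AARCH64_DWARF_R0, sp to
   AARCH64_DWARF_SP and v0 to AARCH64_DWARF_V0 (0, 31 and 64 respectively
   in the AArch64 DWARF register numbering), while registers with no
   DWARF equivalent deliberately map to an out-of-range value.  */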
1454
1455 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1456 static bool
1457 aarch64_advsimd_struct_mode_p (machine_mode mode)
1458 {
1459 return (TARGET_SIMD
1460 && (mode == OImode || mode == CImode || mode == XImode));
1461 }
1462
1463 /* Return true if MODE is an SVE predicate mode. */
1464 static bool
1465 aarch64_sve_pred_mode_p (machine_mode mode)
1466 {
1467 return (TARGET_SVE
1468 && (mode == VNx16BImode
1469 || mode == VNx8BImode
1470 || mode == VNx4BImode
1471 || mode == VNx2BImode));
1472 }
1473
1474 /* Three mutually-exclusive flags describing a vector or predicate type. */
1475 const unsigned int VEC_ADVSIMD = 1;
1476 const unsigned int VEC_SVE_DATA = 2;
1477 const unsigned int VEC_SVE_PRED = 4;
1478 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1479 a structure of 2, 3 or 4 vectors. */
1480 const unsigned int VEC_STRUCT = 8;
1481 /* Useful combinations of the above. */
1482 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1483 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1484
1485 /* Return a set of flags describing the vector properties of mode MODE.
1486 Ignore modes that are not supported by the current target. */
1487 static unsigned int
1488 aarch64_classify_vector_mode (machine_mode mode)
1489 {
1490 if (aarch64_advsimd_struct_mode_p (mode))
1491 return VEC_ADVSIMD | VEC_STRUCT;
1492
1493 if (aarch64_sve_pred_mode_p (mode))
1494 return VEC_SVE_PRED;
1495
1496 /* Make the decision based on the mode's enum value rather than its
1497 properties, so that we keep the correct classification regardless
1498 of -msve-vector-bits. */
1499 switch (mode)
1500 {
1501 /* Single SVE vectors. */
1502 case E_VNx16QImode:
1503 case E_VNx8HImode:
1504 case E_VNx4SImode:
1505 case E_VNx2DImode:
1506 case E_VNx8HFmode:
1507 case E_VNx4SFmode:
1508 case E_VNx2DFmode:
1509 return TARGET_SVE ? VEC_SVE_DATA : 0;
1510
1511 /* x2 SVE vectors. */
1512 case E_VNx32QImode:
1513 case E_VNx16HImode:
1514 case E_VNx8SImode:
1515 case E_VNx4DImode:
1516 case E_VNx16HFmode:
1517 case E_VNx8SFmode:
1518 case E_VNx4DFmode:
1519 /* x3 SVE vectors. */
1520 case E_VNx48QImode:
1521 case E_VNx24HImode:
1522 case E_VNx12SImode:
1523 case E_VNx6DImode:
1524 case E_VNx24HFmode:
1525 case E_VNx12SFmode:
1526 case E_VNx6DFmode:
1527 /* x4 SVE vectors. */
1528 case E_VNx64QImode:
1529 case E_VNx32HImode:
1530 case E_VNx16SImode:
1531 case E_VNx8DImode:
1532 case E_VNx32HFmode:
1533 case E_VNx16SFmode:
1534 case E_VNx8DFmode:
1535 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1536
1537 /* 64-bit Advanced SIMD vectors. */
1538 case E_V8QImode:
1539 case E_V4HImode:
1540 case E_V2SImode:
1541 /* ...E_V1DImode doesn't exist. */
1542 case E_V4HFmode:
1543 case E_V2SFmode:
1544 case E_V1DFmode:
1545 /* 128-bit Advanced SIMD vectors. */
1546 case E_V16QImode:
1547 case E_V8HImode:
1548 case E_V4SImode:
1549 case E_V2DImode:
1550 case E_V8HFmode:
1551 case E_V4SFmode:
1552 case E_V2DFmode:
1553 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1554
1555 default:
1556 return 0;
1557 }
1558 }
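/* For instance, with the classification above V4SImode yields VEC_ADVSIMD,
   VNx4SImode yields VEC_SVE_DATA, VNx8SImode (an x2 SVE tuple) yields
   VEC_SVE_DATA | VEC_STRUCT and VNx4BImode yields VEC_SVE_PRED, in each
   case provided the corresponding target feature is enabled; anything
   else classifies as 0.  */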
1559
1560 /* Return true if MODE is any of the data vector modes, including
1561 structure modes. */
1562 static bool
1563 aarch64_vector_data_mode_p (machine_mode mode)
1564 {
1565 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1566 }
1567
1568 /* Return true if MODE is an SVE data vector mode; either a single vector
1569 or a structure of vectors. */
1570 static bool
1571 aarch64_sve_data_mode_p (machine_mode mode)
1572 {
1573 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1574 }
1575
1576 /* Implement target hook TARGET_ARRAY_MODE. */
1577 static opt_machine_mode
1578 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1579 {
1580 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1581 && IN_RANGE (nelems, 2, 4))
1582 return mode_for_vector (GET_MODE_INNER (mode),
1583 GET_MODE_NUNITS (mode) * nelems);
1584
1585 return opt_machine_mode ();
1586 }
1587
1588 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1589 static bool
1590 aarch64_array_mode_supported_p (machine_mode mode,
1591 unsigned HOST_WIDE_INT nelems)
1592 {
1593 if (TARGET_SIMD
1594 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1595 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1596 && (nelems >= 2 && nelems <= 4))
1597 return true;
1598
1599 return false;
1600 }
1601
1602 /* Return the SVE predicate mode to use for elements that have
1603 ELEM_NBYTES bytes, if such a mode exists. */
1604
1605 opt_machine_mode
1606 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1607 {
1608 if (TARGET_SVE)
1609 {
1610 if (elem_nbytes == 1)
1611 return VNx16BImode;
1612 if (elem_nbytes == 2)
1613 return VNx8BImode;
1614 if (elem_nbytes == 4)
1615 return VNx4BImode;
1616 if (elem_nbytes == 8)
1617 return VNx2BImode;
1618 }
1619 return opt_machine_mode ();
1620 }
1621
1622 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1623
1624 static opt_machine_mode
1625 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1626 {
1627 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1628 {
1629 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1630 machine_mode pred_mode;
1631 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1632 return pred_mode;
1633 }
1634
1635 return default_get_mask_mode (nunits, nbytes);
1636 }
1637
1638 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1639 prefer to use the first arithmetic operand as the else value if
1640 the else value doesn't matter, since that exactly matches the SVE
1641 destructive merging form. For ternary operations we could either
1642 pick the first operand and use FMAD-like instructions or the last
1643 operand and use FMLA-like instructions; the latter seems more
1644 natural. */
1645
1646 static tree
1647 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1648 {
1649 return nops == 3 ? ops[2] : ops[0];
1650 }
1651
1652 /* Implement TARGET_HARD_REGNO_NREGS. */
1653
1654 static unsigned int
1655 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1656 {
1657 /* ??? Logically we should only need to provide a value when
1658 HARD_REGNO_MODE_OK says that the combination is valid,
1659 but at the moment we need to handle all modes. Just ignore
1660 any runtime parts for registers that can't store them. */
1661 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1662 switch (aarch64_regno_regclass (regno))
1663 {
1664 case FP_REGS:
1665 case FP_LO_REGS:
1666 case FP_LO8_REGS:
1667 if (aarch64_sve_data_mode_p (mode))
1668 return exact_div (GET_MODE_SIZE (mode),
1669 BYTES_PER_SVE_VECTOR).to_constant ();
1670 return CEIL (lowest_size, UNITS_PER_VREG);
1671 case PR_REGS:
1672 case PR_LO_REGS:
1673 case PR_HI_REGS:
1674 return 1;
1675 default:
1676 return CEIL (lowest_size, UNITS_PER_WORD);
1677 }
1678 gcc_unreachable ();
1679 }
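/* As a worked example: a 128-bit V4SImode value needs one FP/SIMD
   register (CEIL (16, UNITS_PER_VREG) == 1, UNITS_PER_VREG being 16
   bytes) but two general registers (CEIL (16, UNITS_PER_WORD) == 2),
   while a single SVE data vector such as VNx16QImode always occupies
   exactly one FP register, whatever the runtime vector length.  */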
1680
1681 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1682
1683 static bool
1684 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1685 {
1686 if (GET_MODE_CLASS (mode) == MODE_CC)
1687 return regno == CC_REGNUM;
1688
1689 if (regno == VG_REGNUM)
1690 /* This must have the same size as _Unwind_Word. */
1691 return mode == DImode;
1692
1693 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1694 if (vec_flags & VEC_SVE_PRED)
1695 return PR_REGNUM_P (regno);
1696
1697 if (PR_REGNUM_P (regno))
1698 return 0;
1699
1700 if (regno == SP_REGNUM)
1701 /* The purpose of comparing with ptr_mode is to support the
1702 global register variable associated with the stack pointer
1703 register via the syntax of asm ("wsp") in ILP32. */
1704 return mode == Pmode || mode == ptr_mode;
1705
1706 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1707 return mode == Pmode;
1708
1709 if (GP_REGNUM_P (regno))
1710 {
1711 if (known_le (GET_MODE_SIZE (mode), 8))
1712 return true;
1713 else if (known_le (GET_MODE_SIZE (mode), 16))
1714 return (regno & 1) == 0;
1715 }
1716 else if (FP_REGNUM_P (regno))
1717 {
1718 if (vec_flags & VEC_STRUCT)
1719 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1720 else
1721 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1722 }
1723
1724 return false;
1725 }
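/* For instance, DImode is accepted in any general register, a 16-byte
   value such as TImode only in an even-numbered general register (it
   occupies a pair), the SVE predicate modes only in the predicate
   registers, and CCmode only in CC_REGNUM.  */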
1726
1727 /* Return true if this is a definition of a vectorized simd function. */
1728
1729 static bool
1730 aarch64_simd_decl_p (tree fndecl)
1731 {
1732 tree fntype;
1733
1734 if (fndecl == NULL)
1735 return false;
1736 fntype = TREE_TYPE (fndecl);
1737 if (fntype == NULL)
1738 return false;
1739
1740 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1741 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1742 return true;
1743
1744 return false;
1745 }
1746
1747 /* Return the mode a register save/restore should use. DImode for integer
1748 registers, DFmode for FP registers in non-SIMD functions (they only save
1749 the bottom half of a 128 bit register), or TFmode for FP registers in
1750 SIMD functions. */
1751
1752 static machine_mode
1753 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1754 {
1755 return GP_REGNUM_P (regno)
1756 ? E_DImode
1757 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1758 }
1759
1760 /* Return true if the instruction is a call to a SIMD function, false
1761 if it is not a SIMD function or if we do not know anything about
1762 the function. */
1763
1764 static bool
1765 aarch64_simd_call_p (rtx_insn *insn)
1766 {
1767 rtx symbol;
1768 rtx call;
1769 tree fndecl;
1770
1771 gcc_assert (CALL_P (insn));
1772 call = get_call_rtx_from (insn);
1773 symbol = XEXP (XEXP (call, 0), 0);
1774 if (GET_CODE (symbol) != SYMBOL_REF)
1775 return false;
1776 fndecl = SYMBOL_REF_DECL (symbol);
1777 if (!fndecl)
1778 return false;
1779
1780 return aarch64_simd_decl_p (fndecl);
1781 }
1782
1783 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1784 a function that uses the SIMD ABI, take advantage of the extra
1785 call-preserved registers that the ABI provides. */
1786
1787 void
1788 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1789 HARD_REG_SET *return_set)
1790 {
1791 if (aarch64_simd_call_p (insn))
1792 {
1793 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1794 if (FP_SIMD_SAVED_REGNUM_P (regno))
1795 CLEAR_HARD_REG_BIT (*return_set, regno);
1796 }
1797 }
1798
1799 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1800 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1801 clobbers the top 64 bits when restoring the bottom 64 bits. */
1802
1803 static bool
1804 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1805 machine_mode mode)
1806 {
1807 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1808 return FP_REGNUM_P (regno)
1809 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1810 }
1811
1812 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1813
1814 rtx_insn *
1815 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1816 {
1817 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1818
1819 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1820 return call_1;
1821 else
1822 return call_2;
1823 }
1824
1825 /* Implement REGMODE_NATURAL_SIZE. */
1826 poly_uint64
1827 aarch64_regmode_natural_size (machine_mode mode)
1828 {
1829 /* The natural size for SVE data modes is one SVE data vector,
1830 and similarly for predicates. We can't independently modify
1831 anything smaller than that. */
1832 /* ??? For now, only do this for variable-width SVE registers.
1833 Doing it for constant-sized registers breaks lower-subreg.c. */
1834 /* ??? And once that's fixed, we should probably have similar
1835 code for Advanced SIMD. */
1836 if (!aarch64_sve_vg.is_constant ())
1837 {
1838 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1839 if (vec_flags & VEC_SVE_PRED)
1840 return BYTES_PER_SVE_PRED;
1841 if (vec_flags & VEC_SVE_DATA)
1842 return BYTES_PER_SVE_VECTOR;
1843 }
1844 return UNITS_PER_WORD;
1845 }
1846
1847 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1848 machine_mode
1849 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1850 machine_mode mode)
1851 {
1852 /* The predicate mode determines which bits are significant and
1853 which are "don't care". Decreasing the number of lanes would
1854 lose data while increasing the number of lanes would make bits
1855 unnecessarily significant. */
1856 if (PR_REGNUM_P (regno))
1857 return mode;
1858 if (known_ge (GET_MODE_SIZE (mode), 4))
1859 return mode;
1860 else
1861 return SImode;
1862 }
1863
1864 /* Return true if I's bits are consecutive ones from the MSB. */
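/* Worked example (illustrative values only): 0xffffffffffff0000 has 48
consecutive ones from the MSB; its negation is 0x10000, an exact power of
two, so the test below succeeds.  0xffff00000000ffff negates to
0x0000ffffffff0001, which is not a power of two, so it is rejected.  */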
1865 bool
1866 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1867 {
1868 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1869 }
1870
1871 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1872 that strcpy from constants will be faster. */
1873
1874 static HOST_WIDE_INT
1875 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1876 {
1877 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1878 return MAX (align, BITS_PER_WORD);
1879 return align;
1880 }
1881
1882 /* Return true if calls to DECL should be treated as
1883 long-calls (i.e. called via a register). */
1884 static bool
1885 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1886 {
1887 return false;
1888 }
1889
1890 /* Return true if calls to symbol-ref SYM should be treated as
1891 long-calls (i.e. called via a register). */
1892 bool
1893 aarch64_is_long_call_p (rtx sym)
1894 {
1895 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1896 }
1897
1898 /* Return true if calls to symbol-ref SYM should not go through
1899 plt stubs. */
1900
1901 bool
1902 aarch64_is_noplt_call_p (rtx sym)
1903 {
1904 const_tree decl = SYMBOL_REF_DECL (sym);
1905
1906 if (flag_pic
1907 && decl
1908 && (!flag_plt
1909 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1910 && !targetm.binds_local_p (decl))
1911 return true;
1912
1913 return false;
1914 }
1915
1916 /* Return true if the offsets to a zero/sign-extract operation
1917 represent an expression that matches an extend operation. The
1918 operands represent the parameters from
1919
1920 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
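/* Illustrative example (values chosen for exposition): with MULT_IMM == 4
and EXTRACT_IMM == 34 in DImode, the checks below pass: 34 & ~7 == 32 is a
power of two (the width of the extend), 34 & 7 == 2 is the shift, and
4 == 1 << 2; extracting the low 34 bits of (reg * 4) is therefore a 32-bit
extend followed by a left shift of 2.  */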
1921 bool
1922 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1923 rtx extract_imm)
1924 {
1925 HOST_WIDE_INT mult_val, extract_val;
1926
1927 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1928 return false;
1929
1930 mult_val = INTVAL (mult_imm);
1931 extract_val = INTVAL (extract_imm);
1932
1933 if (extract_val > 8
1934 && extract_val < GET_MODE_BITSIZE (mode)
1935 && exact_log2 (extract_val & ~7) > 0
1936 && (extract_val & 7) <= 4
1937 && mult_val == (1 << (extract_val & 7)))
1938 return true;
1939
1940 return false;
1941 }
1942
1943 /* Emit an insn that's a simple single-set. Both the operands must be
1944 known to be valid. */
1945 inline static rtx_insn *
1946 emit_set_insn (rtx x, rtx y)
1947 {
1948 return emit_insn (gen_rtx_SET (x, y));
1949 }
1950
1951 /* X and Y are two things to compare using CODE. Emit the compare insn and
1952 return the rtx for the CC register in the appropriate mode. */
1953 rtx
1954 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1955 {
1956 machine_mode mode = SELECT_CC_MODE (code, x, y);
1957 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1958
1959 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1960 return cc_reg;
1961 }
1962
1963 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1964
1965 static rtx
1966 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1967 machine_mode y_mode)
1968 {
1969 if (y_mode == E_QImode || y_mode == E_HImode)
1970 {
1971 if (CONST_INT_P (y))
1972 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1973 else
1974 {
1975 rtx t, cc_reg;
1976 machine_mode cc_mode;
1977
1978 t = gen_rtx_ZERO_EXTEND (SImode, y);
1979 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1980 cc_mode = CC_SWPmode;
1981 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1982 emit_set_insn (cc_reg, t);
1983 return cc_reg;
1984 }
1985 }
1986
1987 return aarch64_gen_compare_reg (code, x, y);
1988 }
1989
1990 /* Build the SYMBOL_REF for __tls_get_addr. */
1991
1992 static GTY(()) rtx tls_get_addr_libfunc;
1993
1994 rtx
1995 aarch64_tls_get_addr (void)
1996 {
1997 if (!tls_get_addr_libfunc)
1998 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1999 return tls_get_addr_libfunc;
2000 }
2001
2002 /* Return the TLS model to use for ADDR. */
2003
2004 static enum tls_model
2005 tls_symbolic_operand_type (rtx addr)
2006 {
2007 enum tls_model tls_kind = TLS_MODEL_NONE;
2008 if (GET_CODE (addr) == CONST)
2009 {
2010 poly_int64 addend;
2011 rtx sym = strip_offset (addr, &addend);
2012 if (GET_CODE (sym) == SYMBOL_REF)
2013 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2014 }
2015 else if (GET_CODE (addr) == SYMBOL_REF)
2016 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2017
2018 return tls_kind;
2019 }
2020
2021 /* We'll allow lo_sums in addresses in our legitimate addresses,
2022 so that combine can take care of combining addresses where
2023 necessary, but for generation purposes, we'll generate the address
2024 as:
2025 RTL Absolute
2026 tmp = hi (symbol_ref); adrp x1, foo
2027 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2028 nop
2029
2030 PIC TLS
2031 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2032 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2033 bl __tls_get_addr
2034 nop
2035
2036 Load TLS symbol, depending on TLS mechanism and TLS access model.
2037
2038 Global Dynamic - Traditional TLS:
2039 adrp tmp, :tlsgd:imm
2040 add dest, tmp, #:tlsgd_lo12:imm
2041 bl __tls_get_addr
2042
2043 Global Dynamic - TLS Descriptors:
2044 adrp dest, :tlsdesc:imm
2045 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2046 add dest, dest, #:tlsdesc_lo12:imm
2047 blr tmp
2048 mrs tp, tpidr_el0
2049 add dest, dest, tp
2050
2051 Initial Exec:
2052 mrs tp, tpidr_el0
2053 adrp tmp, :gottprel:imm
2054 ldr dest, [tmp, #:gottprel_lo12:imm]
2055 add dest, dest, tp
2056
2057 Local Exec:
2058 mrs tp, tpidr_el0
2059 add t0, tp, #:tprel_hi12:imm, lsl #12
2060 add t0, t0, #:tprel_lo12_nc:imm
2061 */
2062
2063 static void
2064 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2065 enum aarch64_symbol_type type)
2066 {
2067 switch (type)
2068 {
2069 case SYMBOL_SMALL_ABSOLUTE:
2070 {
2071 /* In ILP32, the mode of dest can be either SImode or DImode. */
2072 rtx tmp_reg = dest;
2073 machine_mode mode = GET_MODE (dest);
2074
2075 gcc_assert (mode == Pmode || mode == ptr_mode);
2076
2077 if (can_create_pseudo_p ())
2078 tmp_reg = gen_reg_rtx (mode);
2079
2080 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2081 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2082 return;
2083 }
2084
2085 case SYMBOL_TINY_ABSOLUTE:
2086 emit_insn (gen_rtx_SET (dest, imm));
2087 return;
2088
2089 case SYMBOL_SMALL_GOT_28K:
2090 {
2091 machine_mode mode = GET_MODE (dest);
2092 rtx gp_rtx = pic_offset_table_rtx;
2093 rtx insn;
2094 rtx mem;
2095
2096 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2097 here before rtl expand. Tree IVOPTS will generate rtl patterns to
2098 decide rtx costs, in which case pic_offset_table_rtx is not
2099 initialized. In that case there is no need to generate the first
2100 adrp instruction, as the final cost for global variable access is
2101 one instruction. */
2102 if (gp_rtx != NULL)
2103 {
2104 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2105 use the page base as the GOT base, the first page may be wasted;
2106 in the worst case only 28K of space is left for the GOT).
2107
2108 The generated instruction sequence for accessing a global variable
2109 is:
2110
2111 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2112
2113 Only one instruction is needed. But we must initialize
2114 pic_offset_table_rtx properly. We generate an initialization insn
2115 for every global access, and rely on CSE to remove all redundant ones.
2116
2117 The final instruction sequence will look like the following
2118 for multiple global variable accesses.
2119
2120 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2121
2122 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2123 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2124 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2125 ... */
2126
2127 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2128 crtl->uses_pic_offset_table = 1;
2129 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2130
2131 if (mode != GET_MODE (gp_rtx))
2132 gp_rtx = gen_lowpart (mode, gp_rtx);
2133
2134 }
2135
2136 if (mode == ptr_mode)
2137 {
2138 if (mode == DImode)
2139 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2140 else
2141 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2142
2143 mem = XVECEXP (SET_SRC (insn), 0, 0);
2144 }
2145 else
2146 {
2147 gcc_assert (mode == Pmode);
2148
2149 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2150 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2151 }
2152
2153 /* The operand is expected to be a MEM. Whenever the related insn
2154 pattern changes, the code above which calculates MEM should be
2155 updated. */
2156 gcc_assert (GET_CODE (mem) == MEM);
2157 MEM_READONLY_P (mem) = 1;
2158 MEM_NOTRAP_P (mem) = 1;
2159 emit_insn (insn);
2160 return;
2161 }
2162
2163 case SYMBOL_SMALL_GOT_4G:
2164 {
2165 /* In ILP32, the mode of dest can be either SImode or DImode,
2166 while the got entry is always of SImode size. The mode of
2167 dest depends on how dest is used: if dest is assigned to a
2168 pointer (e.g. stored in memory), it has SImode; it may have
2169 DImode if dest is dereferenced to access the memory.
2170 This is why we have to handle three different ldr_got_small
2171 patterns here (two patterns for ILP32). */
2172
2173 rtx insn;
2174 rtx mem;
2175 rtx tmp_reg = dest;
2176 machine_mode mode = GET_MODE (dest);
2177
2178 if (can_create_pseudo_p ())
2179 tmp_reg = gen_reg_rtx (mode);
2180
2181 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2182 if (mode == ptr_mode)
2183 {
2184 if (mode == DImode)
2185 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2186 else
2187 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2188
2189 mem = XVECEXP (SET_SRC (insn), 0, 0);
2190 }
2191 else
2192 {
2193 gcc_assert (mode == Pmode);
2194
2195 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2196 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2197 }
2198
2199 gcc_assert (GET_CODE (mem) == MEM);
2200 MEM_READONLY_P (mem) = 1;
2201 MEM_NOTRAP_P (mem) = 1;
2202 emit_insn (insn);
2203 return;
2204 }
2205
2206 case SYMBOL_SMALL_TLSGD:
2207 {
2208 rtx_insn *insns;
2209 machine_mode mode = GET_MODE (dest);
2210 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2211
2212 start_sequence ();
2213 if (TARGET_ILP32)
2214 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2215 else
2216 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2217 insns = get_insns ();
2218 end_sequence ();
2219
2220 RTL_CONST_CALL_P (insns) = 1;
2221 emit_libcall_block (insns, dest, result, imm);
2222 return;
2223 }
2224
2225 case SYMBOL_SMALL_TLSDESC:
2226 {
2227 machine_mode mode = GET_MODE (dest);
2228 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2229 rtx tp;
2230
2231 gcc_assert (mode == Pmode || mode == ptr_mode);
2232
2233 /* In ILP32, the got entry is always of SImode size. Unlike
2234 small GOT, the dest is fixed at reg 0. */
2235 if (TARGET_ILP32)
2236 emit_insn (gen_tlsdesc_small_si (imm));
2237 else
2238 emit_insn (gen_tlsdesc_small_di (imm));
2239 tp = aarch64_load_tp (NULL);
2240
2241 if (mode != Pmode)
2242 tp = gen_lowpart (mode, tp);
2243
2244 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2245 if (REG_P (dest))
2246 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2247 return;
2248 }
2249
2250 case SYMBOL_SMALL_TLSIE:
2251 {
2252 /* In ILP32, the mode of dest can be either SImode or DImode,
2253 while the got entry is always of SImode size. The mode of
2254 dest depends on how dest is used: if dest is assigned to a
2255 pointer (e.g. stored in memory), it has SImode; it may have
2256 DImode if dest is dereferenced to access the memory.
2257 This is why we have to handle three different tlsie_small
2258 patterns here (two patterns for ILP32). */
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tmp_reg = gen_reg_rtx (mode);
2261 rtx tp = aarch64_load_tp (NULL);
2262
2263 if (mode == ptr_mode)
2264 {
2265 if (mode == DImode)
2266 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2267 else
2268 {
2269 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2270 tp = gen_lowpart (mode, tp);
2271 }
2272 }
2273 else
2274 {
2275 gcc_assert (mode == Pmode);
2276 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2277 }
2278
2279 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2280 if (REG_P (dest))
2281 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2282 return;
2283 }
2284
2285 case SYMBOL_TLSLE12:
2286 case SYMBOL_TLSLE24:
2287 case SYMBOL_TLSLE32:
2288 case SYMBOL_TLSLE48:
2289 {
2290 machine_mode mode = GET_MODE (dest);
2291 rtx tp = aarch64_load_tp (NULL);
2292
2293 if (mode != Pmode)
2294 tp = gen_lowpart (mode, tp);
2295
2296 switch (type)
2297 {
2298 case SYMBOL_TLSLE12:
2299 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2300 (dest, tp, imm));
2301 break;
2302 case SYMBOL_TLSLE24:
2303 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2304 (dest, tp, imm));
2305 break;
2306 case SYMBOL_TLSLE32:
2307 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2308 (dest, imm));
2309 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2310 (dest, dest, tp));
2311 break;
2312 case SYMBOL_TLSLE48:
2313 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2314 (dest, imm));
2315 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2316 (dest, dest, tp));
2317 break;
2318 default:
2319 gcc_unreachable ();
2320 }
2321
2322 if (REG_P (dest))
2323 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2324 return;
2325 }
2326
2327 case SYMBOL_TINY_GOT:
2328 emit_insn (gen_ldr_got_tiny (dest, imm));
2329 return;
2330
2331 case SYMBOL_TINY_TLSIE:
2332 {
2333 machine_mode mode = GET_MODE (dest);
2334 rtx tp = aarch64_load_tp (NULL);
2335
2336 if (mode == ptr_mode)
2337 {
2338 if (mode == DImode)
2339 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2340 else
2341 {
2342 tp = gen_lowpart (mode, tp);
2343 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2344 }
2345 }
2346 else
2347 {
2348 gcc_assert (mode == Pmode);
2349 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2350 }
2351
2352 if (REG_P (dest))
2353 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2354 return;
2355 }
2356
2357 default:
2358 gcc_unreachable ();
2359 }
2360 }
2361
2362 /* Emit a move from SRC to DEST. Assume that the move expanders can
2363 handle all moves if !can_create_pseudo_p (). The distinction is
2364 important because, unlike emit_move_insn, the move expanders know
2365 how to force Pmode objects into the constant pool even when the
2366 constant pool address is not itself legitimate. */
2367 static rtx
2368 aarch64_emit_move (rtx dest, rtx src)
2369 {
2370 return (can_create_pseudo_p ()
2371 ? emit_move_insn (dest, src)
2372 : emit_move_insn_1 (dest, src));
2373 }
2374
2375 /* Apply UNOPTAB to OP and store the result in DEST. */
2376
2377 static void
2378 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2379 {
2380 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2381 if (dest != tmp)
2382 emit_move_insn (dest, tmp);
2383 }
2384
2385 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2386
2387 static void
2388 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2389 {
2390 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2391 OPTAB_DIRECT);
2392 if (dest != tmp)
2393 emit_move_insn (dest, tmp);
2394 }
2395
2396 /* Split a 128-bit move operation into two 64-bit move operations,
2397 taking care to handle partial overlap of register to register
2398 copies. Special cases are needed when moving between GP regs and
2399 FP regs. SRC can be a register, constant or memory; DST a register
2400 or memory. If either operand is memory it must not have any side
2401 effects. */
2402 void
2403 aarch64_split_128bit_move (rtx dst, rtx src)
2404 {
2405 rtx dst_lo, dst_hi;
2406 rtx src_lo, src_hi;
2407
2408 machine_mode mode = GET_MODE (dst);
2409
2410 gcc_assert (mode == TImode || mode == TFmode);
2411 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2412 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2413
2414 if (REG_P (dst) && REG_P (src))
2415 {
2416 int src_regno = REGNO (src);
2417 int dst_regno = REGNO (dst);
2418
2419 /* Handle FP <-> GP regs. */
2420 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2421 {
2422 src_lo = gen_lowpart (word_mode, src);
2423 src_hi = gen_highpart (word_mode, src);
2424
2425 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2426 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2427 return;
2428 }
2429 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2430 {
2431 dst_lo = gen_lowpart (word_mode, dst);
2432 dst_hi = gen_highpart (word_mode, dst);
2433
2434 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2435 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2436 return;
2437 }
2438 }
2439
2440 dst_lo = gen_lowpart (word_mode, dst);
2441 dst_hi = gen_highpart (word_mode, dst);
2442 src_lo = gen_lowpart (word_mode, src);
2443 src_hi = gen_highpart_mode (word_mode, mode, src);
2444
2445 /* At most one pairing may overlap. */
2446 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2447 {
2448 aarch64_emit_move (dst_hi, src_hi);
2449 aarch64_emit_move (dst_lo, src_lo);
2450 }
2451 else
2452 {
2453 aarch64_emit_move (dst_lo, src_lo);
2454 aarch64_emit_move (dst_hi, src_hi);
2455 }
2456 }
2457
2458 bool
2459 aarch64_split_128bit_move_p (rtx dst, rtx src)
2460 {
2461 return (! REG_P (src)
2462 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2463 }
2464
2465 /* Split a complex SIMD combine. */
2466
2467 void
2468 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2469 {
2470 machine_mode src_mode = GET_MODE (src1);
2471 machine_mode dst_mode = GET_MODE (dst);
2472
2473 gcc_assert (VECTOR_MODE_P (dst_mode));
2474 gcc_assert (register_operand (dst, dst_mode)
2475 && register_operand (src1, src_mode)
2476 && register_operand (src2, src_mode));
2477
2478 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2479 return;
2480 }
2481
2482 /* Split a complex SIMD move. */
2483
2484 void
2485 aarch64_split_simd_move (rtx dst, rtx src)
2486 {
2487 machine_mode src_mode = GET_MODE (src);
2488 machine_mode dst_mode = GET_MODE (dst);
2489
2490 gcc_assert (VECTOR_MODE_P (dst_mode));
2491
2492 if (REG_P (dst) && REG_P (src))
2493 {
2494 gcc_assert (VECTOR_MODE_P (src_mode));
2495 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2496 }
2497 }
2498
2499 bool
2500 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2501 machine_mode ymode, rtx y)
2502 {
2503 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2504 gcc_assert (r != NULL);
2505 return rtx_equal_p (x, r);
2506 }
2507
2508
2509 static rtx
2510 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2511 {
2512 if (can_create_pseudo_p ())
2513 return force_reg (mode, value);
2514 else
2515 {
2516 gcc_assert (x);
2517 aarch64_emit_move (x, value);
2518 return x;
2519 }
2520 }
2521
2522 /* Return an all-true predicate register of mode MODE. */
2523
2524 rtx
2525 aarch64_ptrue_reg (machine_mode mode)
2526 {
2527 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2528 return force_reg (mode, CONSTM1_RTX (mode));
2529 }
2530
2531 /* Return an all-false predicate register of mode MODE. */
2532
2533 rtx
2534 aarch64_pfalse_reg (machine_mode mode)
2535 {
2536 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2537 return force_reg (mode, CONST0_RTX (mode));
2538 }
2539
2540 /* Return true if we can move VALUE into a register using a single
2541 CNT[BHWD] instruction. */
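/* Illustrative examples (not from the original sources): the poly_int64
(16, 16) -- the number of bytes in one full vector -- satisfies the check
below and corresponds to a plain CNTB; (32, 32) is also accepted (CNTB
with MUL #2); (3, 3) is rejected because the factor is odd.  */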
2542
2543 static bool
2544 aarch64_sve_cnt_immediate_p (poly_int64 value)
2545 {
2546 HOST_WIDE_INT factor = value.coeffs[0];
2547 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2548 return (value.coeffs[1] == factor
2549 && IN_RANGE (factor, 2, 16 * 16)
2550 && (factor & 1) == 0
2551 && factor <= 16 * (factor & -factor));
2552 }
2553
2554 /* Likewise for rtx X. */
2555
2556 bool
2557 aarch64_sve_cnt_immediate_p (rtx x)
2558 {
2559 poly_int64 value;
2560 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2561 }
2562
2563 /* Return the asm string for an instruction with a CNT-like vector size
2564 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2565 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2566 first part of the operands template (the part that comes before the
2567 vector size itself). FACTOR is the number of quadwords.
2568 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2569 If it is zero, we can use any element size. */
2570
2571 static char *
2572 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2573 unsigned int factor,
2574 unsigned int nelts_per_vq)
2575 {
2576 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2577
2578 if (nelts_per_vq == 0)
2579 /* There is some overlap in the ranges of the four CNT instructions.
2580 Here we always use the smallest possible element size, so that the
2581 multiplier is 1 wherever possible. */
2582 nelts_per_vq = factor & -factor;
2583 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2584 gcc_assert (IN_RANGE (shift, 1, 4));
2585 char suffix = "dwhb"[shift - 1];
2586
2587 factor >>= shift;
2588 unsigned int written;
2589 if (factor == 1)
2590 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2591 prefix, suffix, operands);
2592 else
2593 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2594 prefix, suffix, operands, factor);
2595 gcc_assert (written < sizeof (buffer));
2596 return buffer;
2597 }
2598
2599 /* Return the asm string for an instruction with a CNT-like vector size
2600 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2601 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2602 first part of the operands template (the part that comes before the
2603 vector size itself). X is the value of the vector size operand,
2604 as a polynomial integer rtx. */
2605
2606 char *
2607 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2608 rtx x)
2609 {
2610 poly_int64 value = rtx_to_poly_int64 (x);
2611 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2612 return aarch64_output_sve_cnt_immediate (prefix, operands,
2613 value.coeffs[1], 0);
2614 }
2615
2616 /* Return true if we can add VALUE to a register using a single ADDVL
2617 or ADDPL instruction. */
2618
2619 static bool
2620 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2621 {
2622 HOST_WIDE_INT factor = value.coeffs[0];
2623 if (factor == 0 || value.coeffs[1] != factor)
2624 return false;
2625 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2626 and a value of 16 is one vector width. */
2627 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2628 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2629 }
2630
2631 /* Likewise for rtx X. */
2632
2633 bool
2634 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2635 {
2636 poly_int64 value;
2637 return (poly_int_rtx_p (x, &value)
2638 && aarch64_sve_addvl_addpl_immediate_p (value));
2639 }
2640
2641 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2642 and storing the result in operand 0. */
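/* Illustrative examples (not from the original sources): an offset of
(16, 16) -- one full vector -- is printed as "addvl %x0, %x1, #1", or as
a plain INCB when the destination and base are the same GP register;
when they differ, an offset of (6, 6) -- three predicate widths -- is
printed as "addpl %x0, %x1, #3".  */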
2643
2644 char *
2645 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2646 {
2647 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2648 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2649 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2650
2651 /* Use INC or DEC if possible. */
2652 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2653 {
2654 if (aarch64_sve_cnt_immediate_p (offset_value))
2655 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2656 offset_value.coeffs[1], 0);
2657 if (aarch64_sve_cnt_immediate_p (-offset_value))
2658 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2659 -offset_value.coeffs[1], 0);
2660 }
2661
2662 int factor = offset_value.coeffs[1];
2663 if ((factor & 15) == 0)
2664 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2665 else
2666 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2667 return buffer;
2668 }
2669
2670 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2671 instruction. If it is, store the number of elements in each vector
2672 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2673 factor in *FACTOR_OUT (if nonnull). */
2674
2675 bool
2676 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2677 unsigned int *nelts_per_vq_out)
2678 {
2679 rtx elt;
2680 poly_int64 value;
2681
2682 if (!const_vec_duplicate_p (x, &elt)
2683 || !poly_int_rtx_p (elt, &value))
2684 return false;
2685
2686 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2687 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2688 /* There's no vector INCB. */
2689 return false;
2690
2691 HOST_WIDE_INT factor = value.coeffs[0];
2692 if (value.coeffs[1] != factor)
2693 return false;
2694
2695 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2696 if ((factor % nelts_per_vq) != 0
2697 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2698 return false;
2699
2700 if (factor_out)
2701 *factor_out = factor;
2702 if (nelts_per_vq_out)
2703 *nelts_per_vq_out = nelts_per_vq;
2704 return true;
2705 }
2706
2707 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2708 instruction. */
2709
2710 bool
2711 aarch64_sve_inc_dec_immediate_p (rtx x)
2712 {
2713 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2714 }
2715
2716 /* Return the asm template for an SVE vector INC or DEC instruction.
2717 OPERANDS gives the operands before the vector count and X is the
2718 value of the vector count operand itself. */
2719
2720 char *
2721 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2722 {
2723 int factor;
2724 unsigned int nelts_per_vq;
2725 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2726 gcc_unreachable ();
2727 if (factor < 0)
2728 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2729 nelts_per_vq);
2730 else
2731 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2732 nelts_per_vq);
2733 }
2734
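/* Set DEST to the integer constant IMM of mode MODE and return the number
of instructions in the resulting sequence.  If GENERATE is false, do not
emit anything and only return the instruction count (used for costing,
e.g. by aarch64_mov128_immediate below).

Illustrative examples (values chosen for exposition): 0x12345678 needs two
instructions (a MOV of the low 16 bits plus one MOVK), while the 64-bit
repeating constant 0x1234567812345678 is neither a MOV/MOVN nor a bitmask
immediate, and no single 16-bit patch of it is, so it falls through to the
final loop and needs four instructions (a MOV plus three MOVKs).  */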
2735 static int
2736 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2737 scalar_int_mode mode)
2738 {
2739 int i;
2740 unsigned HOST_WIDE_INT val, val2, mask;
2741 int one_match, zero_match;
2742 int num_insns;
2743
2744 val = INTVAL (imm);
2745
2746 if (aarch64_move_imm (val, mode))
2747 {
2748 if (generate)
2749 emit_insn (gen_rtx_SET (dest, imm));
2750 return 1;
2751 }
2752
2753 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2754 (with XXXX non-zero). In that case check to see if the move can be done in
2755 a smaller mode. */
2756 val2 = val & 0xffffffff;
2757 if (mode == DImode
2758 && aarch64_move_imm (val2, SImode)
2759 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2760 {
2761 if (generate)
2762 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2763
2764 /* Check if we have to emit a second instruction by checking to see
2765 if any of the upper 32 bits of the original DI mode value is set. */
2766 if (val == val2)
2767 return 1;
2768
2769 i = (val >> 48) ? 48 : 32;
2770
2771 if (generate)
2772 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2773 GEN_INT ((val >> i) & 0xffff)));
2774
2775 return 2;
2776 }
2777
2778 if ((val >> 32) == 0 || mode == SImode)
2779 {
2780 if (generate)
2781 {
2782 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2783 if (mode == SImode)
2784 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2785 GEN_INT ((val >> 16) & 0xffff)));
2786 else
2787 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2788 GEN_INT ((val >> 16) & 0xffff)));
2789 }
2790 return 2;
2791 }
2792
2793 /* Remaining cases are all for DImode. */
2794
2795 mask = 0xffff;
2796 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2797 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2798 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2799 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2800
2801 if (zero_match != 2 && one_match != 2)
2802 {
2803 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2804 For a 64-bit bitmask try whether changing 16 bits to all ones or
2805 zeroes creates a valid bitmask. To check any repeated bitmask,
2806 try using 16 bits from the other 32-bit half of val. */
2807
2808 for (i = 0; i < 64; i += 16, mask <<= 16)
2809 {
2810 val2 = val & ~mask;
2811 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2812 break;
2813 val2 = val | mask;
2814 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2815 break;
2816 val2 = val2 & ~mask;
2817 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2818 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2819 break;
2820 }
2821 if (i != 64)
2822 {
2823 if (generate)
2824 {
2825 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2826 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2827 GEN_INT ((val >> i) & 0xffff)));
2828 }
2829 return 2;
2830 }
2831 }
2832
2833 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2834 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2835 otherwise skip zero bits. */
2836
2837 num_insns = 1;
2838 mask = 0xffff;
2839 val2 = one_match > zero_match ? ~val : val;
2840 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2841
2842 if (generate)
2843 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2844 ? (val | ~(mask << i))
2845 : (val & (mask << i)))));
2846 for (i += 16; i < 64; i += 16)
2847 {
2848 if ((val2 & (mask << i)) == 0)
2849 continue;
2850 if (generate)
2851 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2852 GEN_INT ((val >> i) & 0xffff)));
2853 num_insns ++;
2854 }
2855
2856 return num_insns;
2857 }
2858
2859 /* Return whether imm is a 128-bit immediate which is simple enough to
2860 expand inline. */
2861 bool
2862 aarch64_mov128_immediate (rtx imm)
2863 {
2864 if (GET_CODE (imm) == CONST_INT)
2865 return true;
2866
2867 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2868
2869 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2870 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2871
2872 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2873 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2874 }
2875
2876
2877 /* Return the number of temporary registers that aarch64_add_offset_1
2878 would need to add OFFSET to a register. */
2879
2880 static unsigned int
2881 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2882 {
2883 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2884 }
2885
2886 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2887 a non-polynomial OFFSET. MODE is the mode of the addition.
2888 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2889 be set and CFA adjustments added to the generated instructions.
2890
2891 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2892 temporary if register allocation is already complete. This temporary
2893 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2894 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2895 the immediate again.
2896
2897 Since this function may be used to adjust the stack pointer, we must
2898 ensure that it cannot cause transient stack deallocation (for example
2899 by first incrementing SP and then decrementing when adjusting by a
2900 large immediate). */
2901
2902 static void
2903 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2904 rtx src, HOST_WIDE_INT offset, rtx temp1,
2905 bool frame_related_p, bool emit_move_imm)
2906 {
2907 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2908 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2909
2910 HOST_WIDE_INT moffset = abs_hwi (offset);
2911 rtx_insn *insn;
2912
2913 if (!moffset)
2914 {
2915 if (!rtx_equal_p (dest, src))
2916 {
2917 insn = emit_insn (gen_rtx_SET (dest, src));
2918 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2919 }
2920 return;
2921 }
2922
2923 /* Single instruction adjustment. */
2924 if (aarch64_uimm12_shift (moffset))
2925 {
2926 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2927 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2928 return;
2929 }
2930
2931 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2932 and either:
2933
2934 a) the offset cannot be loaded by a 16-bit move or
2935 b) there is no spare register into which we can move it. */
2936 if (moffset < 0x1000000
2937 && ((!temp1 && !can_create_pseudo_p ())
2938 || !aarch64_move_imm (moffset, mode)))
2939 {
2940 HOST_WIDE_INT low_off = moffset & 0xfff;
2941
2942 low_off = offset < 0 ? -low_off : low_off;
2943 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2944 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2945 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2946 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2947 return;
2948 }
2949
2950 /* Emit a move immediate if required and an addition/subtraction. */
2951 if (emit_move_imm)
2952 {
2953 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2954 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2955 }
2956 insn = emit_insn (offset < 0
2957 ? gen_sub3_insn (dest, src, temp1)
2958 : gen_add3_insn (dest, src, temp1));
2959 if (frame_related_p)
2960 {
2961 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2962 rtx adj = plus_constant (mode, src, offset);
2963 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2964 }
2965 }
2966
2967 /* Return the number of temporary registers that aarch64_add_offset
2968 would need to move OFFSET into a register or add OFFSET to a register;
2969 ADD_P is true if we want the latter rather than the former. */
2970
2971 static unsigned int
2972 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2973 {
2974 /* This follows the same structure as aarch64_add_offset. */
2975 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2976 return 0;
2977
2978 unsigned int count = 0;
2979 HOST_WIDE_INT factor = offset.coeffs[1];
2980 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2981 poly_int64 poly_offset (factor, factor);
2982 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2983 /* Need one register for the ADDVL/ADDPL result. */
2984 count += 1;
2985 else if (factor != 0)
2986 {
2987 factor = abs (factor);
2988 if (factor > 16 * (factor & -factor))
2989 /* Need one register for the CNT result and one for the multiplication
2990 factor. If necessary, the second temporary can be reused for the
2991 constant part of the offset. */
2992 return 2;
2993 /* Need one register for the CNT result (which might then
2994 be shifted). */
2995 count += 1;
2996 }
2997 return count + aarch64_add_offset_1_temporaries (constant);
2998 }
2999
3000 /* If X can be represented as a poly_int64, return the number
3001 of temporaries that are required to add it to a register.
3002 Return -1 otherwise. */
3003
3004 int
3005 aarch64_add_offset_temporaries (rtx x)
3006 {
3007 poly_int64 offset;
3008 if (!poly_int_rtx_p (x, &offset))
3009 return -1;
3010 return aarch64_offset_temporaries (true, offset);
3011 }
3012
3013 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3014 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3015 be set and CFA adjustments added to the generated instructions.
3016
3017 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3018 temporary if register allocation is already complete. This temporary
3019 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3020 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3021 false to avoid emitting the immediate again.
3022
3023 TEMP2, if nonnull, is a second temporary register that doesn't
3024 overlap either DEST or SRC.
3025
3026 Since this function may be used to adjust the stack pointer, we must
3027 ensure that it cannot cause transient stack deallocation (for example
3028 by first incrementing SP and then decrementing when adjusting by a
3029 large immediate). */
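/* Illustrative example (not from the original sources): adding the
poly_int64 (48, 16) -- one full vector plus 32 bytes -- is split below
into a VG-based part (16, 16), handled by a single ADDVL #1, and a
constant part of 32, handled by aarch64_add_offset_1, giving a typical
two-instruction sequence.  */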
3030
3031 static void
3032 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3033 poly_int64 offset, rtx temp1, rtx temp2,
3034 bool frame_related_p, bool emit_move_imm = true)
3035 {
3036 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3037 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3038 gcc_assert (temp1 == NULL_RTX
3039 || !frame_related_p
3040 || !reg_overlap_mentioned_p (temp1, dest));
3041 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3042
3043 /* Try using ADDVL or ADDPL to add the whole value. */
3044 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3045 {
3046 rtx offset_rtx = gen_int_mode (offset, mode);
3047 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3048 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3049 return;
3050 }
3051
3052 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3053 SVE vector register, over and above the minimum size of 128 bits.
3054 This is equivalent to half the value returned by CNTD with a
3055 vector shape of ALL. */
3056 HOST_WIDE_INT factor = offset.coeffs[1];
3057 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3058
3059 /* Try using ADDVL or ADDPL to add the VG-based part. */
3060 poly_int64 poly_offset (factor, factor);
3061 if (src != const0_rtx
3062 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3063 {
3064 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3065 if (frame_related_p)
3066 {
3067 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3068 RTX_FRAME_RELATED_P (insn) = true;
3069 src = dest;
3070 }
3071 else
3072 {
3073 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3074 src = aarch64_force_temporary (mode, temp1, addr);
3075 temp1 = temp2;
3076 temp2 = NULL_RTX;
3077 }
3078 }
3079 /* Otherwise use a CNT-based sequence. */
3080 else if (factor != 0)
3081 {
3082 /* Use a subtraction if we have a negative factor. */
3083 rtx_code code = PLUS;
3084 if (factor < 0)
3085 {
3086 factor = -factor;
3087 code = MINUS;
3088 }
3089
3090 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3091 into the multiplication. */
3092 rtx val;
3093 int shift = 0;
3094 if (factor & 1)
3095 /* Use a right shift by 1. */
3096 shift = -1;
3097 else
3098 factor /= 2;
3099 HOST_WIDE_INT low_bit = factor & -factor;
3100 if (factor <= 16 * low_bit)
3101 {
3102 if (factor > 16 * 8)
3103 {
3104 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3105 the value with the minimum multiplier and shift it into
3106 position. */
3107 int extra_shift = exact_log2 (low_bit);
3108 shift += extra_shift;
3109 factor >>= extra_shift;
3110 }
3111 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3112 }
3113 else
3114 {
3115 /* Use CNTD, then multiply it by FACTOR. */
3116 val = gen_int_mode (poly_int64 (2, 2), mode);
3117 val = aarch64_force_temporary (mode, temp1, val);
3118
3119 /* Go back to using a negative multiplication factor if we have
3120 no register from which to subtract. */
3121 if (code == MINUS && src == const0_rtx)
3122 {
3123 factor = -factor;
3124 code = PLUS;
3125 }
3126 rtx coeff1 = gen_int_mode (factor, mode);
3127 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3128 val = gen_rtx_MULT (mode, val, coeff1);
3129 }
3130
3131 if (shift > 0)
3132 {
3133 /* Multiply by 1 << SHIFT. */
3134 val = aarch64_force_temporary (mode, temp1, val);
3135 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3136 }
3137 else if (shift == -1)
3138 {
3139 /* Divide by 2. */
3140 val = aarch64_force_temporary (mode, temp1, val);
3141 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3142 }
3143
3144 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3145 if (src != const0_rtx)
3146 {
3147 val = aarch64_force_temporary (mode, temp1, val);
3148 val = gen_rtx_fmt_ee (code, mode, src, val);
3149 }
3150 else if (code == MINUS)
3151 {
3152 val = aarch64_force_temporary (mode, temp1, val);
3153 val = gen_rtx_NEG (mode, val);
3154 }
3155
3156 if (constant == 0 || frame_related_p)
3157 {
3158 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3159 if (frame_related_p)
3160 {
3161 RTX_FRAME_RELATED_P (insn) = true;
3162 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3163 gen_rtx_SET (dest, plus_constant (Pmode, src,
3164 poly_offset)));
3165 }
3166 src = dest;
3167 if (constant == 0)
3168 return;
3169 }
3170 else
3171 {
3172 src = aarch64_force_temporary (mode, temp1, val);
3173 temp1 = temp2;
3174 temp2 = NULL_RTX;
3175 }
3176
3177 emit_move_imm = true;
3178 }
3179
3180 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3181 frame_related_p, emit_move_imm);
3182 }
3183
3184 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3185 than a poly_int64. */
3186
3187 void
3188 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3189 rtx offset_rtx, rtx temp1, rtx temp2)
3190 {
3191 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3192 temp1, temp2, false);
3193 }
3194
3195 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3196 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3197 if TEMP1 already contains abs (DELTA). */
3198
3199 static inline void
3200 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3201 {
3202 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3203 temp1, temp2, true, emit_move_imm);
3204 }
3205
3206 /* Subtract DELTA from the stack pointer, marking the instructions
3207 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3208 if nonnull. */
3209
3210 static inline void
3211 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3212 bool emit_move_imm = true)
3213 {
3214 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3215 temp1, temp2, frame_related_p, emit_move_imm);
3216 }
3217
3218 /* Set DEST to (vec_series BASE STEP). */
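/* For example (illustrative): a VNx4SI series with BASE 0 and STEP 1
expands to the SVE instruction "index z<n>.s, #0, #1", since both values
fit in the immediate range [-16, 15] checked below.  */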
3219
3220 static void
3221 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3222 {
3223 machine_mode mode = GET_MODE (dest);
3224 scalar_mode inner = GET_MODE_INNER (mode);
3225
3226 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3227 if (!aarch64_sve_index_immediate_p (base))
3228 base = force_reg (inner, base);
3229 if (!aarch64_sve_index_immediate_p (step))
3230 step = force_reg (inner, step);
3231
3232 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3233 }
3234
3235 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3236 integer of mode SRC_MODE. Return true on success. */
3237
3238 static bool
3239 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3240 rtx src)
3241 {
3242 /* If the constant is smaller than 128 bits, we can do the move
3243 using a vector of SRC_MODEs. */
3244 if (src_mode != TImode)
3245 {
3246 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3247 GET_MODE_SIZE (src_mode));
3248 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3249 emit_move_insn (gen_lowpart (dup_mode, dest),
3250 gen_const_vec_duplicate (dup_mode, src));
3251 return true;
3252 }
3253
3254 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3255 src = force_const_mem (src_mode, src);
3256 if (!src)
3257 return false;
3258
3259 /* Make sure that the address is legitimate. */
3260 if (!aarch64_sve_ld1r_operand_p (src))
3261 {
3262 rtx addr = force_reg (Pmode, XEXP (src, 0));
3263 src = replace_equiv_address (src, addr);
3264 }
3265
3266 machine_mode mode = GET_MODE (dest);
3267 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3268 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3269 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3270 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3271 emit_insn (gen_rtx_SET (dest, src));
3272 return true;
3273 }
3274
3275 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3276 isn't a simple duplicate or series. */
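/* Illustrative examples (not from the original sources): on little-endian
targets the repeating VNx4SI constant { 1, 2, 1, 2, ... } has npatterns == 2
and nelts_per_pattern == 1, so it is handled by duplicating the 64-bit
value 0x0000000200000001; a constant such as { 0, 1, 0, 2, 0, 3, ... }
instead expands each pattern separately (a duplicate of zero and an INDEX)
and interleaves the results with ZIP1.  */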
3277
3278 static void
3279 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3280 {
3281 machine_mode mode = GET_MODE (src);
3282 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3283 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3284 gcc_assert (npatterns > 1);
3285
3286 if (nelts_per_pattern == 1)
3287 {
3288 /* The constant is a repeating sequence of at least two elements,
3289 where the repeating elements occupy no more than 128 bits.
3290 Get an integer representation of the replicated value. */
3291 scalar_int_mode int_mode;
3292 if (BYTES_BIG_ENDIAN)
3293 /* For now, always use LD1RQ to load the value on big-endian
3294 targets, since the handling of smaller integers includes a
3295 subreg that is semantically an element reverse. */
3296 int_mode = TImode;
3297 else
3298 {
3299 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3300 gcc_assert (int_bits <= 128);
3301 int_mode = int_mode_for_size (int_bits, 0).require ();
3302 }
3303 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3304 if (int_value
3305 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3306 return;
3307 }
3308
3309 /* Expand each pattern individually. */
3310 rtx_vector_builder builder;
3311 auto_vec<rtx, 16> vectors (npatterns);
3312 for (unsigned int i = 0; i < npatterns; ++i)
3313 {
3314 builder.new_vector (mode, 1, nelts_per_pattern);
3315 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3316 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3317 vectors.quick_push (force_reg (mode, builder.build ()));
3318 }
3319
3320 /* Use permutes to interleave the separate vectors. */
3321 while (npatterns > 1)
3322 {
3323 npatterns /= 2;
3324 for (unsigned int i = 0; i < npatterns; ++i)
3325 {
3326 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3327 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3328 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3329 vectors[i] = tmp;
3330 }
3331 }
3332 gcc_assert (vectors[0] == dest);
3333 }
3334
3335 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3336 is a pattern that can be used to set DEST to a replicated scalar
3337 element. */
3338
3339 void
3340 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3341 rtx (*gen_vec_duplicate) (rtx, rtx))
3342 {
3343 machine_mode mode = GET_MODE (dest);
3344
3345 /* Check on what type of symbol it is. */
3346 scalar_int_mode int_mode;
3347 if ((GET_CODE (imm) == SYMBOL_REF
3348 || GET_CODE (imm) == LABEL_REF
3349 || GET_CODE (imm) == CONST
3350 || GET_CODE (imm) == CONST_POLY_INT)
3351 && is_a <scalar_int_mode> (mode, &int_mode))
3352 {
3353 rtx mem;
3354 poly_int64 offset;
3355 HOST_WIDE_INT const_offset;
3356 enum aarch64_symbol_type sty;
3357
3358 /* If we have (const (plus symbol offset)), separate out the offset
3359 before we start classifying the symbol. */
3360 rtx base = strip_offset (imm, &offset);
3361
3362 /* We must always add an offset involving VL separately, rather than
3363 folding it into the relocation. */
3364 if (!offset.is_constant (&const_offset))
3365 {
3366 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3367 emit_insn (gen_rtx_SET (dest, imm));
3368 else
3369 {
3370 /* Do arithmetic on 32-bit values if the result is smaller
3371 than that. */
3372 if (partial_subreg_p (int_mode, SImode))
3373 {
3374 /* It is invalid to do symbol calculations in modes
3375 narrower than SImode. */
3376 gcc_assert (base == const0_rtx);
3377 dest = gen_lowpart (SImode, dest);
3378 int_mode = SImode;
3379 }
3380 if (base != const0_rtx)
3381 {
3382 base = aarch64_force_temporary (int_mode, dest, base);
3383 aarch64_add_offset (int_mode, dest, base, offset,
3384 NULL_RTX, NULL_RTX, false);
3385 }
3386 else
3387 aarch64_add_offset (int_mode, dest, base, offset,
3388 dest, NULL_RTX, false);
3389 }
3390 return;
3391 }
3392
3393 sty = aarch64_classify_symbol (base, const_offset);
3394 switch (sty)
3395 {
3396 case SYMBOL_FORCE_TO_MEM:
3397 if (const_offset != 0
3398 && targetm.cannot_force_const_mem (int_mode, imm))
3399 {
3400 gcc_assert (can_create_pseudo_p ());
3401 base = aarch64_force_temporary (int_mode, dest, base);
3402 aarch64_add_offset (int_mode, dest, base, const_offset,
3403 NULL_RTX, NULL_RTX, false);
3404 return;
3405 }
3406
3407 mem = force_const_mem (ptr_mode, imm);
3408 gcc_assert (mem);
3409
3410 /* If we aren't generating PC relative literals, then
3411 we need to expand the literal pool access carefully.
3412 This is something that needs to be done in a number
3413 of places, so could well live as a separate function. */
3414 if (!aarch64_pcrelative_literal_loads)
3415 {
3416 gcc_assert (can_create_pseudo_p ());
3417 base = gen_reg_rtx (ptr_mode);
3418 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3419 if (ptr_mode != Pmode)
3420 base = convert_memory_address (Pmode, base);
3421 mem = gen_rtx_MEM (ptr_mode, base);
3422 }
3423
3424 if (int_mode != ptr_mode)
3425 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3426
3427 emit_insn (gen_rtx_SET (dest, mem));
3428
3429 return;
3430
3431 case SYMBOL_SMALL_TLSGD:
3432 case SYMBOL_SMALL_TLSDESC:
3433 case SYMBOL_SMALL_TLSIE:
3434 case SYMBOL_SMALL_GOT_28K:
3435 case SYMBOL_SMALL_GOT_4G:
3436 case SYMBOL_TINY_GOT:
3437 case SYMBOL_TINY_TLSIE:
3438 if (const_offset != 0)
3439 {
3440 gcc_assert (can_create_pseudo_p ());
3441 base = aarch64_force_temporary (int_mode, dest, base);
3442 aarch64_add_offset (int_mode, dest, base, const_offset,
3443 NULL_RTX, NULL_RTX, false);
3444 return;
3445 }
3446 /* FALLTHRU */
3447
3448 case SYMBOL_SMALL_ABSOLUTE:
3449 case SYMBOL_TINY_ABSOLUTE:
3450 case SYMBOL_TLSLE12:
3451 case SYMBOL_TLSLE24:
3452 case SYMBOL_TLSLE32:
3453 case SYMBOL_TLSLE48:
3454 aarch64_load_symref_appropriately (dest, imm, sty);
3455 return;
3456
3457 default:
3458 gcc_unreachable ();
3459 }
3460 }
3461
3462 if (!CONST_INT_P (imm))
3463 {
3464 rtx base, step, value;
3465 if (GET_CODE (imm) == HIGH
3466 || aarch64_simd_valid_immediate (imm, NULL))
3467 emit_insn (gen_rtx_SET (dest, imm));
3468 else if (const_vec_series_p (imm, &base, &step))
3469 aarch64_expand_vec_series (dest, base, step);
3470 else if (const_vec_duplicate_p (imm, &value))
3471 {
3472 /* If the constant is out of range of an SVE vector move,
3473 load it from memory if we can, otherwise move it into
3474 a register and use a DUP. */
3475 scalar_mode inner_mode = GET_MODE_INNER (mode);
3476 rtx op = force_const_mem (inner_mode, value);
3477 if (!op)
3478 op = force_reg (inner_mode, value);
3479 else if (!aarch64_sve_ld1r_operand_p (op))
3480 {
3481 rtx addr = force_reg (Pmode, XEXP (op, 0));
3482 op = replace_equiv_address (op, addr);
3483 }
3484 emit_insn (gen_vec_duplicate (dest, op));
3485 }
3486 else if (GET_CODE (imm) == CONST_VECTOR
3487 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3488 aarch64_expand_sve_const_vector (dest, imm);
3489 else
3490 {
3491 rtx mem = force_const_mem (mode, imm);
3492 gcc_assert (mem);
3493 emit_move_insn (dest, mem);
3494 }
3495
3496 return;
3497 }
3498
3499 aarch64_internal_mov_immediate (dest, imm, true,
3500 as_a <scalar_int_mode> (mode));
3501 }
3502
3503 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3504 that is known to contain PTRUE. */
3505
3506 void
3507 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3508 {
3509 expand_operand ops[3];
3510 machine_mode mode = GET_MODE (dest);
3511 create_output_operand (&ops[0], dest, mode);
3512 create_input_operand (&ops[1], pred, GET_MODE (pred));
3513 create_input_operand (&ops[2], src, mode);
3514 temporary_volatile_ok v (true);
3515 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3516 }
3517
3518 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3519 operand is in memory. In this case we need to use the predicated LD1
3520 and ST1 instead of LDR and STR, both for correctness on big-endian
3521 targets and because LD1 and ST1 support a wider range of addressing modes.
3522 PRED_MODE is the mode of the predicate.
3523
3524 See the comment at the head of aarch64-sve.md for details about the
3525 big-endian handling. */
3526
3527 void
3528 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3529 {
3530 machine_mode mode = GET_MODE (dest);
3531 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3532 if (!register_operand (src, mode)
3533 && !register_operand (dest, mode))
3534 {
3535 rtx tmp = gen_reg_rtx (mode);
3536 if (MEM_P (src))
3537 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3538 else
3539 emit_move_insn (tmp, src);
3540 src = tmp;
3541 }
3542 aarch64_emit_sve_pred_move (dest, ptrue, src);
3543 }
3544
3545 /* Called only on big-endian targets. See whether an SVE vector move
3546 from SRC to DEST is effectively a REV[BHW] instruction, because at
3547 least one operand is a subreg of an SVE vector that has wider or
3548 narrower elements. Return true and emit the instruction if so.
3549
3550 For example:
3551
3552 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3553
3554 represents a VIEW_CONVERT between the following vectors, viewed
3555 in memory order:
3556
3557 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3558 R1: { [0], [1], [2], [3], ... }
3559
3560 The high part of lane X in R2 should therefore correspond to lane X*2
3561 of R1, but the register representations are:
3562
3563 msb lsb
3564 R2: ...... [1].high [1].low [0].high [0].low
3565 R1: ...... [3] [2] [1] [0]
3566
3567 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3568 We therefore need a reverse operation to swap the high and low values
3569 around.
3570
3571 This is purely an optimization. Without it we would spill the
3572 subreg operand to the stack in one mode and reload it in the
3573 other mode, which has the same effect as the REV. */
3574
3575 bool
3576 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3577 {
3578 gcc_assert (BYTES_BIG_ENDIAN);
3579 if (GET_CODE (dest) == SUBREG)
3580 dest = SUBREG_REG (dest);
3581 if (GET_CODE (src) == SUBREG)
3582 src = SUBREG_REG (src);
3583
3584 /* The optimization handles two single SVE REGs with different element
3585 sizes. */
3586 if (!REG_P (dest)
3587 || !REG_P (src)
3588 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3589 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3590 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3591 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3592 return false;
3593
3594 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3595 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
3596 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3597 UNSPEC_REV_SUBREG);
3598 emit_insn (gen_rtx_SET (dest, unspec));
3599 return true;
3600 }
3601
3602 /* Return a copy of X with mode MODE, without changing its other
3603 attributes. Unlike gen_lowpart, this doesn't care whether the
3604 mode change is valid. */
3605
3606 static rtx
3607 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3608 {
3609 if (GET_MODE (x) == mode)
3610 return x;
3611
3612 x = shallow_copy_rtx (x);
3613 set_mode_and_regno (x, mode, REGNO (x));
3614 return x;
3615 }
3616
3617 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3618 operands. */
3619
3620 void
3621 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3622 {
3623 /* Decide which REV operation we need. The mode with narrower elements
3624 determines the mode of the operands and the mode with the wider
3625 elements determines the reverse width. */
3626 machine_mode mode_with_wider_elts = GET_MODE (dest);
3627 machine_mode mode_with_narrower_elts = GET_MODE (src);
3628 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3629 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3630 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3631
3632 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3633 unsigned int unspec;
3634 if (wider_bytes == 8)
3635 unspec = UNSPEC_REV64;
3636 else if (wider_bytes == 4)
3637 unspec = UNSPEC_REV32;
3638 else if (wider_bytes == 2)
3639 unspec = UNSPEC_REV16;
3640 else
3641 gcc_unreachable ();
3642 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3643
3644 /* Emit:
3645
3646 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3647 UNSPEC_MERGE_PTRUE))
3648
3649 with the appropriate modes. */
3650 ptrue = gen_lowpart (pred_mode, ptrue);
3651 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3652 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3653 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3654 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3655 UNSPEC_MERGE_PTRUE);
3656 emit_insn (gen_rtx_SET (dest, src));
3657 }
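
/* For example (illustrative only): splitting a big-endian bitcast between
   VNx16QI and VNx8HI reduces to a byte reverse within each 16-bit element,
   something like

	revb	z0.h, p0/m, z1.h

   with the analogous REVH/REVW forms used for the UNSPEC_REV32 and
   UNSPEC_REV64 cases selected above.  */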
3658
3659 static bool
3660 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3661 tree exp ATTRIBUTE_UNUSED)
3662 {
3663 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3664 return false;
3665
3666 return true;
3667 }
3668
3669 /* Implement TARGET_PASS_BY_REFERENCE. */
3670
3671 static bool
3672 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3673 machine_mode mode,
3674 const_tree type,
3675 bool named ATTRIBUTE_UNUSED)
3676 {
3677 HOST_WIDE_INT size;
3678 machine_mode dummymode;
3679 int nregs;
3680
3681 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3682 if (mode == BLKmode && type)
3683 size = int_size_in_bytes (type);
3684 else
3685 /* No frontends can create types with variable-sized modes, so we
3686 shouldn't be asked to pass or return them. */
3687 size = GET_MODE_SIZE (mode).to_constant ();
3688
3689 /* Aggregates are passed by reference based on their size. */
3690 if (type && AGGREGATE_TYPE_P (type))
3691 {
3692 size = int_size_in_bytes (type);
3693 }
3694
3695 /* Variable-sized arguments are always passed by reference. */
3696 if (size < 0)
3697 return true;
3698
3699 /* Can this be a candidate to be passed in fp/simd register(s)? */
3700 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3701 &dummymode, &nregs,
3702 NULL))
3703 return false;
3704
3705 /* Arguments which are variable sized or larger than 2 registers are
3706 passed by reference unless they are a homogeneous floating-point
3707 aggregate. */
3708 return size > 2 * UNITS_PER_WORD;
3709 }
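
/* To make the size rule concrete (illustrative, not exhaustive): a plain
   32-byte structure such as

	struct big { long long a, b, c, d; };

   exceeds 2 * UNITS_PER_WORD and is passed by reference, whereas a 32-byte
   homogeneous floating-point aggregate such as

	struct hfa { double a, b, c, d; };

   is a candidate for the FP/SIMD registers and is passed by value.  */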
3710
3711 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3712 static bool
3713 aarch64_return_in_msb (const_tree valtype)
3714 {
3715 machine_mode dummy_mode;
3716 int dummy_int;
3717
3718 /* Never happens in little-endian mode. */
3719 if (!BYTES_BIG_ENDIAN)
3720 return false;
3721
3722 /* Only composite types smaller than or equal to 16 bytes can
3723 be potentially returned in registers. */
3724 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3725 || int_size_in_bytes (valtype) <= 0
3726 || int_size_in_bytes (valtype) > 16)
3727 return false;
3728
3729 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3730 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3731 is always passed/returned in the least significant bits of fp/simd
3732 register(s). */
3733 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3734 &dummy_mode, &dummy_int, NULL))
3735 return false;
3736
3737 return true;
3738 }
3739
3740 /* Implement TARGET_FUNCTION_VALUE.
3741 Define how to find the value returned by a function. */
3742
3743 static rtx
3744 aarch64_function_value (const_tree type, const_tree func,
3745 bool outgoing ATTRIBUTE_UNUSED)
3746 {
3747 machine_mode mode;
3748 int unsignedp;
3749 int count;
3750 machine_mode ag_mode;
3751
3752 mode = TYPE_MODE (type);
3753 if (INTEGRAL_TYPE_P (type))
3754 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3755
3756 if (aarch64_return_in_msb (type))
3757 {
3758 HOST_WIDE_INT size = int_size_in_bytes (type);
3759
3760 if (size % UNITS_PER_WORD != 0)
3761 {
3762 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3763 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3764 }
3765 }
3766
3767 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3768 &ag_mode, &count, NULL))
3769 {
3770 if (!aarch64_composite_type_p (type, mode))
3771 {
3772 gcc_assert (count == 1 && mode == ag_mode);
3773 return gen_rtx_REG (mode, V0_REGNUM);
3774 }
3775 else
3776 {
3777 int i;
3778 rtx par;
3779
3780 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3781 for (i = 0; i < count; i++)
3782 {
3783 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3784 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3785 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3786 XVECEXP (par, 0, i) = tmp;
3787 }
3788 return par;
3789 }
3790 }
3791 else
3792 return gen_rtx_REG (mode, R0_REGNUM);
3793 }
3794
3795 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3796 Return true if REGNO is the number of a hard register in which the values
3797 of called function may come back. */
3798
3799 static bool
3800 aarch64_function_value_regno_p (const unsigned int regno)
3801 {
3802 /* A maximum of 16 bytes can be returned in the general registers. Examples
3803 of 16-byte return values are: 128-bit integers and 16-byte small
3804 structures (excluding homogeneous floating-point aggregates). */
3805 if (regno == R0_REGNUM || regno == R1_REGNUM)
3806 return true;
3807
3808 /* Up to four fp/simd registers can return a function value, e.g. a
3809 homogeneous floating-point aggregate having four members. */
3810 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3811 return TARGET_FLOAT;
3812
3813 return false;
3814 }
3815
3816 /* Implement TARGET_RETURN_IN_MEMORY.
3817
3818 If the type T of the result of a function is such that
3819 void func (T arg)
3820 would require that arg be passed as a value in a register (or set of
3821 registers) according to the parameter passing rules, then the result
3822 is returned in the same registers as would be used for such an
3823 argument. */
3824
3825 static bool
3826 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3827 {
3828 HOST_WIDE_INT size;
3829 machine_mode ag_mode;
3830 int count;
3831
3832 if (!AGGREGATE_TYPE_P (type)
3833 && TREE_CODE (type) != COMPLEX_TYPE
3834 && TREE_CODE (type) != VECTOR_TYPE)
3835 /* Simple scalar types are always returned in registers. */
3836 return false;
3837
3838 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3839 type,
3840 &ag_mode,
3841 &count,
3842 NULL))
3843 return false;
3844
3845 /* Types larger than 2 registers are returned in memory. */
3846 size = int_size_in_bytes (type);
3847 return (size < 0 || size > 2 * UNITS_PER_WORD);
3848 }
3849
3850 static bool
3851 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3852 const_tree type, int *nregs)
3853 {
3854 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3855 return aarch64_vfp_is_call_or_return_candidate (mode,
3856 type,
3857 &pcum->aapcs_vfp_rmode,
3858 nregs,
3859 NULL);
3860 }
3861
3862 /* Given MODE and TYPE of a function argument, return the alignment in
3863 bits. The idea is to suppress any stronger alignment requested by
3864 the user and opt for the natural alignment (specified in AAPCS64 \S
3865 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3866 calculated in versions of GCC prior to GCC-9. This is a helper
3867 function for local use only. */
3868
3869 static unsigned int
3870 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3871 bool *abi_break)
3872 {
3873 *abi_break = false;
3874 if (!type)
3875 return GET_MODE_ALIGNMENT (mode);
3876
3877 if (integer_zerop (TYPE_SIZE (type)))
3878 return 0;
3879
3880 gcc_assert (TYPE_MODE (type) == mode);
3881
3882 if (!AGGREGATE_TYPE_P (type))
3883 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3884
3885 if (TREE_CODE (type) == ARRAY_TYPE)
3886 return TYPE_ALIGN (TREE_TYPE (type));
3887
3888 unsigned int alignment = 0;
3889 unsigned int bitfield_alignment = 0;
3890 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3891 if (TREE_CODE (field) == FIELD_DECL)
3892 {
3893 alignment = std::max (alignment, DECL_ALIGN (field));
3894 if (DECL_BIT_FIELD_TYPE (field))
3895 bitfield_alignment
3896 = std::max (bitfield_alignment,
3897 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3898 }
3899
3900 if (bitfield_alignment > alignment)
3901 {
3902 *abi_break = true;
3903 return bitfield_alignment;
3904 }
3905
3906 return alignment;
3907 }
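
/* A hypothetical example of the ABI_BREAK case (not from the original
   sources): in a structure such as

	struct s { long long x : 8; } __attribute__ ((packed));

   the alignment of the bit-field's declared type (8 bytes here) exceeds
   the alignment of the field itself, so BITFIELD_ALIGNMENT wins and
   *ABI_BREAK records that releases before GCC 9 would have used the
   smaller value.  */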
3908
3909 /* Layout a function argument according to the AAPCS64 rules. The rule
3910 numbers refer to the rule numbers in the AAPCS64. */
3911
3912 static void
3913 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3914 const_tree type,
3915 bool named ATTRIBUTE_UNUSED)
3916 {
3917 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3918 int ncrn, nvrn, nregs;
3919 bool allocate_ncrn, allocate_nvrn;
3920 HOST_WIDE_INT size;
3921 bool abi_break;
3922
3923 /* We need to do this once per argument. */
3924 if (pcum->aapcs_arg_processed)
3925 return;
3926
3927 pcum->aapcs_arg_processed = true;
3928
3929 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3930 if (type)
3931 size = int_size_in_bytes (type);
3932 else
3933 /* No frontends can create types with variable-sized modes, so we
3934 shouldn't be asked to pass or return them. */
3935 size = GET_MODE_SIZE (mode).to_constant ();
3936 size = ROUND_UP (size, UNITS_PER_WORD);
3937
3938 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3939 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3940 mode,
3941 type,
3942 &nregs);
3943
3944 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3945 The following code thus handles passing by SIMD/FP registers first. */
3946
3947 nvrn = pcum->aapcs_nvrn;
3948
3949 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3950 and homogeneous short-vector aggregates (HVA). */
3951 if (allocate_nvrn)
3952 {
3953 if (!TARGET_FLOAT)
3954 aarch64_err_no_fpadvsimd (mode);
3955
3956 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3957 {
3958 pcum->aapcs_nextnvrn = nvrn + nregs;
3959 if (!aarch64_composite_type_p (type, mode))
3960 {
3961 gcc_assert (nregs == 1);
3962 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3963 }
3964 else
3965 {
3966 rtx par;
3967 int i;
3968 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3969 for (i = 0; i < nregs; i++)
3970 {
3971 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3972 V0_REGNUM + nvrn + i);
3973 rtx offset = gen_int_mode
3974 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3975 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3976 XVECEXP (par, 0, i) = tmp;
3977 }
3978 pcum->aapcs_reg = par;
3979 }
3980 return;
3981 }
3982 else
3983 {
3984 /* C.3 NSRN is set to 8. */
3985 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3986 goto on_stack;
3987 }
3988 }
3989
3990 ncrn = pcum->aapcs_ncrn;
3991 nregs = size / UNITS_PER_WORD;
3992
3993 /* C6 - C9, though the sign and zero extension semantics are
3994 handled elsewhere. This is the case where the argument fits
3995 entirely in general registers. */
3996 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3997 {
3998 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3999
4000 /* C.8 if the argument has an alignment of 16 then the NGRN is
4001 rounded up to the next even number. */
4002 if (nregs == 2
4003 && ncrn % 2
4004 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4005 comparison is there because for > 16 * BITS_PER_UNIT
4006 alignment nregs should be > 2 and therefore it should be
4007 passed by reference rather than value. */
4008 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4009 == 16 * BITS_PER_UNIT))
4010 {
4011 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4012 inform (input_location, "parameter passing for argument of type "
4013 "%qT changed in GCC 9.1", type);
4014 ++ncrn;
4015 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4016 }
4017
4018 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4019 A reg is still generated for it, but the caller should be smart
4020 enough not to use it. */
4021 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4022 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4023 else
4024 {
4025 rtx par;
4026 int i;
4027
4028 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4029 for (i = 0; i < nregs; i++)
4030 {
4031 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4032 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4033 GEN_INT (i * UNITS_PER_WORD));
4034 XVECEXP (par, 0, i) = tmp;
4035 }
4036 pcum->aapcs_reg = par;
4037 }
4038
4039 pcum->aapcs_nextncrn = ncrn + nregs;
4040 return;
4041 }
4042
4043 /* C.11 */
4044 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4045
4046 /* The argument is passed on stack; record the needed number of words for
4047 this argument and align the total size if necessary. */
4048 on_stack:
4049 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4050
4051 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4052 == 16 * BITS_PER_UNIT)
4053 {
4054 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4055 if (pcum->aapcs_stack_size != new_size)
4056 {
4057 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4058 inform (input_location, "parameter passing for argument of type "
4059 "%qT changed in GCC 9.1", type);
4060 pcum->aapcs_stack_size = new_size;
4061 }
4062 }
4063 return;
4064 }
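
/* Worked example of rule C.8 above (illustrative): for

	void f (int a, __int128 b);

   A is allocated to W0, leaving NCRN = 1. B requires 16-byte alignment,
   so NCRN is rounded up to 2 and B is passed in the pair X2:X3, leaving
   X1 unused.  */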
4065
4066 /* Implement TARGET_FUNCTION_ARG. */
4067
4068 static rtx
4069 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4070 const_tree type, bool named)
4071 {
4072 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4073 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4074
4075 if (mode == VOIDmode)
4076 return NULL_RTX;
4077
4078 aarch64_layout_arg (pcum_v, mode, type, named);
4079 return pcum->aapcs_reg;
4080 }
4081
4082 void
4083 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4084 const_tree fntype ATTRIBUTE_UNUSED,
4085 rtx libname ATTRIBUTE_UNUSED,
4086 const_tree fndecl ATTRIBUTE_UNUSED,
4087 unsigned n_named ATTRIBUTE_UNUSED)
4088 {
4089 pcum->aapcs_ncrn = 0;
4090 pcum->aapcs_nvrn = 0;
4091 pcum->aapcs_nextncrn = 0;
4092 pcum->aapcs_nextnvrn = 0;
4093 pcum->pcs_variant = ARM_PCS_AAPCS64;
4094 pcum->aapcs_reg = NULL_RTX;
4095 pcum->aapcs_arg_processed = false;
4096 pcum->aapcs_stack_words = 0;
4097 pcum->aapcs_stack_size = 0;
4098
4099 if (!TARGET_FLOAT
4100 && fndecl && TREE_PUBLIC (fndecl)
4101 && fntype && fntype != error_mark_node)
4102 {
4103 const_tree type = TREE_TYPE (fntype);
4104 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4105 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4106 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4107 &mode, &nregs, NULL))
4108 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4109 }
4110 return;
4111 }
4112
4113 static void
4114 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4115 machine_mode mode,
4116 const_tree type,
4117 bool named)
4118 {
4119 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4120 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4121 {
4122 aarch64_layout_arg (pcum_v, mode, type, named);
4123 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4124 != (pcum->aapcs_stack_words != 0));
4125 pcum->aapcs_arg_processed = false;
4126 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4127 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4128 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4129 pcum->aapcs_stack_words = 0;
4130 pcum->aapcs_reg = NULL_RTX;
4131 }
4132 }
4133
4134 bool
4135 aarch64_function_arg_regno_p (unsigned regno)
4136 {
4137 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4138 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4139 }
4140
4141 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4142 PARM_BOUNDARY bits of alignment, but will be given anything up
4143 to STACK_BOUNDARY bits if the type requires it. This makes sure
4144 that both before and after the layout of each argument, the Next
4145 Stacked Argument Address (NSAA) will have a minimum alignment of
4146 8 bytes. */
4147
4148 static unsigned int
4149 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4150 {
4151 bool abi_break;
4152 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4153 &abi_break);
4154 if (abi_break && warn_psabi)
4155 inform (input_location, "parameter passing for argument of type "
4156 "%qT changed in GCC 9.1", type);
4157
4158 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4159 }
4160
4161 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4162
4163 static fixed_size_mode
4164 aarch64_get_reg_raw_mode (int regno)
4165 {
4166 if (TARGET_SVE && FP_REGNUM_P (regno))
4167 /* Don't use the SVE part of the register for __builtin_apply and
4168 __builtin_return. The SVE registers aren't used by the normal PCS,
4169 so using them there would be a waste of time. The PCS extensions
4170 for SVE types are fundamentally incompatible with the
4171 __builtin_return/__builtin_apply interface. */
4172 return as_a <fixed_size_mode> (V16QImode);
4173 return default_get_reg_raw_mode (regno);
4174 }
4175
4176 /* Implement TARGET_FUNCTION_ARG_PADDING.
4177
4178 Small aggregate types are placed in the lowest memory address.
4179
4180 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4181
4182 static pad_direction
4183 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4184 {
4185 /* On little-endian targets, the least significant byte of every stack
4186 argument is passed at the lowest byte address of the stack slot. */
4187 if (!BYTES_BIG_ENDIAN)
4188 return PAD_UPWARD;
4189
4190 /* Otherwise, integral, floating-point and pointer types are padded downward:
4191 the least significant byte of a stack argument is passed at the highest
4192 byte address of the stack slot. */
4193 if (type
4194 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4195 || POINTER_TYPE_P (type))
4196 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4197 return PAD_DOWNWARD;
4198
4199 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4200 return PAD_UPWARD;
4201 }
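
/* For example (illustrative): on a big-endian target a 'char' passed on
   the stack is placed in the highest byte of its 8-byte slot
   (PAD_DOWNWARD), whereas a small structure starts at the lowest byte of
   its slot (PAD_UPWARD).  */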
4202
4203 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4204
4205 It specifies padding for the last (and possibly the only)
4206 element of a block move between registers and memory. Viewing
4207 the block as it sits in memory, padding upward means that the
4208 last element is padded after its most significant byte, while
4209 with downward padding the last element is padded on its least
4210 significant byte side.
4211
4212 Small aggregates and small complex types are always padded
4213 upwards.
4214
4215 We don't need to worry about homogeneous floating-point or
4216 short-vector aggregates; their move is not affected by the
4217 padding direction determined here. Regardless of endianness,
4218 each element of such an aggregate is put in the least
4219 significant bits of a fp/simd register.
4220
4221 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4222 register has useful data, and return the opposite if the most
4223 significant byte does. */
4224
4225 bool
4226 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4227 bool first ATTRIBUTE_UNUSED)
4228 {
4229
4230 /* Small composite types are always padded upward. */
4231 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4232 {
4233 HOST_WIDE_INT size;
4234 if (type)
4235 size = int_size_in_bytes (type);
4236 else
4237 /* No frontends can create types with variable-sized modes, so we
4238 shouldn't be asked to pass or return them. */
4239 size = GET_MODE_SIZE (mode).to_constant ();
4240 if (size < 2 * UNITS_PER_WORD)
4241 return true;
4242 }
4243
4244 /* Otherwise, use the default padding. */
4245 return !BYTES_BIG_ENDIAN;
4246 }
4247
4248 static scalar_int_mode
4249 aarch64_libgcc_cmp_return_mode (void)
4250 {
4251 return SImode;
4252 }
4253
4254 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4255
4256 /* We use the 12-bit shifted immediate arithmetic instructions so values
4257 must be multiple of (1 << 12), i.e. 4096. */
4258 #define ARITH_FACTOR 4096
4259
4260 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4261 #error Cannot use simple address calculation for stack probing
4262 #endif
4263
4264 /* The pair of scratch registers used for stack probing. */
4265 #define PROBE_STACK_FIRST_REG R9_REGNUM
4266 #define PROBE_STACK_SECOND_REG R10_REGNUM
4267
4268 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4269 inclusive. These are offsets from the current stack pointer. */
4270
4271 static void
4272 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4273 {
4274 HOST_WIDE_INT size;
4275 if (!poly_size.is_constant (&size))
4276 {
4277 sorry ("stack probes for SVE frames");
4278 return;
4279 }
4280
4281 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4282
4283 /* See the same assertion on PROBE_INTERVAL above. */
4284 gcc_assert ((first % ARITH_FACTOR) == 0);
4285
4286 /* See if we have a constant small number of probes to generate. If so,
4287 that's the easy case. */
4288 if (size <= PROBE_INTERVAL)
4289 {
4290 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4291
4292 emit_set_insn (reg1,
4293 plus_constant (Pmode,
4294 stack_pointer_rtx, -(first + base)));
4295 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4296 }
4297
4298 /* The run-time loop is made up of 8 insns in the generic case while the
4299 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4300 else if (size <= 4 * PROBE_INTERVAL)
4301 {
4302 HOST_WIDE_INT i, rem;
4303
4304 emit_set_insn (reg1,
4305 plus_constant (Pmode,
4306 stack_pointer_rtx,
4307 -(first + PROBE_INTERVAL)));
4308 emit_stack_probe (reg1);
4309
4310 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4311 it exceeds SIZE. If only two probes are needed, this will not
4312 generate any code. Then probe at FIRST + SIZE. */
4313 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4314 {
4315 emit_set_insn (reg1,
4316 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4317 emit_stack_probe (reg1);
4318 }
4319
4320 rem = size - (i - PROBE_INTERVAL);
4321 if (rem > 256)
4322 {
4323 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4324
4325 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4326 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4327 }
4328 else
4329 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4330 }
4331
4332 /* Otherwise, do the same as above, but in a loop. Note that we must be
4333 extra careful with variables wrapping around because we might be at
4334 the very top (or the very bottom) of the address space and we have
4335 to be able to handle this case properly; in particular, we use an
4336 equality test for the loop condition. */
4337 else
4338 {
4339 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4340
4341 /* Step 1: round SIZE to the previous multiple of the interval. */
4342
4343 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4344
4345
4346 /* Step 2: compute initial and final value of the loop counter. */
4347
4348 /* TEST_ADDR = SP + FIRST. */
4349 emit_set_insn (reg1,
4350 plus_constant (Pmode, stack_pointer_rtx, -first));
4351
4352 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4353 HOST_WIDE_INT adjustment = - (first + rounded_size);
4354 if (! aarch64_uimm12_shift (adjustment))
4355 {
4356 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4357 true, Pmode);
4358 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4359 }
4360 else
4361 emit_set_insn (reg2,
4362 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4363
4364 /* Step 3: the loop
4365
4366 do
4367 {
4368 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4369 probe at TEST_ADDR
4370 }
4371 while (TEST_ADDR != LAST_ADDR)
4372
4373 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4374 until it is equal to ROUNDED_SIZE. */
4375
4376 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4377
4378
4379 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4380 that SIZE is equal to ROUNDED_SIZE. */
4381
4382 if (size != rounded_size)
4383 {
4384 HOST_WIDE_INT rem = size - rounded_size;
4385
4386 if (rem > 256)
4387 {
4388 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4389
4390 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4391 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4392 }
4393 else
4394 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4395 }
4396 }
4397
4398 /* Make sure nothing is scheduled before we are done. */
4399 emit_insn (gen_blockage ());
4400 }
4401
4402 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4403 absolute addresses. */
4404
4405 const char *
4406 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4407 {
4408 static int labelno = 0;
4409 char loop_lab[32];
4410 rtx xops[2];
4411
4412 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4413
4414 /* Loop. */
4415 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4416
4417 HOST_WIDE_INT stack_clash_probe_interval
4418 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4419
4420 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4421 xops[0] = reg1;
4422 HOST_WIDE_INT interval;
4423 if (flag_stack_clash_protection)
4424 interval = stack_clash_probe_interval;
4425 else
4426 interval = PROBE_INTERVAL;
4427
4428 gcc_assert (aarch64_uimm12_shift (interval));
4429 xops[1] = GEN_INT (interval);
4430
4431 output_asm_insn ("sub\t%0, %0, %1", xops);
4432
4433 /* If doing stack clash protection then we probe up by the ABI-specified
4434 amount. We do this because we're dropping full pages at a time in the
4435 loop. But for ordinary (non-stack-clash) probing, probe at offset 0. */
4436 if (flag_stack_clash_protection)
4437 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4438 else
4439 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4440
4441 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4442 by this amount for each iteration. */
4443 output_asm_insn ("str\txzr, [%0, %1]", xops);
4444
4445 /* Test if TEST_ADDR == LAST_ADDR. */
4446 xops[1] = reg2;
4447 output_asm_insn ("cmp\t%0, %1", xops);
4448
4449 /* Branch. */
4450 fputs ("\tb.ne\t", asm_out_file);
4451 assemble_name_raw (asm_out_file, loop_lab);
4452 fputc ('\n', asm_out_file);
4453
4454 return "";
4455 }
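
/* Putting the pieces above together, the non-stack-clash form of the loop
   comes out roughly as (registers and interval illustrative):

   .LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   With stack clash protection the subtraction uses the guard size from
   --param stack-clash-protection-guard-size and the store probes
   STACK_CLASH_CALLER_GUARD bytes above the new address.  */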
4456
4457 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4458 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4459 of GUARD_SIZE. Each probe is emitted at most MIN_PROBE_THRESHOLD bytes
4460 from the current BASE, and successive probes are at most
4461 MIN_PROBE_THRESHOLD bytes apart. By the end of this function
4462 BASE = BASE - ADJUSTMENT. */
4463
4464 const char *
4465 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4466 rtx min_probe_threshold, rtx guard_size)
4467 {
4468 /* This function is not allowed to use any instruction generation function
4469 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4470 so instead emit the code you want using output_asm_insn. */
4471 gcc_assert (flag_stack_clash_protection);
4472 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4473 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4474
4475 /* The minimum required allocation before the residual requires probing. */
4476 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4477
4478 /* Clamp the value down to the nearest value that can be used with a cmp. */
4479 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4480 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4481
4482 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4483 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4484
4485 static int labelno = 0;
4486 char loop_start_lab[32];
4487 char loop_end_lab[32];
4488 rtx xops[2];
4489
4490 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4491 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4492
4493 /* Emit loop start label. */
4494 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4495
4496 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4497 xops[0] = adjustment;
4498 xops[1] = probe_offset_value_rtx;
4499 output_asm_insn ("cmp\t%0, %1", xops);
4500
4501 /* Branch to end if not enough adjustment to probe. */
4502 fputs ("\tb.lt\t", asm_out_file);
4503 assemble_name_raw (asm_out_file, loop_end_lab);
4504 fputc ('\n', asm_out_file);
4505
4506 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4507 xops[0] = base;
4508 xops[1] = probe_offset_value_rtx;
4509 output_asm_insn ("sub\t%0, %0, %1", xops);
4510
4511 /* Probe at BASE. */
4512 xops[1] = const0_rtx;
4513 output_asm_insn ("str\txzr, [%0, %1]", xops);
4514
4515 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4516 xops[0] = adjustment;
4517 xops[1] = probe_offset_value_rtx;
4518 output_asm_insn ("sub\t%0, %0, %1", xops);
4519
4520 /* Branch to start if still more bytes to allocate. */
4521 fputs ("\tb\t", asm_out_file);
4522 assemble_name_raw (asm_out_file, loop_start_lab);
4523 fputc ('\n', asm_out_file);
4524
4525 /* Not enough left to need another probe; exit the loop here. */
4526 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4527
4528 /* BASE = BASE - ADJUSTMENT. */
4529 xops[0] = base;
4530 xops[1] = adjustment;
4531 output_asm_insn ("sub\t%0, %0, %1", xops);
4532 return "";
4533 }
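
/* The emitted loop therefore has the shape (registers, labels and the
   clamped guard value are illustrative):

   .SVLPSPL0:
	cmp	x15, #4096
	b.lt	.SVLPEND0
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	sub	x15, x15, #4096
	b	.SVLPSPL0
   .SVLPEND0:
	sub	x9, x9, x15
   */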
4534
4535 /* Determine whether a frame chain needs to be generated. */
4536 static bool
4537 aarch64_needs_frame_chain (void)
4538 {
4539 /* Force a frame chain for EH returns so the return address is at FP+8. */
4540 if (frame_pointer_needed || crtl->calls_eh_return)
4541 return true;
4542
4543 /* A leaf function cannot have calls or write LR. */
4544 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4545
4546 /* Don't use a frame chain in leaf functions if leaf frame pointers
4547 are disabled. */
4548 if (flag_omit_leaf_frame_pointer && is_leaf)
4549 return false;
4550
4551 return aarch64_use_frame_pointer;
4552 }
4553
4554 /* Mark the registers that need to be saved by the callee and calculate
4555 the size of the callee-saved registers area and frame record (both FP
4556 and LR may be omitted). */
4557 static void
4558 aarch64_layout_frame (void)
4559 {
4560 HOST_WIDE_INT offset = 0;
4561 int regno, last_fp_reg = INVALID_REGNUM;
4562 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4563
4564 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4565
4566 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4567 the mid-end is doing. */
4568 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4569
4570 #define SLOT_NOT_REQUIRED (-2)
4571 #define SLOT_REQUIRED (-1)
4572
4573 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4574 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4575
4576 /* If this is a non-leaf simd function with calls we assume that
4577 at least one of those calls is to a non-simd function and thus
4578 we must save V8 to V23 in the prologue. */
4579
4580 if (simd_function && !crtl->is_leaf)
4581 {
4582 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4583 if (FP_SIMD_SAVED_REGNUM_P (regno))
4584 df_set_regs_ever_live (regno, true);
4585 }
4586
4587 /* First mark all the registers that really need to be saved... */
4588 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4589 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4590
4591 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4592 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4593
4594 /* ... that includes the eh data registers (if needed)... */
4595 if (crtl->calls_eh_return)
4596 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4597 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4598 = SLOT_REQUIRED;
4599
4600 /* ... and any callee saved register that dataflow says is live. */
4601 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4602 if (df_regs_ever_live_p (regno)
4603 && (regno == R30_REGNUM
4604 || !call_used_regs[regno]))
4605 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4606
4607 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4608 if (df_regs_ever_live_p (regno)
4609 && (!call_used_regs[regno]
4610 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4611 {
4612 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4613 last_fp_reg = regno;
4614 }
4615
4616 if (cfun->machine->frame.emit_frame_chain)
4617 {
4618 /* FP and LR are placed in the linkage record. */
4619 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4620 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4621 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4622 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4623 offset = 2 * UNITS_PER_WORD;
4624 }
4625
4626 /* With stack-clash, LR must be saved in non-leaf functions. */
4627 gcc_assert (crtl->is_leaf
4628 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4629 != SLOT_NOT_REQUIRED));
4630
4631 /* Now assign stack slots for them. */
4632 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4633 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4634 {
4635 cfun->machine->frame.reg_offset[regno] = offset;
4636 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4637 cfun->machine->frame.wb_candidate1 = regno;
4638 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4639 cfun->machine->frame.wb_candidate2 = regno;
4640 offset += UNITS_PER_WORD;
4641 }
4642
4643 HOST_WIDE_INT max_int_offset = offset;
4644 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4645 bool has_align_gap = offset != max_int_offset;
4646
4647 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4648 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4649 {
4650 /* If there is an alignment gap between integer and fp callee-saves,
4651 allocate the last fp register to it if possible. */
4652 if (regno == last_fp_reg
4653 && has_align_gap
4654 && !simd_function
4655 && (offset & 8) == 0)
4656 {
4657 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4658 break;
4659 }
4660
4661 cfun->machine->frame.reg_offset[regno] = offset;
4662 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4663 cfun->machine->frame.wb_candidate1 = regno;
4664 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4665 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4666 cfun->machine->frame.wb_candidate2 = regno;
4667 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4668 }
4669
4670 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4671
4672 cfun->machine->frame.saved_regs_size = offset;
4673
4674 HOST_WIDE_INT varargs_and_saved_regs_size
4675 = offset + cfun->machine->frame.saved_varargs_size;
4676
4677 cfun->machine->frame.hard_fp_offset
4678 = aligned_upper_bound (varargs_and_saved_regs_size
4679 + get_frame_size (),
4680 STACK_BOUNDARY / BITS_PER_UNIT);
4681
4682 /* Both these values are already aligned. */
4683 gcc_assert (multiple_p (crtl->outgoing_args_size,
4684 STACK_BOUNDARY / BITS_PER_UNIT));
4685 cfun->machine->frame.frame_size
4686 = (cfun->machine->frame.hard_fp_offset
4687 + crtl->outgoing_args_size);
4688
4689 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4690
4691 cfun->machine->frame.initial_adjust = 0;
4692 cfun->machine->frame.final_adjust = 0;
4693 cfun->machine->frame.callee_adjust = 0;
4694 cfun->machine->frame.callee_offset = 0;
4695
4696 HOST_WIDE_INT max_push_offset = 0;
4697 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4698 max_push_offset = 512;
4699 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4700 max_push_offset = 256;
4701
4702 HOST_WIDE_INT const_size, const_fp_offset;
4703 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4704 && const_size < max_push_offset
4705 && known_eq (crtl->outgoing_args_size, 0))
4706 {
4707 /* Simple, small frame with no outgoing arguments:
4708 stp reg1, reg2, [sp, -frame_size]!
4709 stp reg3, reg4, [sp, 16] */
4710 cfun->machine->frame.callee_adjust = const_size;
4711 }
4712 else if (known_lt (crtl->outgoing_args_size
4713 + cfun->machine->frame.saved_regs_size, 512)
4714 && !(cfun->calls_alloca
4715 && known_lt (cfun->machine->frame.hard_fp_offset,
4716 max_push_offset)))
4717 {
4718 /* Frame with small outgoing arguments:
4719 sub sp, sp, frame_size
4720 stp reg1, reg2, [sp, outgoing_args_size]
4721 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4722 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4723 cfun->machine->frame.callee_offset
4724 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4725 }
4726 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4727 && const_fp_offset < max_push_offset)
4728 {
4729 /* Frame with large outgoing arguments but a small local area:
4730 stp reg1, reg2, [sp, -hard_fp_offset]!
4731 stp reg3, reg4, [sp, 16]
4732 sub sp, sp, outgoing_args_size */
4733 cfun->machine->frame.callee_adjust = const_fp_offset;
4734 cfun->machine->frame.final_adjust
4735 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4736 }
4737 else
4738 {
4739 /* Frame with large local area and outgoing arguments using frame pointer:
4740 sub sp, sp, hard_fp_offset
4741 stp x29, x30, [sp, 0]
4742 add x29, sp, 0
4743 stp reg3, reg4, [sp, 16]
4744 sub sp, sp, outgoing_args_size */
4745 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4746 cfun->machine->frame.final_adjust
4747 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4748 }
4749
4750 cfun->machine->frame.laid_out = true;
4751 }
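
/* A small worked example of the first case above (purely illustrative):
   with only X29 and X30 to save, no locals and no outgoing arguments,
   frame_size is 16, which is below max_push_offset, so the whole frame is
   allocated by the single write-back store

	stp	x29, x30, [sp, -16]!

   and initial_adjust, final_adjust and callee_offset all stay 0.  */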
4752
4753 /* Return true if the register REGNO is saved on entry to
4754 the current function. */
4755
4756 static bool
4757 aarch64_register_saved_on_entry (int regno)
4758 {
4759 return cfun->machine->frame.reg_offset[regno] >= 0;
4760 }
4761
4762 /* Starting from REGNO, return the next register up to LIMIT that the
4763 callee needs to save. */
4764
4765 static unsigned
4766 aarch64_next_callee_save (unsigned regno, unsigned limit)
4767 {
4768 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4769 regno ++;
4770 return regno;
4771 }
4772
4773 /* Push the register number REGNO of mode MODE to the stack with write-back
4774 adjusting the stack by ADJUSTMENT. */
4775
4776 static void
4777 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4778 HOST_WIDE_INT adjustment)
4779 {
4780 rtx base_rtx = stack_pointer_rtx;
4781 rtx insn, reg, mem;
4782
4783 reg = gen_rtx_REG (mode, regno);
4784 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4785 plus_constant (Pmode, base_rtx, -adjustment));
4786 mem = gen_frame_mem (mode, mem);
4787
4788 insn = emit_move_insn (mem, reg);
4789 RTX_FRAME_RELATED_P (insn) = 1;
4790 }
4791
4792 /* Generate and return an instruction to store the pair of registers
4793 REG and REG2 of mode MODE to location BASE with write-back adjusting
4794 the stack location BASE by ADJUSTMENT. */
4795
4796 static rtx
4797 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4798 HOST_WIDE_INT adjustment)
4799 {
4800 switch (mode)
4801 {
4802 case E_DImode:
4803 return gen_storewb_pairdi_di (base, base, reg, reg2,
4804 GEN_INT (-adjustment),
4805 GEN_INT (UNITS_PER_WORD - adjustment));
4806 case E_DFmode:
4807 return gen_storewb_pairdf_di (base, base, reg, reg2,
4808 GEN_INT (-adjustment),
4809 GEN_INT (UNITS_PER_WORD - adjustment));
4810 case E_TFmode:
4811 return gen_storewb_pairtf_di (base, base, reg, reg2,
4812 GEN_INT (-adjustment),
4813 GEN_INT (UNITS_PER_VREG - adjustment));
4814 default:
4815 gcc_unreachable ();
4816 }
4817 }
4818
4819 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4820 stack pointer by ADJUSTMENT. */
4821
4822 static void
4823 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4824 {
4825 rtx_insn *insn;
4826 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4827
4828 if (regno2 == INVALID_REGNUM)
4829 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4830
4831 rtx reg1 = gen_rtx_REG (mode, regno1);
4832 rtx reg2 = gen_rtx_REG (mode, regno2);
4833
4834 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4835 reg2, adjustment));
4836 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4837 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4838 RTX_FRAME_RELATED_P (insn) = 1;
4839 }
4840
4841 /* Load the pair of registers REG and REG2 of mode MODE from stack location
4842 BASE, adjusting it by ADJUSTMENT afterwards. */
4843
4844 static rtx
4845 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4846 HOST_WIDE_INT adjustment)
4847 {
4848 switch (mode)
4849 {
4850 case E_DImode:
4851 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4852 GEN_INT (UNITS_PER_WORD));
4853 case E_DFmode:
4854 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4855 GEN_INT (UNITS_PER_WORD));
4856 case E_TFmode:
4857 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4858 GEN_INT (UNITS_PER_VREG));
4859 default:
4860 gcc_unreachable ();
4861 }
4862 }
4863
4864 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4865 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4866 into CFI_OPS. */
4867
4868 static void
4869 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4870 rtx *cfi_ops)
4871 {
4872 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4873 rtx reg1 = gen_rtx_REG (mode, regno1);
4874
4875 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4876
4877 if (regno2 == INVALID_REGNUM)
4878 {
4879 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4880 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4881 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4882 }
4883 else
4884 {
4885 rtx reg2 = gen_rtx_REG (mode, regno2);
4886 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4887 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4888 reg2, adjustment));
4889 }
4890 }
4891
4892 /* Generate and return a store pair instruction of mode MODE to store
4893 register REG1 to MEM1 and register REG2 to MEM2. */
4894
4895 static rtx
4896 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4897 rtx reg2)
4898 {
4899 switch (mode)
4900 {
4901 case E_DImode:
4902 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4903
4904 case E_DFmode:
4905 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4906
4907 case E_TFmode:
4908 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4909
4910 default:
4911 gcc_unreachable ();
4912 }
4913 }
4914
4915 /* Generate and return a load pair instruction of mode MODE to load register
4916 REG1 from MEM1 and register REG2 from MEM2. */
4917
4918 static rtx
4919 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4920 rtx mem2)
4921 {
4922 switch (mode)
4923 {
4924 case E_DImode:
4925 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4926
4927 case E_DFmode:
4928 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4929
4930 case E_TFmode:
4931 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4932
4933 default:
4934 gcc_unreachable ();
4935 }
4936 }
4937
4938 /* Return TRUE if return address signing should be enabled for the current
4939 function, otherwise return FALSE. */
4940
4941 bool
4942 aarch64_return_address_signing_enabled (void)
4943 {
4944 /* This function should only be called after the frame is laid out. */
4945 gcc_assert (cfun->machine->frame.laid_out);
4946
4947 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4948 function if its LR is pushed onto the stack. */
4949 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4950 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4951 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4952 }
4953
4954 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4955 bool
4956 aarch64_bti_enabled (void)
4957 {
4958 return (aarch64_enable_bti == 1);
4959 }
4960
4961 /* Emit code to save the callee-saved registers from register number START
4962 to LIMIT to the stack at the location starting at offset START_OFFSET,
4963 skipping any write-back candidates if SKIP_WB is true. */
4964
4965 static void
4966 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4967 unsigned start, unsigned limit, bool skip_wb)
4968 {
4969 rtx_insn *insn;
4970 unsigned regno;
4971 unsigned regno2;
4972
4973 for (regno = aarch64_next_callee_save (start, limit);
4974 regno <= limit;
4975 regno = aarch64_next_callee_save (regno + 1, limit))
4976 {
4977 rtx reg, mem;
4978 poly_int64 offset;
4979 int offset_diff;
4980
4981 if (skip_wb
4982 && (regno == cfun->machine->frame.wb_candidate1
4983 || regno == cfun->machine->frame.wb_candidate2))
4984 continue;
4985
4986 if (cfun->machine->reg_is_wrapped_separately[regno])
4987 continue;
4988
4989 reg = gen_rtx_REG (mode, regno);
4990 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4991 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4992 offset));
4993
4994 regno2 = aarch64_next_callee_save (regno + 1, limit);
4995 offset_diff = cfun->machine->frame.reg_offset[regno2]
4996 - cfun->machine->frame.reg_offset[regno];
4997
4998 if (regno2 <= limit
4999 && !cfun->machine->reg_is_wrapped_separately[regno2]
5000 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5001 {
5002 rtx reg2 = gen_rtx_REG (mode, regno2);
5003 rtx mem2;
5004
5005 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5006 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5007 offset));
5008 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5009 reg2));
5010
5011 /* The first part of a frame-related parallel insn is
5012 always assumed to be relevant to the frame
5013 calculations; subsequent parts are only
5014 frame-related if explicitly marked. */
5015 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5016 regno = regno2;
5017 }
5018 else
5019 insn = emit_move_insn (mem, reg);
5020
5021 RTX_FRAME_RELATED_P (insn) = 1;
5022 }
5023 }
5024
5025 /* Emit code to restore the callee registers of mode MODE from register
5026 number START up to and including LIMIT. Restore from the stack offset
5027 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5028 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5029
5030 static void
5031 aarch64_restore_callee_saves (machine_mode mode,
5032 poly_int64 start_offset, unsigned start,
5033 unsigned limit, bool skip_wb, rtx *cfi_ops)
5034 {
5035 rtx base_rtx = stack_pointer_rtx;
5036 unsigned regno;
5037 unsigned regno2;
5038 poly_int64 offset;
5039
5040 for (regno = aarch64_next_callee_save (start, limit);
5041 regno <= limit;
5042 regno = aarch64_next_callee_save (regno + 1, limit))
5043 {
5044 if (cfun->machine->reg_is_wrapped_separately[regno])
5045 continue;
5046
5047 rtx reg, mem;
5048 int offset_diff;
5049
5050 if (skip_wb
5051 && (regno == cfun->machine->frame.wb_candidate1
5052 || regno == cfun->machine->frame.wb_candidate2))
5053 continue;
5054
5055 reg = gen_rtx_REG (mode, regno);
5056 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5057 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5058
5059 regno2 = aarch64_next_callee_save (regno + 1, limit);
5060 offset_diff = cfun->machine->frame.reg_offset[regno2]
5061 - cfun->machine->frame.reg_offset[regno];
5062
5063 if (regno2 <= limit
5064 && !cfun->machine->reg_is_wrapped_separately[regno2]
5065 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5066 {
5067 rtx reg2 = gen_rtx_REG (mode, regno2);
5068 rtx mem2;
5069
5070 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5071 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5072 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5073
5074 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5075 regno = regno2;
5076 }
5077 else
5078 emit_move_insn (reg, mem);
5079 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5080 }
5081 }
5082
5083 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5084 of MODE. */
5085
5086 static inline bool
5087 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5088 {
5089 HOST_WIDE_INT multiple;
5090 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5091 && IN_RANGE (multiple, -8, 7));
5092 }
5093
5094 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5095 of MODE. */
5096
5097 static inline bool
5098 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5099 {
5100 HOST_WIDE_INT multiple;
5101 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5102 && IN_RANGE (multiple, 0, 63));
5103 }
5104
5105 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5106 of MODE. */
5107
5108 bool
5109 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5110 {
5111 HOST_WIDE_INT multiple;
5112 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5113 && IN_RANGE (multiple, -64, 63));
5114 }
5115
5116 /* Return true if OFFSET is a signed 9-bit value. */
5117
5118 bool
5119 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5120 poly_int64 offset)
5121 {
5122 HOST_WIDE_INT const_offset;
5123 return (offset.is_constant (&const_offset)
5124 && IN_RANGE (const_offset, -256, 255));
5125 }
5126
5127 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5128 of MODE. */
5129
5130 static inline bool
5131 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5132 {
5133 HOST_WIDE_INT multiple;
5134 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5135 && IN_RANGE (multiple, -256, 255));
5136 }
5137
5138 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5139 of MODE. */
5140
5141 static inline bool
5142 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5143 {
5144 HOST_WIDE_INT multiple;
5145 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5146 && IN_RANGE (multiple, 0, 4095));
5147 }
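
/* For reference, with MODE == DImode (8-byte units) the predicates above
   accept byte offsets in the following ranges (multiples of 8 unless
   noted otherwise):

	offset_4bit_signed_scaled_p:		-64 .. 56
	offset_6bit_unsigned_scaled_p:		0 .. 504
	aarch64_offset_7bit_signed_scaled_p:	-512 .. 504
	aarch64_offset_9bit_signed_unscaled_p:	-256 .. 255 (any byte offset)
	offset_9bit_signed_scaled_p:		-2048 .. 2040
	offset_12bit_unsigned_scaled_p:		0 .. 32760  */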
5148
5149 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5150
5151 static sbitmap
5152 aarch64_get_separate_components (void)
5153 {
5154 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5155 bitmap_clear (components);
5156
5157 /* The registers we need saved to the frame. */
5158 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5159 if (aarch64_register_saved_on_entry (regno))
5160 {
5161 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5162 if (!frame_pointer_needed)
5163 offset += cfun->machine->frame.frame_size
5164 - cfun->machine->frame.hard_fp_offset;
5165 /* Check that we can access the stack slot of the register with one
5166 direct load with no adjustments needed. */
5167 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5168 bitmap_set_bit (components, regno);
5169 }
5170
5171 /* Don't mess with the hard frame pointer. */
5172 if (frame_pointer_needed)
5173 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5174
5175 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5176 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5177 /* If registers have been chosen to be stored/restored with
5178 writeback don't interfere with them to avoid having to output explicit
5179 stack adjustment instructions. */
5180 if (reg2 != INVALID_REGNUM)
5181 bitmap_clear_bit (components, reg2);
5182 if (reg1 != INVALID_REGNUM)
5183 bitmap_clear_bit (components, reg1);
5184
5185 bitmap_clear_bit (components, LR_REGNUM);
5186 bitmap_clear_bit (components, SP_REGNUM);
5187
5188 return components;
5189 }
5190
5191 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5192
5193 static sbitmap
5194 aarch64_components_for_bb (basic_block bb)
5195 {
5196 bitmap in = DF_LIVE_IN (bb);
5197 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5198 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5199 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5200
5201 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5202 bitmap_clear (components);
5203
5204 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5205 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5206 if ((!call_used_regs[regno]
5207 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5208 && (bitmap_bit_p (in, regno)
5209 || bitmap_bit_p (gen, regno)
5210 || bitmap_bit_p (kill, regno)))
5211 {
5212 unsigned regno2, offset, offset2;
5213 bitmap_set_bit (components, regno);
5214
5215 /* If there is a callee-save at an adjacent offset, add it as well,
5216 to increase the use of LDP/STP. */
5217 offset = cfun->machine->frame.reg_offset[regno];
5218 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5219
5220 if (regno2 <= LAST_SAVED_REGNUM)
5221 {
5222 offset2 = cfun->machine->frame.reg_offset[regno2];
5223 if ((offset & ~8) == (offset2 & ~8))
5224 bitmap_set_bit (components, regno2);
5225 }
5226 }
5227
5228 return components;
5229 }
5230
5231 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5232 Nothing to do for aarch64. */
5233
5234 static void
5235 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5236 {
5237 }
5238
5239 /* Return the next set bit in BMP from START onwards. Return the total number
5240 of bits in BMP if no set bit is found at or after START. */
5241
5242 static unsigned int
5243 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5244 {
5245 unsigned int nbits = SBITMAP_SIZE (bmp);
5246 if (start == nbits)
5247 return start;
5248
5249 gcc_assert (start < nbits);
5250 for (unsigned int i = start; i < nbits; i++)
5251 if (bitmap_bit_p (bmp, i))
5252 return i;
5253
5254 return nbits;
5255 }
5256
5257 /* Do the work for aarch64_emit_prologue_components and
5258 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5259 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5260 for these components or the epilogue sequence. That is, it determines
5261 whether we should emit stores or loads and what kind of CFA notes to attach
5262 to the insns. Otherwise the logic for the two sequences is very
5263 similar. */
5264
5265 static void
5266 aarch64_process_components (sbitmap components, bool prologue_p)
5267 {
5268 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5269 ? HARD_FRAME_POINTER_REGNUM
5270 : STACK_POINTER_REGNUM);
5271
5272 unsigned last_regno = SBITMAP_SIZE (components);
5273 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5274 rtx_insn *insn = NULL;
5275
5276 while (regno != last_regno)
5277 {
5278 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5279 so DFmode for the vector registers is enough. For simd functions
5280 we want to save the low 128 bits. */
5281 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5282
5283 rtx reg = gen_rtx_REG (mode, regno);
5284 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5285 if (!frame_pointer_needed)
5286 offset += cfun->machine->frame.frame_size
5287 - cfun->machine->frame.hard_fp_offset;
5288 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5289 rtx mem = gen_frame_mem (mode, addr);
5290
5291 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5292 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5293 /* No more registers to handle after REGNO.
5294 Emit a single save/restore and exit. */
5295 if (regno2 == last_regno)
5296 {
5297 insn = emit_insn (set);
5298 RTX_FRAME_RELATED_P (insn) = 1;
5299 if (prologue_p)
5300 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5301 else
5302 add_reg_note (insn, REG_CFA_RESTORE, reg);
5303 break;
5304 }
5305
5306 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5307 /* The next register is not of the same class or its offset is not
5308 mergeable with the current one into a pair. */
5309 if (!satisfies_constraint_Ump (mem)
5310 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5311 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5312 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5313 GET_MODE_SIZE (mode)))
5314 {
5315 insn = emit_insn (set);
5316 RTX_FRAME_RELATED_P (insn) = 1;
5317 if (prologue_p)
5318 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5319 else
5320 add_reg_note (insn, REG_CFA_RESTORE, reg);
5321
5322 regno = regno2;
5323 continue;
5324 }
5325
5326 /* REGNO2 can be saved/restored in a pair with REGNO. */
5327 rtx reg2 = gen_rtx_REG (mode, regno2);
5328 if (!frame_pointer_needed)
5329 offset2 += cfun->machine->frame.frame_size
5330 - cfun->machine->frame.hard_fp_offset;
5331 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5332 rtx mem2 = gen_frame_mem (mode, addr2);
5333 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5334 : gen_rtx_SET (reg2, mem2);
5335
5336 if (prologue_p)
5337 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5338 else
5339 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5340
5341 RTX_FRAME_RELATED_P (insn) = 1;
5342 if (prologue_p)
5343 {
5344 add_reg_note (insn, REG_CFA_OFFSET, set);
5345 add_reg_note (insn, REG_CFA_OFFSET, set2);
5346 }
5347 else
5348 {
5349 add_reg_note (insn, REG_CFA_RESTORE, reg);
5350 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5351 }
5352
5353 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5354 }
5355 }
5356
5357 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5358
5359 static void
5360 aarch64_emit_prologue_components (sbitmap components)
5361 {
5362 aarch64_process_components (components, true);
5363 }
5364
5365 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5366
5367 static void
5368 aarch64_emit_epilogue_components (sbitmap components)
5369 {
5370 aarch64_process_components (components, false);
5371 }
5372
5373 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5374
5375 static void
5376 aarch64_set_handled_components (sbitmap components)
5377 {
5378 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5379 if (bitmap_bit_p (components, regno))
5380 cfun->machine->reg_is_wrapped_separately[regno] = true;
5381 }
5382
5383 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5384 determine the probe offset for alloca. */
5385
5386 static HOST_WIDE_INT
5387 aarch64_stack_clash_protection_alloca_probe_range (void)
5388 {
5389 return STACK_CLASH_CALLER_GUARD;
5390 }
5391
5392
5393 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5394 registers. If POLY_SIZE is not large enough to require a probe this function
5395 will only adjust the stack. When allocating the stack space,
5396 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5397 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5398 arguments. If we are, then we ensure that any allocation larger than the
5399 ABI-defined buffer needs a probe, so that the invariant of having a 1KB
5400 buffer is maintained.
5401
5402 We emit barriers after each stack adjustment to prevent optimizations from
5403 breaking the invariant that we never drop the stack pointer by more than a
5404 page. This invariant makes it easier to handle asynchronous events
5405 correctly: if we allowed the stack to be dropped by more than a page and
5406 only probed the intervening pages afterwards, then a signal taken in
5407 between would leave the handler not knowing which pages had already been
5408 probed, and it could make no assumptions about the state of the stack. */
5409
5410 static void
5411 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5412 poly_int64 poly_size,
5413 bool frame_related_p,
5414 bool final_adjustment_p)
5415 {
5416 HOST_WIDE_INT guard_size
5417 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5418 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5419 /* When doing the final adjustment for the outgoing argument size we can't
5420 assume that LR was saved at position 0. So subtract its offset from the
5421 ABI safe buffer so that we don't accidentally allow an adjustment that
5422 would result in an allocation larger than the ABI buffer without
5423 probing. */
5424 HOST_WIDE_INT min_probe_threshold
5425 = final_adjustment_p
5426 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5427 : guard_size - guard_used_by_caller;
5428
5429 poly_int64 frame_size = cfun->machine->frame.frame_size;
5430
5431 /* We should always have a positive probe threshold. */
5432 gcc_assert (min_probe_threshold > 0);
5433
5434 if (flag_stack_clash_protection && !final_adjustment_p)
5435 {
5436 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5437 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5438
5439 if (known_eq (frame_size, 0))
5440 {
5441 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5442 }
5443 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5444 && known_lt (final_adjust, guard_used_by_caller))
5445 {
5446 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5447 }
5448 }
5449
5450 /* If SIZE is not large enough to require probing, just adjust the stack and
5451 exit. */
5452 if (known_lt (poly_size, min_probe_threshold)
5453 || !flag_stack_clash_protection)
5454 {
5455 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5456 return;
5457 }
5458
5459 HOST_WIDE_INT size;
5460 /* Handle the SVE non-constant case first. */
5461 if (!poly_size.is_constant (&size))
5462 {
5463 if (dump_file)
5464 {
5465 fprintf (dump_file, "Stack clash SVE prologue: ");
5466 print_dec (poly_size, dump_file);
5467 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5468 }
5469
5470 /* First calculate the amount of bytes we're actually spilling. */
5471 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5472 poly_size, temp1, temp2, false, true);
5473
5474 rtx_insn *insn = get_last_insn ();
5475
5476 if (frame_related_p)
5477 {
5478 /* This is done to provide unwinding information for the stack
5479 adjustments we're about to do, however to prevent the optimizers
5480 from removing the R11 move and leaving the CFA note (which would be
5481 very wrong) we tie the old and new stack pointer together.
5482 The tie will expand to nothing but the optimizers will not touch
5483 the instruction. */
5484 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5485 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5486 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5487
5488 /* We want the CFA independent of the stack pointer for the
5489 duration of the loop. */
5490 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5491 RTX_FRAME_RELATED_P (insn) = 1;
5492 }
5493
5494 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5495 rtx guard_const = gen_int_mode (guard_size, Pmode);
5496
5497 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5498 stack_pointer_rtx, temp1,
5499 probe_const, guard_const));
5500
5501 /* Now reset the CFA register if needed. */
5502 if (frame_related_p)
5503 {
5504 add_reg_note (insn, REG_CFA_DEF_CFA,
5505 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5506 gen_int_mode (poly_size, Pmode)));
5507 RTX_FRAME_RELATED_P (insn) = 1;
5508 }
5509
5510 return;
5511 }
5512
5513 if (dump_file)
5514 fprintf (dump_file,
5515 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5516 " bytes, probing will be required.\n", size);
5517
5518 /* Round size down to a multiple of guard_size, and calculate the
5519 residual as the difference between the original size and the rounded
5520 size. */
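  /* Worked example (numbers are illustrative only): with the default 64kB
     guard (guard_size == 65536) and size == 200000, rounded_size is 196608
     (3 * 65536) and residual is 3392.  Assuming STACK_CLASH_MAX_UNROLL_PAGES
     is 4, the inline path below emits three 64kB adjustments, each followed
     by a probe at guard_used_by_caller, and the 3392 byte residual is then
     handled by the residual code at the end of this function.  */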
5521 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5522 HOST_WIDE_INT residual = size - rounded_size;
5523
5524 /* We can handle a small number of allocations/probes inline. Otherwise
5525 punt to a loop. */
5526 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5527 {
5528 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5529 {
5530 aarch64_sub_sp (NULL, temp2, guard_size, true);
5531 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5532 guard_used_by_caller));
5533 emit_insn (gen_blockage ());
5534 }
5535 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5536 }
5537 else
5538 {
5539 /* Compute the ending address. */
5540 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5541 temp1, NULL, false, true);
5542 rtx_insn *insn = get_last_insn ();
5543
5544 /* For the initial allocation, we don't have a frame pointer
5545 set up, so we always need CFI notes. If we're doing the
5546 final allocation, then we may have a frame pointer, in which
5547 case it is the CFA, otherwise we need CFI notes.
5548
5549 We can determine which allocation we are doing by looking at
5550 the value of FRAME_RELATED_P since the final allocations are not
5551 frame related. */
5552 if (frame_related_p)
5553 {
5554 /* We want the CFA independent of the stack pointer for the
5555 duration of the loop. */
5556 add_reg_note (insn, REG_CFA_DEF_CFA,
5557 plus_constant (Pmode, temp1, rounded_size));
5558 RTX_FRAME_RELATED_P (insn) = 1;
5559 }
5560
5561 /* This allocates and probes the stack. Note that this re-uses some of
5562 the existing Ada stack protection code. However we are guaranteed not
5563 to enter the non-loop or residual branches of that code.
5564
5565 The non-loop part won't be entered because if our allocation amount
5566 doesn't require a loop, the case above would handle it.
5567
5568 The residual amount won't be entered because TEMP1 is a multiple of
5569 the allocation size. The residual will always be 0. As such, the only
5570 part we are actually using from that code is the loop setup. The
5571 actual probing is done in aarch64_output_probe_stack_range. */
5572 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5573 stack_pointer_rtx, temp1));
5574
5575 /* Now reset the CFA register if needed. */
5576 if (frame_related_p)
5577 {
5578 add_reg_note (insn, REG_CFA_DEF_CFA,
5579 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5580 RTX_FRAME_RELATED_P (insn) = 1;
5581 }
5582
5583 emit_insn (gen_blockage ());
5584 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5585 }
5586
5587 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5588 be probed. This maintains the requirement that each page is probed at
5589 least once. For the initial probing we probe only if the allocation is
5590 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5591 if the amount is larger than buffer; GUARD_SIZE - buffer + buffer ==
5592 GUARD_SIZE. The upshot is that any allocation large enough to
5593 trigger a probe here gets at least one, and any allocation too small
5594 for this code to emit anything will have had its page probed already
5595 by the saving of FP/LR, either in this function or in a callee. If
5596 we don't have any callees then we won't have more stack adjustments and so
5597 are still safe. */
5598 if (residual)
5599 {
5600 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5601 /* If we're doing final adjustments, and we've done any full page
5602 allocations then any residual needs to be probed. */
5603 if (final_adjustment_p && rounded_size != 0)
5604 min_probe_threshold = 0;
5605 /* If doing a small final adjustment, we always probe at offset 0.
5606 This is done to avoid issues when LR is not at position 0 or when
5607 the final adjustment is smaller than the probing offset. */
5608 else if (final_adjustment_p && rounded_size == 0)
5609 residual_probe_offset = 0;
5610
5611 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5612 if (residual >= min_probe_threshold)
5613 {
5614 if (dump_file)
5615 fprintf (dump_file,
5616 "Stack clash AArch64 prologue residuals: "
5617 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5618 "\n", residual);
5619
5620 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5621 residual_probe_offset));
5622 emit_insn (gen_blockage ());
5623 }
5624 }
5625 }
5626
5627 /* Return 1 if the register is used by the epilogue. We need to say the
5628 return register is used, but only after epilogue generation is complete.
5629 Note that in the case of sibcalls, the values "used by the epilogue" are
5630 considered live at the start of the called function.
5631
5632 For SIMD functions we need to return 1 for FP registers that are saved and
5633 restored by a function but are not zero in call_used_regs. If we do not do
5634 this, optimizations may remove the restore of the register. */
5635
5636 int
5637 aarch64_epilogue_uses (int regno)
5638 {
5639 if (epilogue_completed)
5640 {
5641 if (regno == LR_REGNUM)
5642 return 1;
5643 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5644 return 1;
5645 }
5646 return 0;
5647 }
5648
5649 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5650 is saved at BASE + OFFSET. */
5651
5652 static void
5653 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5654 rtx base, poly_int64 offset)
5655 {
5656 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5657 add_reg_note (insn, REG_CFA_EXPRESSION,
5658 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5659 }
5660
5661 /* AArch64 stack frames generated by this compiler look like:
5662
5663 +-------------------------------+
5664 | |
5665 | incoming stack arguments |
5666 | |
5667 +-------------------------------+
5668 | | <-- incoming stack pointer (aligned)
5669 | callee-allocated save area |
5670 | for register varargs |
5671 | |
5672 +-------------------------------+
5673 | local variables | <-- frame_pointer_rtx
5674 | |
5675 +-------------------------------+
5676 | padding | \
5677 +-------------------------------+ |
5678 | callee-saved registers | | frame.saved_regs_size
5679 +-------------------------------+ |
5680 | LR' | |
5681 +-------------------------------+ |
5682 | FP' | / <- hard_frame_pointer_rtx (aligned)
5683 +-------------------------------+
5684 | dynamic allocation |
5685 +-------------------------------+
5686 | padding |
5687 +-------------------------------+
5688 | outgoing stack arguments | <-- arg_pointer
5689 | |
5690 +-------------------------------+
5691 | | <-- stack_pointer_rtx (aligned)
5692
5693 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5694 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5695 unchanged.
5696
5697 By default for stack-clash we assume the guard is at least 64KB, but this
5698 value is configurable to either 4KB or 64KB. We also force the guard size to
5699 be the same as the probing interval and both values are kept in sync.
5700
5701 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5702 on the guard size) of stack space without probing.
5703
5704 When probing is needed, we emit a probe at the start of the prologue
5705 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5706
5707 We have to track how much space has been allocated and the only stores
5708 to the stack we track as implicit probes are the FP/LR stores.
5709
5710 For outgoing arguments we probe if the size is larger than 1KB, such that
5711 the ABI specified buffer is maintained for the next callee.
5712
5713 The following registers are reserved during frame layout and should not be
5714 used for any other purpose:
5715
5716 - r11: Used by stack clash protection when SVE is enabled.
5717 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5718 - r14 and r15: Used for speculation tracking.
5719 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5720 - r30(LR), r29(FP): Used by standard frame layout.
5721
5722 These registers must be avoided in frame layout related code unless the
5723 explicit intention is to interact with one of the features listed above. */
5724
5725 /* Generate the prologue instructions for entry into a function.
5726 Establish the stack frame by decreasing the stack pointer with a
5727 properly calculated size and, if necessary, create a frame record
5728 filled with the values of LR and previous frame pointer. The
5729 current FP is also set up if it is in use. */
5730
5731 void
5732 aarch64_expand_prologue (void)
5733 {
5734 poly_int64 frame_size = cfun->machine->frame.frame_size;
5735 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5736 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5737 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5738 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5739 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5740 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5741 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5742 rtx_insn *insn;
5743
5744 /* Sign return address for functions. */
5745 if (aarch64_return_address_signing_enabled ())
5746 {
5747 switch (aarch64_ra_sign_key)
5748 {
5749 case AARCH64_KEY_A:
5750 insn = emit_insn (gen_paciasp ());
5751 break;
5752 case AARCH64_KEY_B:
5753 insn = emit_insn (gen_pacibsp ());
5754 break;
5755 default:
5756 gcc_unreachable ();
5757 }
5758 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5759 RTX_FRAME_RELATED_P (insn) = 1;
5760 }
5761
5762 if (flag_stack_usage_info)
5763 current_function_static_stack_size = constant_lower_bound (frame_size);
5764
5765 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5766 {
5767 if (crtl->is_leaf && !cfun->calls_alloca)
5768 {
5769 if (maybe_gt (frame_size, PROBE_INTERVAL)
5770 && maybe_gt (frame_size, get_stack_check_protect ()))
5771 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5772 (frame_size
5773 - get_stack_check_protect ()));
5774 }
5775 else if (maybe_gt (frame_size, 0))
5776 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5777 }
5778
5779 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5780 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5781
5782 /* In theory we should never have both an initial adjustment
5783 and a callee save adjustment. Verify that is the case since the
5784 code below does not handle it for -fstack-clash-protection. */
5785 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5786
5787 /* Will only probe if the initial adjustment is larger than the guard
5788 less the amount of the guard reserved for use by the caller's
5789 outgoing args. */
5790 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5791 true, false);
5792
5793 if (callee_adjust != 0)
5794 aarch64_push_regs (reg1, reg2, callee_adjust);
5795
5796 if (emit_frame_chain)
5797 {
5798 poly_int64 reg_offset = callee_adjust;
5799 if (callee_adjust == 0)
5800 {
5801 reg1 = R29_REGNUM;
5802 reg2 = R30_REGNUM;
5803 reg_offset = callee_offset;
5804 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5805 }
5806 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5807 stack_pointer_rtx, callee_offset,
5808 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5809 if (frame_pointer_needed && !frame_size.is_constant ())
5810 {
5811 /* Variable-sized frames need to describe the save slot
5812 address using DW_CFA_expression rather than DW_CFA_offset.
5813 This means that, without taking further action, the
5814 locations of the registers that we've already saved would
5815 remain based on the stack pointer even after we redefine
5816 the CFA based on the frame pointer. We therefore need new
5817 DW_CFA_expressions to re-express the save slots with addresses
5818 based on the frame pointer. */
5819 rtx_insn *insn = get_last_insn ();
5820 gcc_assert (RTX_FRAME_RELATED_P (insn));
5821
5822 /* Add an explicit CFA definition if this was previously
5823 implicit. */
5824 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5825 {
5826 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5827 callee_offset);
5828 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5829 gen_rtx_SET (hard_frame_pointer_rtx, src));
5830 }
5831
5832 /* Change the save slot expressions for the registers that
5833 we've already saved. */
5834 reg_offset -= callee_offset;
5835 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5836 reg_offset + UNITS_PER_WORD);
5837 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5838 reg_offset);
5839 }
5840 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5841 }
5842
5843 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5844 callee_adjust != 0 || emit_frame_chain);
5845 if (aarch64_simd_decl_p (cfun->decl))
5846 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5847 callee_adjust != 0 || emit_frame_chain);
5848 else
5849 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5850 callee_adjust != 0 || emit_frame_chain);
5851
5852 /* We may need to probe the final adjustment if it is larger than the guard
5853 that is assumed by the callee. */
5854 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5855 !frame_pointer_needed, true);
5856 }
5857
5858 /* Return TRUE if we can use a simple_return insn.
5859
5860 This function checks whether the callee-saved stack is empty, which
5861 means no restore actions are needed. The pro_and_epilogue pass will use
5862 this to check whether the shrink-wrapping optimization is feasible. */
5863
5864 bool
5865 aarch64_use_return_insn_p (void)
5866 {
5867 if (!reload_completed)
5868 return false;
5869
5870 if (crtl->profile)
5871 return false;
5872
5873 return known_eq (cfun->machine->frame.frame_size, 0);
5874 }
5875
5876 /* Return false for non-leaf SIMD functions in order to avoid
5877 shrink-wrapping them, since shrink-wrapping would lose the necessary
5878 save/restore of the FP registers. */
5879
5880 bool
5881 aarch64_use_simple_return_insn_p (void)
5882 {
5883 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5884 return false;
5885
5886 return true;
5887 }
5888
5889 /* Generate the epilogue instructions for returning from a function.
5890 This is almost exactly the reverse of the prologue sequence, except
5891 that we need to insert barriers to avoid scheduling loads that read
5892 from a deallocated stack, and we optimize the unwind records by
5893 emitting them all together if possible. */
5894 void
5895 aarch64_expand_epilogue (bool for_sibcall)
5896 {
5897 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5898 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5899 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5900 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5901 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5902 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5903 rtx cfi_ops = NULL;
5904 rtx_insn *insn;
5905 /* A stack clash protection prologue may not have left EP0_REGNUM or
5906 EP1_REGNUM in a usable state. The same is true for allocations
5907 with an SVE component, since we then need both temporary registers
5908 for each allocation. For stack clash we are in a usable state if
5909 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5910 HOST_WIDE_INT guard_size
5911 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5912 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5913
5914 /* We can re-use the registers when the allocation amount is smaller than
5915 guard_size - guard_used_by_caller because we won't be doing any probes
5916 then. In such situations the register should remain live with the correct
5917 value. */
5918 bool can_inherit_p = (initial_adjust.is_constant ()
5919 && final_adjust.is_constant ())
5920 && (!flag_stack_clash_protection
5921 || known_lt (initial_adjust,
5922 guard_size - guard_used_by_caller));
5923
5924 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5925 bool need_barrier_p
5926 = maybe_ne (get_frame_size ()
5927 + cfun->machine->frame.saved_varargs_size, 0);
5928
5929 /* Emit a barrier to prevent loads from a deallocated stack. */
5930 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5931 || cfun->calls_alloca
5932 || crtl->calls_eh_return)
5933 {
5934 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5935 need_barrier_p = false;
5936 }
5937
5938 /* Restore the stack pointer from the frame pointer if it may not
5939 be the same as the stack pointer. */
5940 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5941 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5942 if (frame_pointer_needed
5943 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5944 /* If writeback is used when restoring callee-saves, the CFA
5945 is restored on the instruction doing the writeback. */
5946 aarch64_add_offset (Pmode, stack_pointer_rtx,
5947 hard_frame_pointer_rtx, -callee_offset,
5948 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5949 else
5950 /* The case where we need to re-use the register here is very rare, so
5951 avoid the complicated condition and just always emit a move if the
5952 immediate doesn't fit. */
5953 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5954
5955 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5956 callee_adjust != 0, &cfi_ops);
5957 if (aarch64_simd_decl_p (cfun->decl))
5958 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5959 callee_adjust != 0, &cfi_ops);
5960 else
5961 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5962 callee_adjust != 0, &cfi_ops);
5963
5964 if (need_barrier_p)
5965 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5966
5967 if (callee_adjust != 0)
5968 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5969
5970 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5971 {
5972 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5973 insn = get_last_insn ();
5974 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5975 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5976 RTX_FRAME_RELATED_P (insn) = 1;
5977 cfi_ops = NULL;
5978 }
5979
5980 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5981 restrict the emit_move optimization to leaf functions. */
5982 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5983 (!can_inherit_p || !crtl->is_leaf
5984 || df_regs_ever_live_p (EP0_REGNUM)));
5985
5986 if (cfi_ops)
5987 {
5988 /* Emit delayed restores and reset the CFA to be SP. */
5989 insn = get_last_insn ();
5990 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5991 REG_NOTES (insn) = cfi_ops;
5992 RTX_FRAME_RELATED_P (insn) = 1;
5993 }
5994
5995 /* We prefer to emit the combined return/authenticate instruction RETAA,
5996 however there are three cases in which we must instead emit an explicit
5997 authentication instruction.
5998
5999 1) Sibcalls don't return in a normal way, so if we're about to call one
6000 we must authenticate.
6001
6002 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6003 generating code for !TARGET_ARMV8_3 we can't use it and must
6004 explicitly authenticate.
6005
6006 3) On an eh_return path we make extra stack adjustments to update the
6007 canonical frame address to be the exception handler's CFA. We want
6008 to authenticate using the CFA of the function which calls eh_return.
6009 */
6010 if (aarch64_return_address_signing_enabled ()
6011 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6012 {
6013 switch (aarch64_ra_sign_key)
6014 {
6015 case AARCH64_KEY_A:
6016 insn = emit_insn (gen_autiasp ());
6017 break;
6018 case AARCH64_KEY_B:
6019 insn = emit_insn (gen_autibsp ());
6020 break;
6021 default:
6022 gcc_unreachable ();
6023 }
6024 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6025 RTX_FRAME_RELATED_P (insn) = 1;
6026 }
6027
6028 /* Stack adjustment for exception handler. */
6029 if (crtl->calls_eh_return && !for_sibcall)
6030 {
6031 /* We need to unwind the stack by the offset computed by
6032 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6033 to be SP; letting the CFA move during this adjustment
6034 is just as correct as retaining the CFA from the body
6035 of the function. Therefore, do nothing special. */
6036 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6037 }
6038
6039 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6040 if (!for_sibcall)
6041 emit_jump_insn (ret_rtx);
6042 }
6043
6044 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6045 normally or return to a previous frame after unwinding.
6046
6047 An EH return uses a single shared return sequence. The epilogue is
6048 exactly like a normal epilogue except that it has an extra input
6049 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6050 that must be applied after the frame has been destroyed. An extra label
6051 is inserted before the epilogue which initializes this register to zero,
6052 and this is the entry point for a normal return.
6053
6054 An actual EH return updates the return address, initializes the stack
6055 adjustment and jumps directly into the epilogue (bypassing the zeroing
6056 of the adjustment). Since the return address is typically saved on the
6057 stack when a function makes a call, the saved LR must be updated outside
6058 the epilogue.
6059
6060 This poses problems as the store is generated well before the epilogue,
6061 so the offset of LR is not known yet. Also optimizations will remove the
6062 store as it appears dead, even after the epilogue is generated (as the
6063 base or offset for loading LR is different in many cases).
6064
6065 To avoid these problems this implementation forces the frame pointer
6066 in eh_return functions so that the location of LR is fixed and known early.
6067 It also marks the store volatile, so no optimization is permitted to
6068 remove the store. */
6069 rtx
6070 aarch64_eh_return_handler_rtx (void)
6071 {
6072 rtx tmp = gen_frame_mem (Pmode,
6073 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6074
6075 /* Mark the store volatile, so no optimization is permitted to remove it. */
6076 MEM_VOLATILE_P (tmp) = true;
6077 return tmp;
6078 }
6079
6080 /* Output code to add DELTA to the first argument, and then jump
6081 to FUNCTION. Used for C++ multiple inheritance. */
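/* For example (output is illustrative, not taken from a particular dump):
   with DELTA == 16 and VCALL_OFFSET == 0 the thunk reduces to roughly

	add	x0, x0, 16
	b	<function>

   while a nonzero VCALL_OFFSET additionally loads the vtable pointer from
   *this and the stored adjustment from the vtable before the tail call.  */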
6082 static void
6083 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6084 HOST_WIDE_INT delta,
6085 HOST_WIDE_INT vcall_offset,
6086 tree function)
6087 {
6088 /* The this pointer is always in x0. Note that this differs from
6089 Arm where the this pointer may be bumped to r1 if r0 is required
6090 to return a pointer to an aggregate. On AArch64 a result value
6091 pointer will be in x8. */
6092 int this_regno = R0_REGNUM;
6093 rtx this_rtx, temp0, temp1, addr, funexp;
6094 rtx_insn *insn;
6095 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6096
6097 if (aarch64_bti_enabled ())
6098 emit_insn (gen_bti_c ());
6099
6100 reload_completed = 1;
6101 emit_note (NOTE_INSN_PROLOGUE_END);
6102
6103 this_rtx = gen_rtx_REG (Pmode, this_regno);
6104 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6105 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6106
6107 if (vcall_offset == 0)
6108 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6109 else
6110 {
6111 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6112
6113 addr = this_rtx;
6114 if (delta != 0)
6115 {
6116 if (delta >= -256 && delta < 256)
6117 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6118 plus_constant (Pmode, this_rtx, delta));
6119 else
6120 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6121 temp1, temp0, false);
6122 }
6123
6124 if (Pmode == ptr_mode)
6125 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6126 else
6127 aarch64_emit_move (temp0,
6128 gen_rtx_ZERO_EXTEND (Pmode,
6129 gen_rtx_MEM (ptr_mode, addr)));
6130
6131 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6132 addr = plus_constant (Pmode, temp0, vcall_offset);
6133 else
6134 {
6135 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6136 Pmode);
6137 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6138 }
6139
6140 if (Pmode == ptr_mode)
6141 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6142 else
6143 aarch64_emit_move (temp1,
6144 gen_rtx_SIGN_EXTEND (Pmode,
6145 gen_rtx_MEM (ptr_mode, addr)));
6146
6147 emit_insn (gen_add2_insn (this_rtx, temp1));
6148 }
6149
6150 /* Generate a tail call to the target function. */
6151 if (!TREE_USED (function))
6152 {
6153 assemble_external (function);
6154 TREE_USED (function) = 1;
6155 }
6156 funexp = XEXP (DECL_RTL (function), 0);
6157 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6158 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6159 SIBLING_CALL_P (insn) = 1;
6160
6161 insn = get_insns ();
6162 shorten_branches (insn);
6163
6164 assemble_start_function (thunk, fnname);
6165 final_start_function (insn, file, 1);
6166 final (insn, file, 1);
6167 final_end_function ();
6168 assemble_end_function (thunk, fnname);
6169
6170 /* Stop pretending to be a post-reload pass. */
6171 reload_completed = 0;
6172 }
6173
6174 static bool
6175 aarch64_tls_referenced_p (rtx x)
6176 {
6177 if (!TARGET_HAVE_TLS)
6178 return false;
6179 subrtx_iterator::array_type array;
6180 FOR_EACH_SUBRTX (iter, array, x, ALL)
6181 {
6182 const_rtx x = *iter;
6183 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6184 return true;
6185 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6186 TLS offsets, not real symbol references. */
6187 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6188 iter.skip_subrtxes ();
6189 }
6190 return false;
6191 }
6192
6193
6194 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6195 a left shift of 0 or 12 bits. */
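/* For example, 0xabc and 0xabc000 are accepted (shift 0 and shift 12
   respectively), while 0xabc001 and 0x1000000 are rejected, since neither
   can be expressed as a 12-bit value shifted left by 0 or 12.  */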
6196 bool
6197 aarch64_uimm12_shift (HOST_WIDE_INT val)
6198 {
6199 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6200 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6201 );
6202 }
6203
6204 /* Return VAL rounded down to the nearest value that will fit as a 12-bit
6205 unsigned immediate with a left shift of 0 or 12. VAL must fit in 24 bits. */
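/* For example, 0x456 is returned unchanged, while 0x123456 is clamped to
   0x123000: the low 12 bits are dropped so that the result fits as a
   12-bit immediate shifted left by 12.  */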
6206 static HOST_WIDE_INT
6207 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6208 {
6209 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6210 handle correctly. */
6211 gcc_assert ((val & 0xffffff) == val);
6212
6213 if (((val & 0xfff) << 0) == val)
6214 return val;
6215
6216 return val & (0xfff << 12);
6217 }
6218
6219 /* Return true if val is an immediate that can be loaded into a
6220 register by a MOVZ instruction. */
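/* For example, in DImode 0xffff0000 and 0x0000123400000000 are accepted
   (a single 16-bit chunk at shift 16 and at shift 32 respectively),
   whereas 0x12345678 is rejected because it spans two 16-bit chunks.  */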
6221 static bool
6222 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6223 {
6224 if (GET_MODE_SIZE (mode) > 4)
6225 {
6226 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6227 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6228 return 1;
6229 }
6230 else
6231 {
6232 /* Ignore sign extension. */
6233 val &= (HOST_WIDE_INT) 0xffffffff;
6234 }
6235 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6236 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6237 }
6238
6239 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6240 64-bit (DImode) integer. */
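/* For example, replicating the HImode value 0x1234 gives
   0x1234123412341234, and replicating the QImode value 0xab gives
   0xabababababababab.  */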
6241
6242 static unsigned HOST_WIDE_INT
6243 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6244 {
6245 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6246 while (size < 64)
6247 {
6248 val &= (HOST_WIDE_INT_1U << size) - 1;
6249 val |= val << size;
6250 size *= 2;
6251 }
6252 return val;
6253 }
6254
6255 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6256
6257 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6258 {
6259 0x0000000100000001ull,
6260 0x0001000100010001ull,
6261 0x0101010101010101ull,
6262 0x1111111111111111ull,
6263 0x5555555555555555ull,
6264 };
6265
6266
6267 /* Return true if val is a valid bitmask immediate. */
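/* Worked examples: 0x3ffc (a single contiguous run of ones) is accepted by
   the quick test below.  0x0f0f0f0f0f0f0f0f (a 4-bit run repeated every
   8 bits) is accepted by the general test: after inverting (the value
   starts with a one bit) the first two runs of ones start at bits 4 and
   12, so bits == 8 and mask == 0xf0, and 0xf0 * 0x0101010101010101
   reproduces the inverted value.  0x12345678 is rejected because it is
   not a repetition of a single contiguous run of ones.  */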
6268
6269 bool
6270 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6271 {
6272 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6273 int bits;
6274
6275 /* Check for a single sequence of one bits and return quickly if so.
6276 The special cases of all ones and all zeroes return false. */
6277 val = aarch64_replicate_bitmask_imm (val_in, mode);
6278 tmp = val + (val & -val);
6279
6280 if (tmp == (tmp & -tmp))
6281 return (val + 1) > 1;
6282
6283 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6284 if (mode == SImode)
6285 val = (val << 32) | (val & 0xffffffff);
6286
6287 /* Invert if the immediate doesn't start with a zero bit - this means we
6288 only need to search for sequences of one bits. */
6289 if (val & 1)
6290 val = ~val;
6291
6292 /* Find the first set bit and set tmp to val with the first sequence of one
6293 bits removed. Return success if there is a single sequence of ones. */
6294 first_one = val & -val;
6295 tmp = val & (val + first_one);
6296
6297 if (tmp == 0)
6298 return true;
6299
6300 /* Find the next set bit and compute the difference in bit position. */
6301 next_one = tmp & -tmp;
6302 bits = clz_hwi (first_one) - clz_hwi (next_one);
6303 mask = val ^ tmp;
6304
6305 /* Check the bit position difference is a power of 2, and that the first
6306 sequence of one bits fits within 'bits' bits. */
6307 if ((mask >> bits) != 0 || bits != (bits & -bits))
6308 return false;
6309
6310 /* Check the sequence of one bits is repeated 64/bits times. */
6311 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6312 }
6313
6314 /* Create a mask of ones covering the range from the lowest to the highest
6315 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6316
6317 unsigned HOST_WIDE_INT
6318 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6319 {
6320 int lowest_bit_set = ctz_hwi (val_in);
6321 int highest_bit_set = floor_log2 (val_in);
6322 gcc_assert (val_in != 0);
6323
6324 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6325 (HOST_WIDE_INT_1U << lowest_bit_set));
6326 }
6327
6328 /* Create a constant in which all bits outside the range from the lowest to
6329 the highest bit set in VAL_IN are set to 1. */
6330
6331 unsigned HOST_WIDE_INT
6332 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6333 {
6334 return val_in | ~aarch64_and_split_imm1 (val_in);
6335 }
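/* Worked example (DImode, the value is chosen for illustration): for
   VAL_IN == 0xff00ff, aarch64_and_split_imm1 returns 0xffffff (the span
   from bit 0 to bit 23) and aarch64_and_split_imm2 returns
   0xffffffffffff00ff.  Both are valid bitmask immediates, and since
   0xffffff & 0xffffffffffff00ff == 0xff00ff, an AND with 0xff00ff can be
   performed as two ANDs with bitmask immediates.  */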
6336
6337 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6338
6339 bool
6340 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6341 {
6342 scalar_int_mode int_mode;
6343 if (!is_a <scalar_int_mode> (mode, &int_mode))
6344 return false;
6345
6346 if (aarch64_bitmask_imm (val_in, int_mode))
6347 return false;
6348
6349 if (aarch64_move_imm (val_in, int_mode))
6350 return false;
6351
6352 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6353
6354 return aarch64_bitmask_imm (imm2, int_mode);
6355 }
6356
6357 /* Return true if val is an immediate that can be loaded into a
6358 register in a single instruction. */
6359 bool
6360 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6361 {
6362 scalar_int_mode int_mode;
6363 if (!is_a <scalar_int_mode> (mode, &int_mode))
6364 return false;
6365
6366 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6367 return 1;
6368 return aarch64_bitmask_imm (val, int_mode);
6369 }
6370
6371 static bool
6372 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6373 {
6374 rtx base, offset;
6375
6376 if (GET_CODE (x) == HIGH)
6377 return true;
6378
6379 /* There's no way to calculate VL-based values using relocations. */
6380 subrtx_iterator::array_type array;
6381 FOR_EACH_SUBRTX (iter, array, x, ALL)
6382 if (GET_CODE (*iter) == CONST_POLY_INT)
6383 return true;
6384
6385 split_const (x, &base, &offset);
6386 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6387 {
6388 if (aarch64_classify_symbol (base, INTVAL (offset))
6389 != SYMBOL_FORCE_TO_MEM)
6390 return true;
6391 else
6392 /* Avoid generating a 64-bit relocation in ILP32; leave
6393 to aarch64_expand_mov_immediate to handle it properly. */
6394 return mode != ptr_mode;
6395 }
6396
6397 return aarch64_tls_referenced_p (x);
6398 }
6399
6400 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6401 The expansion for a table switch is quite expensive due to the number
6402 of instructions, the table lookup and the hard-to-predict indirect jump.
6403 When optimizing for speed at -O3 or higher, use the per-core tuning if
6404 set, otherwise use tables for more than 16 cases as a tradeoff between
6405 size and performance. When optimizing for size, use the default setting. */
6406
6407 static unsigned int
6408 aarch64_case_values_threshold (void)
6409 {
6410 /* Use the specified limit for the number of cases before using jump
6411 tables at higher optimization levels. */
6412 if (optimize > 2
6413 && selected_cpu->tune->max_case_values != 0)
6414 return selected_cpu->tune->max_case_values;
6415 else
6416 return optimize_size ? default_case_values_threshold () : 17;
6417 }
6418
6419 /* Return true if register REGNO is a valid index register.
6420 STRICT_P is true if REG_OK_STRICT is in effect. */
6421
6422 bool
6423 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6424 {
6425 if (!HARD_REGISTER_NUM_P (regno))
6426 {
6427 if (!strict_p)
6428 return true;
6429
6430 if (!reg_renumber)
6431 return false;
6432
6433 regno = reg_renumber[regno];
6434 }
6435 return GP_REGNUM_P (regno);
6436 }
6437
6438 /* Return true if register REGNO is a valid base register.
6439 STRICT_P is true if REG_OK_STRICT is in effect. */
6440
6441 bool
6442 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6443 {
6444 if (!HARD_REGISTER_NUM_P (regno))
6445 {
6446 if (!strict_p)
6447 return true;
6448
6449 if (!reg_renumber)
6450 return false;
6451
6452 regno = reg_renumber[regno];
6453 }
6454
6455 /* The fake registers will be eliminated to either the stack or
6456 hard frame pointer, both of which are usually valid base registers.
6457 Reload deals with the cases where the eliminated form isn't valid. */
6458 return (GP_REGNUM_P (regno)
6459 || regno == SP_REGNUM
6460 || regno == FRAME_POINTER_REGNUM
6461 || regno == ARG_POINTER_REGNUM);
6462 }
6463
6464 /* Return true if X is a valid base register.
6465 STRICT_P is true if REG_OK_STRICT is in effect. */
6466
6467 static bool
6468 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6469 {
6470 if (!strict_p
6471 && GET_CODE (x) == SUBREG
6472 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6473 x = SUBREG_REG (x);
6474
6475 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6476 }
6477
6478 /* Return true if address offset is a valid index. If it is, fill in INFO
6479 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6480
6481 static bool
6482 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6483 machine_mode mode, bool strict_p)
6484 {
6485 enum aarch64_address_type type;
6486 rtx index;
6487 int shift;
6488
6489 /* (reg:P) */
6490 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6491 && GET_MODE (x) == Pmode)
6492 {
6493 type = ADDRESS_REG_REG;
6494 index = x;
6495 shift = 0;
6496 }
6497 /* (sign_extend:DI (reg:SI)) */
6498 else if ((GET_CODE (x) == SIGN_EXTEND
6499 || GET_CODE (x) == ZERO_EXTEND)
6500 && GET_MODE (x) == DImode
6501 && GET_MODE (XEXP (x, 0)) == SImode)
6502 {
6503 type = (GET_CODE (x) == SIGN_EXTEND)
6504 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6505 index = XEXP (x, 0);
6506 shift = 0;
6507 }
6508 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6509 else if (GET_CODE (x) == MULT
6510 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6511 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6512 && GET_MODE (XEXP (x, 0)) == DImode
6513 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6514 && CONST_INT_P (XEXP (x, 1)))
6515 {
6516 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6517 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6518 index = XEXP (XEXP (x, 0), 0);
6519 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6520 }
6521 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6522 else if (GET_CODE (x) == ASHIFT
6523 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6524 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6525 && GET_MODE (XEXP (x, 0)) == DImode
6526 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6527 && CONST_INT_P (XEXP (x, 1)))
6528 {
6529 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6530 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6531 index = XEXP (XEXP (x, 0), 0);
6532 shift = INTVAL (XEXP (x, 1));
6533 }
6534 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6535 else if ((GET_CODE (x) == SIGN_EXTRACT
6536 || GET_CODE (x) == ZERO_EXTRACT)
6537 && GET_MODE (x) == DImode
6538 && GET_CODE (XEXP (x, 0)) == MULT
6539 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6540 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6541 {
6542 type = (GET_CODE (x) == SIGN_EXTRACT)
6543 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6544 index = XEXP (XEXP (x, 0), 0);
6545 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6546 if (INTVAL (XEXP (x, 1)) != 32 + shift
6547 || INTVAL (XEXP (x, 2)) != 0)
6548 shift = -1;
6549 }
6550 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6551 (const_int 0xffffffff<<shift)) */
6552 else if (GET_CODE (x) == AND
6553 && GET_MODE (x) == DImode
6554 && GET_CODE (XEXP (x, 0)) == MULT
6555 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6556 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6557 && CONST_INT_P (XEXP (x, 1)))
6558 {
6559 type = ADDRESS_REG_UXTW;
6560 index = XEXP (XEXP (x, 0), 0);
6561 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6562 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6563 shift = -1;
6564 }
6565 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6566 else if ((GET_CODE (x) == SIGN_EXTRACT
6567 || GET_CODE (x) == ZERO_EXTRACT)
6568 && GET_MODE (x) == DImode
6569 && GET_CODE (XEXP (x, 0)) == ASHIFT
6570 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6571 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6572 {
6573 type = (GET_CODE (x) == SIGN_EXTRACT)
6574 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6575 index = XEXP (XEXP (x, 0), 0);
6576 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6577 if (INTVAL (XEXP (x, 1)) != 32 + shift
6578 || INTVAL (XEXP (x, 2)) != 0)
6579 shift = -1;
6580 }
6581 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6582 (const_int 0xffffffff<<shift)) */
6583 else if (GET_CODE (x) == AND
6584 && GET_MODE (x) == DImode
6585 && GET_CODE (XEXP (x, 0)) == ASHIFT
6586 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6587 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6588 && CONST_INT_P (XEXP (x, 1)))
6589 {
6590 type = ADDRESS_REG_UXTW;
6591 index = XEXP (XEXP (x, 0), 0);
6592 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6593 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6594 shift = -1;
6595 }
6596 /* (mult:P (reg:P) (const_int scale)) */
6597 else if (GET_CODE (x) == MULT
6598 && GET_MODE (x) == Pmode
6599 && GET_MODE (XEXP (x, 0)) == Pmode
6600 && CONST_INT_P (XEXP (x, 1)))
6601 {
6602 type = ADDRESS_REG_REG;
6603 index = XEXP (x, 0);
6604 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6605 }
6606 /* (ashift:P (reg:P) (const_int shift)) */
6607 else if (GET_CODE (x) == ASHIFT
6608 && GET_MODE (x) == Pmode
6609 && GET_MODE (XEXP (x, 0)) == Pmode
6610 && CONST_INT_P (XEXP (x, 1)))
6611 {
6612 type = ADDRESS_REG_REG;
6613 index = XEXP (x, 0);
6614 shift = INTVAL (XEXP (x, 1));
6615 }
6616 else
6617 return false;
6618
6619 if (!strict_p
6620 && GET_CODE (index) == SUBREG
6621 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6622 index = SUBREG_REG (index);
6623
6624 if (aarch64_sve_data_mode_p (mode))
6625 {
6626 if (type != ADDRESS_REG_REG
6627 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6628 return false;
6629 }
6630 else
6631 {
6632 if (shift != 0
6633 && !(IN_RANGE (shift, 1, 3)
6634 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6635 return false;
6636 }
6637
6638 if (REG_P (index)
6639 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6640 {
6641 info->type = type;
6642 info->offset = index;
6643 info->shift = shift;
6644 return true;
6645 }
6646
6647 return false;
6648 }
6649
6650 /* Return true if MODE is one of the modes for which we
6651 support LDP/STP operations. */
6652
6653 static bool
6654 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6655 {
6656 return mode == SImode || mode == DImode
6657 || mode == SFmode || mode == DFmode
6658 || (aarch64_vector_mode_supported_p (mode)
6659 && (known_eq (GET_MODE_SIZE (mode), 8)
6660 || (known_eq (GET_MODE_SIZE (mode), 16)
6661 && (aarch64_tune_params.extra_tuning_flags
6662 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6663 }
6664
6665 /* Return true if REGNO is a virtual pointer register, or an eliminable
6666 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6667 include stack_pointer or hard_frame_pointer. */
6668 static bool
6669 virt_or_elim_regno_p (unsigned regno)
6670 {
6671 return ((regno >= FIRST_VIRTUAL_REGISTER
6672 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6673 || regno == FRAME_POINTER_REGNUM
6674 || regno == ARG_POINTER_REGNUM);
6675 }
6676
6677 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6678 If it is, fill in INFO appropriately. STRICT_P is true if
6679 REG_OK_STRICT is in effect. */
6680
6681 bool
6682 aarch64_classify_address (struct aarch64_address_info *info,
6683 rtx x, machine_mode mode, bool strict_p,
6684 aarch64_addr_query_type type)
6685 {
6686 enum rtx_code code = GET_CODE (x);
6687 rtx op0, op1;
6688 poly_int64 offset;
6689
6690 HOST_WIDE_INT const_size;
6691
6692 /* On BE, we use load/store pair for all large int mode load/stores.
6693 TI/TFmode may also use a load/store pair. */
6694 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6695 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6696 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6697 || type == ADDR_QUERY_LDP_STP_N
6698 || mode == TImode
6699 || mode == TFmode
6700 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6701
6702 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
6703 corresponds to the actual size of the memory being loaded/stored and the
6704 mode used for the addressing calculation is half of that. */
6705 if (type == ADDR_QUERY_LDP_STP_N
6706 && known_eq (GET_MODE_SIZE (mode), 16))
6707 mode = DFmode;
6708
6709 bool allow_reg_index_p = (!load_store_pair_p
6710 && (known_lt (GET_MODE_SIZE (mode), 16)
6711 || vec_flags == VEC_ADVSIMD
6712 || vec_flags & VEC_SVE_DATA));
6713
6714 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6715 [Rn, #offset, MUL VL]. */
6716 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6717 && (code != REG && code != PLUS))
6718 return false;
6719
6720 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6721 REG addressing. */
6722 if (advsimd_struct_p
6723 && !BYTES_BIG_ENDIAN
6724 && (code != POST_INC && code != REG))
6725 return false;
6726
6727 gcc_checking_assert (GET_MODE (x) == VOIDmode
6728 || SCALAR_INT_MODE_P (GET_MODE (x)));
6729
6730 switch (code)
6731 {
6732 case REG:
6733 case SUBREG:
6734 info->type = ADDRESS_REG_IMM;
6735 info->base = x;
6736 info->offset = const0_rtx;
6737 info->const_offset = 0;
6738 return aarch64_base_register_rtx_p (x, strict_p);
6739
6740 case PLUS:
6741 op0 = XEXP (x, 0);
6742 op1 = XEXP (x, 1);
6743
6744 if (! strict_p
6745 && REG_P (op0)
6746 && virt_or_elim_regno_p (REGNO (op0))
6747 && poly_int_rtx_p (op1, &offset))
6748 {
6749 info->type = ADDRESS_REG_IMM;
6750 info->base = op0;
6751 info->offset = op1;
6752 info->const_offset = offset;
6753
6754 return true;
6755 }
6756
6757 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6758 && aarch64_base_register_rtx_p (op0, strict_p)
6759 && poly_int_rtx_p (op1, &offset))
6760 {
6761 info->type = ADDRESS_REG_IMM;
6762 info->base = op0;
6763 info->offset = op1;
6764 info->const_offset = offset;
6765
6766 /* TImode and TFmode values are allowed in both pairs of X
6767 registers and individual Q registers. The available
6768 address modes are:
6769 X,X: 7-bit signed scaled offset
6770 Q: 9-bit signed offset
6771 We conservatively require an offset representable in either mode.
6772 When performing the check for pairs of X registers i.e. LDP/STP
6773 pass down DImode since that is the natural size of the LDP/STP
6774 instruction memory accesses. */
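	  /* For example, for a TImode access an offset of 256 is accepted:
	     256/8 == 32 is in the signed 7-bit range and 256/16 == 16 is in
	     the unsigned 12-bit range.  An offset of 1024 is rejected
	     because 1024/8 == 128 exceeds the signed 7-bit range.  */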
6775 if (mode == TImode || mode == TFmode)
6776 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6777 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6778 || offset_12bit_unsigned_scaled_p (mode, offset)));
6779
6780 /* A 7-bit offset check because OImode will emit an ldp/stp
6781 instruction (only big endian will get here).
6782 For ldp/stp instructions, the offset is scaled for the size of a
6783 single element of the pair. */
6784 if (mode == OImode)
6785 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6786
6787 /* Three 9/12-bit offset checks because CImode will emit three
6788 ldr/str instructions (only big endian will get here). */
6789 if (mode == CImode)
6790 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6791 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6792 offset + 32)
6793 || offset_12bit_unsigned_scaled_p (V16QImode,
6794 offset + 32)));
6795
6796 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6797 instructions (only big endian will get here). */
6798 if (mode == XImode)
6799 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6800 && aarch64_offset_7bit_signed_scaled_p (TImode,
6801 offset + 32));
6802
6803 /* Make "m" use the LD1 offset range for SVE data modes, so
6804 that pre-RTL optimizers like ivopts will work to that
6805 instead of the wider LDR/STR range. */
6806 if (vec_flags == VEC_SVE_DATA)
6807 return (type == ADDR_QUERY_M
6808 ? offset_4bit_signed_scaled_p (mode, offset)
6809 : offset_9bit_signed_scaled_p (mode, offset));
6810
6811 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6812 {
6813 poly_int64 end_offset = (offset
6814 + GET_MODE_SIZE (mode)
6815 - BYTES_PER_SVE_VECTOR);
6816 return (type == ADDR_QUERY_M
6817 ? offset_4bit_signed_scaled_p (mode, offset)
6818 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6819 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6820 end_offset)));
6821 }
6822
6823 if (vec_flags == VEC_SVE_PRED)
6824 return offset_9bit_signed_scaled_p (mode, offset);
6825
6826 if (load_store_pair_p)
6827 return ((known_eq (GET_MODE_SIZE (mode), 4)
6828 || known_eq (GET_MODE_SIZE (mode), 8)
6829 || known_eq (GET_MODE_SIZE (mode), 16))
6830 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6831 else
6832 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6833 || offset_12bit_unsigned_scaled_p (mode, offset));
6834 }
6835
6836 if (allow_reg_index_p)
6837 {
6838 /* Look for base + (scaled/extended) index register. */
6839 if (aarch64_base_register_rtx_p (op0, strict_p)
6840 && aarch64_classify_index (info, op1, mode, strict_p))
6841 {
6842 info->base = op0;
6843 return true;
6844 }
6845 if (aarch64_base_register_rtx_p (op1, strict_p)
6846 && aarch64_classify_index (info, op0, mode, strict_p))
6847 {
6848 info->base = op1;
6849 return true;
6850 }
6851 }
6852
6853 return false;
6854
6855 case POST_INC:
6856 case POST_DEC:
6857 case PRE_INC:
6858 case PRE_DEC:
6859 info->type = ADDRESS_REG_WB;
6860 info->base = XEXP (x, 0);
6861 info->offset = NULL_RTX;
6862 return aarch64_base_register_rtx_p (info->base, strict_p);
6863
6864 case POST_MODIFY:
6865 case PRE_MODIFY:
6866 info->type = ADDRESS_REG_WB;
6867 info->base = XEXP (x, 0);
6868 if (GET_CODE (XEXP (x, 1)) == PLUS
6869 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6870 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6871 && aarch64_base_register_rtx_p (info->base, strict_p))
6872 {
6873 info->offset = XEXP (XEXP (x, 1), 1);
6874 info->const_offset = offset;
6875
6876 /* TImode and TFmode values are allowed in both pairs of X
6877 registers and individual Q registers. The available
6878 address modes are:
6879 X,X: 7-bit signed scaled offset
6880 Q: 9-bit signed offset
6881 We conservatively require an offset representable in both addressing modes.
6882 */
6883 if (mode == TImode || mode == TFmode)
6884 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6885 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6886
6887 if (load_store_pair_p)
6888 return ((known_eq (GET_MODE_SIZE (mode), 4)
6889 || known_eq (GET_MODE_SIZE (mode), 8)
6890 || known_eq (GET_MODE_SIZE (mode), 16))
6891 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6892 else
6893 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6894 }
6895 return false;
6896
6897 case CONST:
6898 case SYMBOL_REF:
6899 case LABEL_REF:
6900 /* load literal: pc-relative constant pool entry. Only supported
6901 for SI mode or larger. */
6902 info->type = ADDRESS_SYMBOLIC;
6903
6904 if (!load_store_pair_p
6905 && GET_MODE_SIZE (mode).is_constant (&const_size)
6906 && const_size >= 4)
6907 {
6908 rtx sym, addend;
6909
6910 split_const (x, &sym, &addend);
6911 return ((GET_CODE (sym) == LABEL_REF
6912 || (GET_CODE (sym) == SYMBOL_REF
6913 && CONSTANT_POOL_ADDRESS_P (sym)
6914 && aarch64_pcrelative_literal_loads)));
6915 }
6916 return false;
6917
6918 case LO_SUM:
6919 info->type = ADDRESS_LO_SUM;
6920 info->base = XEXP (x, 0);
6921 info->offset = XEXP (x, 1);
6922 if (allow_reg_index_p
6923 && aarch64_base_register_rtx_p (info->base, strict_p))
6924 {
6925 rtx sym, offs;
6926 split_const (info->offset, &sym, &offs);
6927 if (GET_CODE (sym) == SYMBOL_REF
6928 && (aarch64_classify_symbol (sym, INTVAL (offs))
6929 == SYMBOL_SMALL_ABSOLUTE))
6930 {
6931 /* The symbol and offset must be aligned to the access size. */
6932 unsigned int align;
6933
6934 if (CONSTANT_POOL_ADDRESS_P (sym))
6935 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6936 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6937 {
6938 tree exp = SYMBOL_REF_DECL (sym);
6939 align = TYPE_ALIGN (TREE_TYPE (exp));
6940 align = aarch64_constant_alignment (exp, align);
6941 }
6942 else if (SYMBOL_REF_DECL (sym))
6943 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6944 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6945 && SYMBOL_REF_BLOCK (sym) != NULL)
6946 align = SYMBOL_REF_BLOCK (sym)->alignment;
6947 else
6948 align = BITS_PER_UNIT;
6949
6950 poly_int64 ref_size = GET_MODE_SIZE (mode);
6951 if (known_eq (ref_size, 0))
6952 ref_size = GET_MODE_SIZE (DImode);
6953
6954 return (multiple_p (INTVAL (offs), ref_size)
6955 && multiple_p (align / BITS_PER_UNIT, ref_size));
6956 }
6957 }
6958 return false;
6959
6960 default:
6961 return false;
6962 }
6963 }
6964
6965 /* Return true if the address X is valid for a PRFM instruction.
6966 STRICT_P is true if we should do strict checking with
6967 aarch64_classify_address. */
6968
6969 bool
6970 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6971 {
6972 struct aarch64_address_info addr;
6973
6974 /* PRFM accepts the same addresses as DImode... */
6975 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6976 if (!res)
6977 return false;
6978
6979 /* ... except writeback forms. */
6980 return addr.type != ADDRESS_REG_WB;
6981 }
6982
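/* Return true if X is a symbolic address, i.e. a SYMBOL_REF or LABEL_REF,
   possibly wrapped in a CONST with a constant offset.  */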
6983 bool
6984 aarch64_symbolic_address_p (rtx x)
6985 {
6986 rtx offset;
6987
6988 split_const (x, &x, &offset);
6989 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6990 }
6991
6992 /* Classify the base of symbolic expression X. */
6993
6994 enum aarch64_symbol_type
6995 aarch64_classify_symbolic_expression (rtx x)
6996 {
6997 rtx offset;
6998
6999 split_const (x, &x, &offset);
7000 return aarch64_classify_symbol (x, INTVAL (offset));
7001 }
7002
7003
7004 /* Return TRUE if X is a legitimate address for accessing memory in
7005 mode MODE. */
7006 static bool
7007 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7008 {
7009 struct aarch64_address_info addr;
7010
7011 return aarch64_classify_address (&addr, x, mode, strict_p);
7012 }
7013
7014 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7015 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7016 bool
7017 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7018 aarch64_addr_query_type type)
7019 {
7020 struct aarch64_address_info addr;
7021
7022 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7023 }
7024
7025 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7026
7027 static bool
7028 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7029 poly_int64 orig_offset,
7030 machine_mode mode)
7031 {
7032 HOST_WIDE_INT size;
7033 if (GET_MODE_SIZE (mode).is_constant (&size))
7034 {
7035 HOST_WIDE_INT const_offset, second_offset;
7036
7037 /* A general SVE offset is A * VQ + B. Remove the A component from
7038 coefficient 0 in order to get the constant B. */
7039 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7040
7041 /* Split an out-of-range address displacement into a base and
7042 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7043 range otherwise to increase opportunities for sharing the base
7044 address of different sizes. Unaligned accesses use the signed
7045 9-bit range, TImode/TFmode use the intersection of signed
7046 scaled 7-bit and signed 9-bit offset. */
7047 if (mode == TImode || mode == TFmode)
7048 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7049 else if ((const_offset & (size - 1)) != 0)
7050 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7051 else
7052 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7053
7054 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7055 return false;
7056
7057 /* Split the offset into second_offset and the rest. */
7058 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7059 *offset2 = gen_int_mode (second_offset, Pmode);
7060 return true;
7061 }
7062 else
7063 {
7064 /* Get the mode we should use as the basis of the range. For structure
7065 modes this is the mode of one vector. */
7066 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7067 machine_mode step_mode
7068 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7069
7070 /* Get the "mul vl" multiplier we'd like to use. */
7071 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7072 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7073 if (vec_flags & VEC_SVE_DATA)
7074 /* LDR supports a 9-bit range, but the move patterns for
7075 structure modes require all vectors to be in range of the
7076 same base. The simplest way of accommodating that while still
7077 promoting reuse of anchor points between different modes is
7078 to use an 8-bit range unconditionally. */
7079 vnum = ((vnum + 128) & 255) - 128;
7080 else
7081 /* Predicates are only handled singly, so we might as well use
7082 the full range. */
7083 vnum = ((vnum + 256) & 511) - 256;
7084 if (vnum == 0)
7085 return false;
7086
7087 /* Convert the "mul vl" multiplier into a byte offset. */
7088 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7089 if (known_eq (second_offset, orig_offset))
7090 return false;
7091
7092 /* Split the offset into second_offset and the rest. */
7093 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7094 *offset2 = gen_int_mode (second_offset, Pmode);
7095 return true;
7096 }
7097 }
7098
7099 /* Return the binary representation of floating point constant VALUE in INTVAL.
7100 If the value cannot be converted, return false without setting INTVAL.
7101 The conversion is done in the given MODE. */
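/* Illustrative example: for the DFmode constant 1.0 this stores the IEEE
   double-precision bit pattern 0x3ff0000000000000 in *INTVAL and returns
   true; for 0.0 it stores 0 via the early exit below.  */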
7102 bool
7103 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7104 {
7105
7106 /* We make a general exception for 0. */
7107 if (aarch64_float_const_zero_rtx_p (value))
7108 {
7109 *intval = 0;
7110 return true;
7111 }
7112
7113 scalar_float_mode mode;
7114 if (GET_CODE (value) != CONST_DOUBLE
7115 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7116 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7117 /* Only support up to DF mode. */
7118 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7119 return false;
7120
7121 unsigned HOST_WIDE_INT ival = 0;
7122
7123 long res[2];
7124 real_to_target (res,
7125 CONST_DOUBLE_REAL_VALUE (value),
7126 REAL_MODE_FORMAT (mode));
7127
7128 if (mode == DFmode)
7129 {
7130 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7131 ival = zext_hwi (res[order], 32);
7132 ival |= (zext_hwi (res[1 - order], 32) << 32);
7133 }
7134 else
7135 ival = zext_hwi (res[0], 32);
7136
7137 *intval = ival;
7138 return true;
7139 }
7140
7141 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7142 single MOV(+MOVK) followed by an FMOV. */
7143 bool
7144 aarch64_float_const_rtx_p (rtx x)
7145 {
7146 machine_mode mode = GET_MODE (x);
7147 if (mode == VOIDmode)
7148 return false;
7149
7150 /* Determine whether it's cheaper to write float constants as
7151 mov/movk pairs rather than ldr/adrp pairs. */
7152 unsigned HOST_WIDE_INT ival;
7153
7154 if (GET_CODE (x) == CONST_DOUBLE
7155 && SCALAR_FLOAT_MODE_P (mode)
7156 && aarch64_reinterpret_float_as_int (x, &ival))
7157 {
7158 scalar_int_mode imode = (mode == HFmode
7159 ? SImode
7160 : int_mode_for_mode (mode).require ());
7161 int num_instr = aarch64_internal_mov_immediate
7162 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7163 return num_instr < 3;
7164 }
7165
7166 return false;
7167 }
7168
7169 /* Return TRUE if rtx X is the immediate constant 0.0. */
7170 bool
7171 aarch64_float_const_zero_rtx_p (rtx x)
7172 {
7173 if (GET_MODE (x) == VOIDmode)
7174 return false;
7175
7176 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7177 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7178 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7179 }
7180
7181 /* Return TRUE if rtx X is an immediate constant that fits in a single
7182 MOVI immediate operation. */
7183 bool
7184 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7185 {
7186 if (!TARGET_SIMD)
7187 return false;
7188
7189 machine_mode vmode;
7190 scalar_int_mode imode;
7191 unsigned HOST_WIDE_INT ival;
7192
7193 if (GET_CODE (x) == CONST_DOUBLE
7194 && SCALAR_FLOAT_MODE_P (mode))
7195 {
7196 if (!aarch64_reinterpret_float_as_int (x, &ival))
7197 return false;
7198
7199 /* We make a general exception for 0. */
7200 if (aarch64_float_const_zero_rtx_p (x))
7201 return true;
7202
7203 imode = int_mode_for_mode (mode).require ();
7204 }
7205 else if (GET_CODE (x) == CONST_INT
7206 && is_a <scalar_int_mode> (mode, &imode))
7207 ival = INTVAL (x);
7208 else
7209 return false;
7210
7211 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
7212 a 128-bit vector mode. */
7213 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7214
7215 vmode = aarch64_simd_container_mode (imode, width);
7216 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7217
7218 return aarch64_simd_valid_immediate (v_op, NULL);
7219 }
7220
7221
7222 /* Return the fixed registers used for condition codes. */
7223
7224 static bool
7225 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7226 {
7227 *p1 = CC_REGNUM;
7228 *p2 = INVALID_REGNUM;
7229 return true;
7230 }
7231
7232 /* This function is used by the call expanders of the machine description.
7233 RESULT is the register in which the result is returned. It's NULL for
7234 "call" and "sibcall".
7235 MEM is the location of the function call.
7236 SIBCALL indicates whether this function call is normal call or sibling call.
7237 It will generate different pattern accordingly. */
7238
7239 void
7240 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7241 {
7242 rtx call, callee, tmp;
7243 rtvec vec;
7244 machine_mode mode;
7245
7246 gcc_assert (MEM_P (mem));
7247 callee = XEXP (mem, 0);
7248 mode = GET_MODE (callee);
7249 gcc_assert (mode == Pmode);
7250
7251 /* Decide if we should generate indirect calls by loading the
7252 address of the callee into a register before performing
7253 the branch-and-link. */
7254 if (SYMBOL_REF_P (callee)
7255 ? (aarch64_is_long_call_p (callee)
7256 || aarch64_is_noplt_call_p (callee))
7257 : !REG_P (callee))
7258 XEXP (mem, 0) = force_reg (mode, callee);
7259
7260 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7261
7262 if (result != NULL_RTX)
7263 call = gen_rtx_SET (result, call);
7264
7265 if (sibcall)
7266 tmp = ret_rtx;
7267 else
7268 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7269
7270 vec = gen_rtvec (2, call, tmp);
7271 call = gen_rtx_PARALLEL (VOIDmode, vec);
7272
7273 aarch64_emit_call_insn (call);
7274 }
7275
7276 /* Emit call insn with PAT and do aarch64-specific handling. */
7277
7278 void
7279 aarch64_emit_call_insn (rtx pat)
7280 {
7281 rtx insn = emit_call_insn (pat);
7282
7283 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7284 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7285 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7286 }
7287
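/* Choose the condition-code mode to use for a comparison of X with Y using
   comparison code CODE (used via the SELECT_CC_MODE macro).  For example,
   an unsigned-overflow test such as (LTU (plus:DI x y) x) selects CC_Cmode,
   while most general integer comparisons fall through to plain CCmode.  */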
7288 machine_mode
7289 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7290 {
7291 machine_mode mode_x = GET_MODE (x);
7292 rtx_code code_x = GET_CODE (x);
7293
7294 /* All floating point compares return CCFP if it is an equality
7295 comparison, and CCFPE otherwise. */
7296 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7297 {
7298 switch (code)
7299 {
7300 case EQ:
7301 case NE:
7302 case UNORDERED:
7303 case ORDERED:
7304 case UNLT:
7305 case UNLE:
7306 case UNGT:
7307 case UNGE:
7308 case UNEQ:
7309 return CCFPmode;
7310
7311 case LT:
7312 case LE:
7313 case GT:
7314 case GE:
7315 case LTGT:
7316 return CCFPEmode;
7317
7318 default:
7319 gcc_unreachable ();
7320 }
7321 }
7322
7323 /* Equality comparisons of short modes against zero can be performed
7324 using the TST instruction with the appropriate bitmask. */
7325 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7326 && (code == EQ || code == NE)
7327 && (mode_x == HImode || mode_x == QImode))
7328 return CC_NZmode;
7329
7330 /* Similarly, comparisons of zero_extends from shorter modes can
7331 be performed using an ANDS with an immediate mask. */
7332 if (y == const0_rtx && code_x == ZERO_EXTEND
7333 && (mode_x == SImode || mode_x == DImode)
7334 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7335 && (code == EQ || code == NE))
7336 return CC_NZmode;
7337
7338 if ((mode_x == SImode || mode_x == DImode)
7339 && y == const0_rtx
7340 && (code == EQ || code == NE || code == LT || code == GE)
7341 && (code_x == PLUS || code_x == MINUS || code_x == AND
7342 || code_x == NEG
7343 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7344 && CONST_INT_P (XEXP (x, 2)))))
7345 return CC_NZmode;
7346
7347 /* A compare with a shifted operand. Because of canonicalization,
7348 the comparison will have to be swapped when we emit the assembly
7349 code. */
7350 if ((mode_x == SImode || mode_x == DImode)
7351 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7352 && (code_x == ASHIFT || code_x == ASHIFTRT
7353 || code_x == LSHIFTRT
7354 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7355 return CC_SWPmode;
7356
7357 /* Similarly for a negated operand, but we can only do this for
7358 equalities. */
7359 if ((mode_x == SImode || mode_x == DImode)
7360 && (REG_P (y) || GET_CODE (y) == SUBREG)
7361 && (code == EQ || code == NE)
7362 && code_x == NEG)
7363 return CC_Zmode;
7364
7365 /* A test for unsigned overflow from an addition. */
7366 if ((mode_x == DImode || mode_x == TImode)
7367 && (code == LTU || code == GEU)
7368 && code_x == PLUS
7369 && rtx_equal_p (XEXP (x, 0), y))
7370 return CC_Cmode;
7371
7372 /* A test for unsigned overflow from an add with carry. */
7373 if ((mode_x == DImode || mode_x == TImode)
7374 && (code == LTU || code == GEU)
7375 && code_x == PLUS
7376 && CONST_SCALAR_INT_P (y)
7377 && (rtx_mode_t (y, mode_x)
7378 == (wi::shwi (1, mode_x)
7379 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7380 return CC_ADCmode;
7381
7382 /* A test for signed overflow. */
7383 if ((mode_x == DImode || mode_x == TImode)
7384 && code == NE
7385 && code_x == PLUS
7386 && GET_CODE (y) == SIGN_EXTEND)
7387 return CC_Vmode;
7388
7389 /* For everything else, return CCmode. */
7390 return CCmode;
7391 }
7392
7393 static int
7394 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7395
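/* Return the AARCH64_* condition code corresponding to comparison rtx X,
   deriving a CC mode via SELECT_CC_MODE if the operands are not already in
   a MODE_CC mode.  Returns -1 if there is no matching condition code.  */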
7396 int
7397 aarch64_get_condition_code (rtx x)
7398 {
7399 machine_mode mode = GET_MODE (XEXP (x, 0));
7400 enum rtx_code comp_code = GET_CODE (x);
7401
7402 if (GET_MODE_CLASS (mode) != MODE_CC)
7403 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7404 return aarch64_get_condition_code_1 (mode, comp_code);
7405 }
7406
7407 static int
7408 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7409 {
7410 switch (mode)
7411 {
7412 case E_CCFPmode:
7413 case E_CCFPEmode:
7414 switch (comp_code)
7415 {
7416 case GE: return AARCH64_GE;
7417 case GT: return AARCH64_GT;
7418 case LE: return AARCH64_LS;
7419 case LT: return AARCH64_MI;
7420 case NE: return AARCH64_NE;
7421 case EQ: return AARCH64_EQ;
7422 case ORDERED: return AARCH64_VC;
7423 case UNORDERED: return AARCH64_VS;
7424 case UNLT: return AARCH64_LT;
7425 case UNLE: return AARCH64_LE;
7426 case UNGT: return AARCH64_HI;
7427 case UNGE: return AARCH64_PL;
7428 default: return -1;
7429 }
7430 break;
7431
7432 case E_CCmode:
7433 switch (comp_code)
7434 {
7435 case NE: return AARCH64_NE;
7436 case EQ: return AARCH64_EQ;
7437 case GE: return AARCH64_GE;
7438 case GT: return AARCH64_GT;
7439 case LE: return AARCH64_LE;
7440 case LT: return AARCH64_LT;
7441 case GEU: return AARCH64_CS;
7442 case GTU: return AARCH64_HI;
7443 case LEU: return AARCH64_LS;
7444 case LTU: return AARCH64_CC;
7445 default: return -1;
7446 }
7447 break;
7448
7449 case E_CC_SWPmode:
7450 switch (comp_code)
7451 {
7452 case NE: return AARCH64_NE;
7453 case EQ: return AARCH64_EQ;
7454 case GE: return AARCH64_LE;
7455 case GT: return AARCH64_LT;
7456 case LE: return AARCH64_GE;
7457 case LT: return AARCH64_GT;
7458 case GEU: return AARCH64_LS;
7459 case GTU: return AARCH64_CC;
7460 case LEU: return AARCH64_CS;
7461 case LTU: return AARCH64_HI;
7462 default: return -1;
7463 }
7464 break;
7465
7466 case E_CC_NZCmode:
7467 switch (comp_code)
7468 {
7469 case NE: return AARCH64_NE; /* = any */
7470 case EQ: return AARCH64_EQ; /* = none */
7471 case GE: return AARCH64_PL; /* = nfrst */
7472 case LT: return AARCH64_MI; /* = first */
7473 case GEU: return AARCH64_CS; /* = nlast */
7474 case GTU: return AARCH64_HI; /* = pmore */
7475 case LEU: return AARCH64_LS; /* = plast */
7476 case LTU: return AARCH64_CC; /* = last */
7477 default: return -1;
7478 }
7479 break;
7480
7481 case E_CC_NZmode:
7482 switch (comp_code)
7483 {
7484 case NE: return AARCH64_NE;
7485 case EQ: return AARCH64_EQ;
7486 case GE: return AARCH64_PL;
7487 case LT: return AARCH64_MI;
7488 default: return -1;
7489 }
7490 break;
7491
7492 case E_CC_Zmode:
7493 switch (comp_code)
7494 {
7495 case NE: return AARCH64_NE;
7496 case EQ: return AARCH64_EQ;
7497 default: return -1;
7498 }
7499 break;
7500
7501 case E_CC_Cmode:
7502 switch (comp_code)
7503 {
7504 case LTU: return AARCH64_CS;
7505 case GEU: return AARCH64_CC;
7506 default: return -1;
7507 }
7508 break;
7509
7510 case E_CC_ADCmode:
7511 switch (comp_code)
7512 {
7513 case GEU: return AARCH64_CS;
7514 case LTU: return AARCH64_CC;
7515 default: return -1;
7516 }
7517 break;
7518
7519 case E_CC_Vmode:
7520 switch (comp_code)
7521 {
7522 case NE: return AARCH64_VS;
7523 case EQ: return AARCH64_VC;
7524 default: return -1;
7525 }
7526 break;
7527
7528 default:
7529 return -1;
7530 }
7531
7532 return -1;
7533 }
7534
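/* Return true if X is a duplicated constant vector whose (single) element
   is a CONST_INT in the range [MINVAL, MAXVAL].  */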
7535 bool
7536 aarch64_const_vec_all_same_in_range_p (rtx x,
7537 HOST_WIDE_INT minval,
7538 HOST_WIDE_INT maxval)
7539 {
7540 rtx elt;
7541 return (const_vec_duplicate_p (x, &elt)
7542 && CONST_INT_P (elt)
7543 && IN_RANGE (INTVAL (elt), minval, maxval));
7544 }
7545
7546 bool
7547 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7548 {
7549 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7550 }
7551
7552 /* Return true if VEC is a constant in which every element is in the range
7553 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7554
7555 static bool
7556 aarch64_const_vec_all_in_range_p (rtx vec,
7557 HOST_WIDE_INT minval,
7558 HOST_WIDE_INT maxval)
7559 {
7560 if (GET_CODE (vec) != CONST_VECTOR
7561 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7562 return false;
7563
7564 int nunits;
7565 if (!CONST_VECTOR_STEPPED_P (vec))
7566 nunits = const_vector_encoded_nelts (vec);
7567 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7568 return false;
7569
7570 for (int i = 0; i < nunits; i++)
7571 {
7572 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7573 if (!CONST_INT_P (vec_elem)
7574 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7575 return false;
7576 }
7577 return true;
7578 }
7579
7580 /* N Z C V. */
7581 #define AARCH64_CC_V 1
7582 #define AARCH64_CC_C (1 << 1)
7583 #define AARCH64_CC_Z (1 << 2)
7584 #define AARCH64_CC_N (1 << 3)
7585
7586 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7587 static const int aarch64_nzcv_codes[] =
7588 {
7589 0, /* EQ, Z == 1. */
7590 AARCH64_CC_Z, /* NE, Z == 0. */
7591 0, /* CS, C == 1. */
7592 AARCH64_CC_C, /* CC, C == 0. */
7593 0, /* MI, N == 1. */
7594 AARCH64_CC_N, /* PL, N == 0. */
7595 0, /* VS, V == 1. */
7596 AARCH64_CC_V, /* VC, V == 0. */
7597 0, /* HI, C == 1 && Z == 0. */
7598 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7599 AARCH64_CC_V, /* GE, N == V. */
7600 0, /* LT, N != V. */
7601 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7602 0, /* LE, !(Z == 0 && N == V). */
7603 0, /* AL, Any. */
7604 0 /* NV, Any. */
7605 };
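/* Each entry above encodes a flag setting under which the corresponding
   condition is false (e.g. the GE entry sets V, making N != V, and the NE
   entry sets Z); the 'k' operand modifier below prints these values for
   conditional compare instructions.  */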
7606
7607 /* Print floating-point vector immediate operand X to F, negating it
7608 first if NEGATE is true. Return true on success, false if it isn't
7609 a constant we can handle. */
7610
7611 static bool
7612 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7613 {
7614 rtx elt;
7615
7616 if (!const_vec_duplicate_p (x, &elt))
7617 return false;
7618
7619 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7620 if (negate)
7621 r = real_value_negate (&r);
7622
7623 /* We only handle the SVE single-bit immediates here. */
7624 if (real_equal (&r, &dconst0))
7625 asm_fprintf (f, "0.0");
7626 else if (real_equal (&r, &dconst1))
7627 asm_fprintf (f, "1.0");
7628 else if (real_equal (&r, &dconsthalf))
7629 asm_fprintf (f, "0.5");
7630 else
7631 return false;
7632
7633 return true;
7634 }
7635
7636 /* Return the equivalent letter for size. */
7637 static char
7638 sizetochar (int size)
7639 {
7640 switch (size)
7641 {
7642 case 64: return 'd';
7643 case 32: return 's';
7644 case 16: return 'h';
7645 case 8 : return 'b';
7646 default: gcc_unreachable ();
7647 }
7648 }
7649
7650 /* Print operand X to file F in a target specific manner according to CODE.
7651 The acceptable formatting commands given by CODE are:
7652 'c': An integer or symbol address without a preceding #
7653 sign.
7654 'C': Take the duplicated element in a vector constant
7655 and print it in hex.
7656 'D': Take the duplicated element in a vector constant
7657 and print it as an unsigned integer, in decimal.
7658 'e': Print the sign/zero-extend size as a character 8->b,
7659 16->h, 32->w.
7660 'p': Prints N such that 2^N == X (X must be power of 2 and
7661 const int).
7662 'P': Print the number of non-zero bits in X (a const_int).
7663 'H': Print the higher numbered register of a pair (TImode)
7664 of regs.
7665 'm': Print a condition (eq, ne, etc).
7666 'M': Same as 'm', but invert condition.
7667 'N': Take the duplicated element in a vector constant
7668 and print the negative of it in decimal.
7669 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7670 'S/T/U/V': Print a FP/SIMD register name for a register list.
7671 The register printed is the FP/SIMD register name
7672 of X + 0/1/2/3 for S/T/U/V.
7673 'R': Print a scalar FP/SIMD register name + 1.
7674 'X': Print bottom 16 bits of integer constant in hex.
7675 'w/x': Print a general register name or the zero register
7676 (32-bit or 64-bit).
7677 '0': Print a normal operand, if it's a general register,
7678 then we assume DImode.
7679 'k': Print NZCV for conditional compare instructions.
7680 'A': Output address constant representing the first
7681 argument of X, specifying a relocation offset
7682 if appropriate.
7683 'L': Output constant address specified by X
7684 with a relocation offset if appropriate.
7685 'G': Prints address of X, specifying a PC relative
7686 relocation mode if appropriate.
7687 'y': Output address of LDP or STP - this is used for
7688 some LDP/STPs which don't use a PARALLEL in their
7689 pattern (so the mode needs to be adjusted).
7690 'z': Output address of a typical LDP or STP. */
7691
7692 static void
7693 aarch64_print_operand (FILE *f, rtx x, int code)
7694 {
7695 rtx elt;
7696 switch (code)
7697 {
7698 case 'c':
7699 switch (GET_CODE (x))
7700 {
7701 case CONST_INT:
7702 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7703 break;
7704
7705 case SYMBOL_REF:
7706 output_addr_const (f, x);
7707 break;
7708
7709 case CONST:
7710 if (GET_CODE (XEXP (x, 0)) == PLUS
7711 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7712 {
7713 output_addr_const (f, x);
7714 break;
7715 }
7716 /* Fall through. */
7717
7718 default:
7719 output_operand_lossage ("unsupported operand for code '%c'", code);
7720 }
7721 break;
7722
7723 case 'e':
7724 {
7725 int n;
7726
7727 if (!CONST_INT_P (x)
7728 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7729 {
7730 output_operand_lossage ("invalid operand for '%%%c'", code);
7731 return;
7732 }
7733
7734 switch (n)
7735 {
7736 case 3:
7737 fputc ('b', f);
7738 break;
7739 case 4:
7740 fputc ('h', f);
7741 break;
7742 case 5:
7743 fputc ('w', f);
7744 break;
7745 default:
7746 output_operand_lossage ("invalid operand for '%%%c'", code);
7747 return;
7748 }
7749 }
7750 break;
7751
7752 case 'p':
7753 {
7754 int n;
7755
7756 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7757 {
7758 output_operand_lossage ("invalid operand for '%%%c'", code);
7759 return;
7760 }
7761
7762 asm_fprintf (f, "%d", n);
7763 }
7764 break;
7765
7766 case 'P':
7767 if (!CONST_INT_P (x))
7768 {
7769 output_operand_lossage ("invalid operand for '%%%c'", code);
7770 return;
7771 }
7772
7773 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7774 break;
7775
7776 case 'H':
7777 if (x == const0_rtx)
7778 {
7779 asm_fprintf (f, "xzr");
7780 break;
7781 }
7782
7783 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7784 {
7785 output_operand_lossage ("invalid operand for '%%%c'", code);
7786 return;
7787 }
7788
7789 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7790 break;
7791
7792 case 'M':
7793 case 'm':
7794 {
7795 int cond_code;
7796 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7797 if (x == const_true_rtx)
7798 {
7799 if (code == 'M')
7800 fputs ("nv", f);
7801 return;
7802 }
7803
7804 if (!COMPARISON_P (x))
7805 {
7806 output_operand_lossage ("invalid operand for '%%%c'", code);
7807 return;
7808 }
7809
7810 cond_code = aarch64_get_condition_code (x);
7811 gcc_assert (cond_code >= 0);
7812 if (code == 'M')
7813 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7814 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
7815 fputs (aarch64_sve_condition_codes[cond_code], f);
7816 else
7817 fputs (aarch64_condition_codes[cond_code], f);
7818 }
7819 break;
7820
7821 case 'N':
7822 if (!const_vec_duplicate_p (x, &elt))
7823 {
7824 output_operand_lossage ("invalid vector constant");
7825 return;
7826 }
7827
7828 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7829 asm_fprintf (f, "%wd", -INTVAL (elt));
7830 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7831 && aarch64_print_vector_float_operand (f, x, true))
7832 ;
7833 else
7834 {
7835 output_operand_lossage ("invalid vector constant");
7836 return;
7837 }
7838 break;
7839
7840 case 'b':
7841 case 'h':
7842 case 's':
7843 case 'd':
7844 case 'q':
7845 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7846 {
7847 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7848 return;
7849 }
7850 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7851 break;
7852
7853 case 'S':
7854 case 'T':
7855 case 'U':
7856 case 'V':
7857 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7858 {
7859 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7860 return;
7861 }
7862 asm_fprintf (f, "%c%d",
7863 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7864 REGNO (x) - V0_REGNUM + (code - 'S'));
7865 break;
7866
7867 case 'R':
7868 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7869 {
7870 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7871 return;
7872 }
7873 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7874 break;
7875
7876 case 'X':
7877 if (!CONST_INT_P (x))
7878 {
7879 output_operand_lossage ("invalid operand for '%%%c'", code);
7880 return;
7881 }
7882 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7883 break;
7884
7885 case 'C':
7886 {
7887 /* Print a replicated constant in hex. */
7888 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7889 {
7890 output_operand_lossage ("invalid operand for '%%%c'", code);
7891 return;
7892 }
7893 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7894 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7895 }
7896 break;
7897
7898 case 'D':
7899 {
7900 /* Print a replicated constant in decimal, treating it as
7901 unsigned. */
7902 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7903 {
7904 output_operand_lossage ("invalid operand for '%%%c'", code);
7905 return;
7906 }
7907 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7908 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7909 }
7910 break;
7911
7912 case 'w':
7913 case 'x':
7914 if (x == const0_rtx
7915 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7916 {
7917 asm_fprintf (f, "%czr", code);
7918 break;
7919 }
7920
7921 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7922 {
7923 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7924 break;
7925 }
7926
7927 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7928 {
7929 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7930 break;
7931 }
7932
7933 /* Fall through */
7934
7935 case 0:
7936 if (x == NULL)
7937 {
7938 output_operand_lossage ("missing operand");
7939 return;
7940 }
7941
7942 switch (GET_CODE (x))
7943 {
7944 case REG:
7945 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7946 {
7947 if (REG_NREGS (x) == 1)
7948 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7949 else
7950 {
7951 char suffix
7952 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7953 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7954 REGNO (x) - V0_REGNUM, suffix,
7955 END_REGNO (x) - V0_REGNUM - 1, suffix);
7956 }
7957 }
7958 else
7959 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7960 break;
7961
7962 case MEM:
7963 output_address (GET_MODE (x), XEXP (x, 0));
7964 break;
7965
7966 case LABEL_REF:
7967 case SYMBOL_REF:
7968 output_addr_const (asm_out_file, x);
7969 break;
7970
7971 case CONST_INT:
7972 asm_fprintf (f, "%wd", INTVAL (x));
7973 break;
7974
7975 case CONST:
7976 if (!VECTOR_MODE_P (GET_MODE (x)))
7977 {
7978 output_addr_const (asm_out_file, x);
7979 break;
7980 }
7981 /* fall through */
7982
7983 case CONST_VECTOR:
7984 if (!const_vec_duplicate_p (x, &elt))
7985 {
7986 output_operand_lossage ("invalid vector constant");
7987 return;
7988 }
7989
7990 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7991 asm_fprintf (f, "%wd", INTVAL (elt));
7992 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7993 && aarch64_print_vector_float_operand (f, x, false))
7994 ;
7995 else
7996 {
7997 output_operand_lossage ("invalid vector constant");
7998 return;
7999 }
8000 break;
8001
8002 case CONST_DOUBLE:
8003 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8004 be getting CONST_DOUBLEs holding integers. */
8005 gcc_assert (GET_MODE (x) != VOIDmode);
8006 if (aarch64_float_const_zero_rtx_p (x))
8007 {
8008 fputc ('0', f);
8009 break;
8010 }
8011 else if (aarch64_float_const_representable_p (x))
8012 {
8013 #define buf_size 20
8014 char float_buf[buf_size] = {'\0'};
8015 real_to_decimal_for_mode (float_buf,
8016 CONST_DOUBLE_REAL_VALUE (x),
8017 buf_size, buf_size,
8018 1, GET_MODE (x));
8019 asm_fprintf (asm_out_file, "%s", float_buf);
8020 break;
8021 #undef buf_size
8022 }
8023 output_operand_lossage ("invalid constant");
8024 return;
8025 default:
8026 output_operand_lossage ("invalid operand");
8027 return;
8028 }
8029 break;
8030
8031 case 'A':
8032 if (GET_CODE (x) == HIGH)
8033 x = XEXP (x, 0);
8034
8035 switch (aarch64_classify_symbolic_expression (x))
8036 {
8037 case SYMBOL_SMALL_GOT_4G:
8038 asm_fprintf (asm_out_file, ":got:");
8039 break;
8040
8041 case SYMBOL_SMALL_TLSGD:
8042 asm_fprintf (asm_out_file, ":tlsgd:");
8043 break;
8044
8045 case SYMBOL_SMALL_TLSDESC:
8046 asm_fprintf (asm_out_file, ":tlsdesc:");
8047 break;
8048
8049 case SYMBOL_SMALL_TLSIE:
8050 asm_fprintf (asm_out_file, ":gottprel:");
8051 break;
8052
8053 case SYMBOL_TLSLE24:
8054 asm_fprintf (asm_out_file, ":tprel:");
8055 break;
8056
8057 case SYMBOL_TINY_GOT:
8058 gcc_unreachable ();
8059 break;
8060
8061 default:
8062 break;
8063 }
8064 output_addr_const (asm_out_file, x);
8065 break;
8066
8067 case 'L':
8068 switch (aarch64_classify_symbolic_expression (x))
8069 {
8070 case SYMBOL_SMALL_GOT_4G:
8071 asm_fprintf (asm_out_file, ":lo12:");
8072 break;
8073
8074 case SYMBOL_SMALL_TLSGD:
8075 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8076 break;
8077
8078 case SYMBOL_SMALL_TLSDESC:
8079 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8080 break;
8081
8082 case SYMBOL_SMALL_TLSIE:
8083 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8084 break;
8085
8086 case SYMBOL_TLSLE12:
8087 asm_fprintf (asm_out_file, ":tprel_lo12:");
8088 break;
8089
8090 case SYMBOL_TLSLE24:
8091 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8092 break;
8093
8094 case SYMBOL_TINY_GOT:
8095 asm_fprintf (asm_out_file, ":got:");
8096 break;
8097
8098 case SYMBOL_TINY_TLSIE:
8099 asm_fprintf (asm_out_file, ":gottprel:");
8100 break;
8101
8102 default:
8103 break;
8104 }
8105 output_addr_const (asm_out_file, x);
8106 break;
8107
8108 case 'G':
8109 switch (aarch64_classify_symbolic_expression (x))
8110 {
8111 case SYMBOL_TLSLE24:
8112 asm_fprintf (asm_out_file, ":tprel_hi12:");
8113 break;
8114 default:
8115 break;
8116 }
8117 output_addr_const (asm_out_file, x);
8118 break;
8119
8120 case 'k':
8121 {
8122 HOST_WIDE_INT cond_code;
8123
8124 if (!CONST_INT_P (x))
8125 {
8126 output_operand_lossage ("invalid operand for '%%%c'", code);
8127 return;
8128 }
8129
8130 cond_code = INTVAL (x);
8131 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8132 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8133 }
8134 break;
8135
8136 case 'y':
8137 case 'z':
8138 {
8139 machine_mode mode = GET_MODE (x);
8140
8141 if (GET_CODE (x) != MEM
8142 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8143 {
8144 output_operand_lossage ("invalid operand for '%%%c'", code);
8145 return;
8146 }
8147
8148 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8149 code == 'y'
8150 ? ADDR_QUERY_LDP_STP_N
8151 : ADDR_QUERY_LDP_STP))
8152 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8153 }
8154 break;
8155
8156 default:
8157 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8158 return;
8159 }
8160 }
8161
8162 /* Print address 'x' of a memory access with mode 'mode'.
8163 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address,
8164 e.g. ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
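/* Illustrative outputs: a register-plus-immediate address prints as
   "[x0, 16]", a scaled register offset as "[x0, x1, lsl 3]", and an SVE
   vector offset as "[x0, #2, mul vl]".  */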
8165 static bool
8166 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8167 aarch64_addr_query_type type)
8168 {
8169 struct aarch64_address_info addr;
8170 unsigned int size;
8171
8172 /* Check all addresses are Pmode - including ILP32. */
8173 if (GET_MODE (x) != Pmode
8174 && (!CONST_INT_P (x)
8175 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8176 {
8177 output_operand_lossage ("invalid address mode");
8178 return false;
8179 }
8180
8181 if (aarch64_classify_address (&addr, x, mode, true, type))
8182 switch (addr.type)
8183 {
8184 case ADDRESS_REG_IMM:
8185 if (known_eq (addr.const_offset, 0))
8186 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8187 else if (aarch64_sve_data_mode_p (mode))
8188 {
8189 HOST_WIDE_INT vnum
8190 = exact_div (addr.const_offset,
8191 BYTES_PER_SVE_VECTOR).to_constant ();
8192 asm_fprintf (f, "[%s, #%wd, mul vl]",
8193 reg_names[REGNO (addr.base)], vnum);
8194 }
8195 else if (aarch64_sve_pred_mode_p (mode))
8196 {
8197 HOST_WIDE_INT vnum
8198 = exact_div (addr.const_offset,
8199 BYTES_PER_SVE_PRED).to_constant ();
8200 asm_fprintf (f, "[%s, #%wd, mul vl]",
8201 reg_names[REGNO (addr.base)], vnum);
8202 }
8203 else
8204 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8205 INTVAL (addr.offset));
8206 return true;
8207
8208 case ADDRESS_REG_REG:
8209 if (addr.shift == 0)
8210 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8211 reg_names [REGNO (addr.offset)]);
8212 else
8213 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8214 reg_names [REGNO (addr.offset)], addr.shift);
8215 return true;
8216
8217 case ADDRESS_REG_UXTW:
8218 if (addr.shift == 0)
8219 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8220 REGNO (addr.offset) - R0_REGNUM);
8221 else
8222 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8223 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8224 return true;
8225
8226 case ADDRESS_REG_SXTW:
8227 if (addr.shift == 0)
8228 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8229 REGNO (addr.offset) - R0_REGNUM);
8230 else
8231 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8232 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8233 return true;
8234
8235 case ADDRESS_REG_WB:
8236 /* Writeback is only supported for fixed-width modes. */
8237 size = GET_MODE_SIZE (mode).to_constant ();
8238 switch (GET_CODE (x))
8239 {
8240 case PRE_INC:
8241 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8242 return true;
8243 case POST_INC:
8244 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8245 return true;
8246 case PRE_DEC:
8247 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8248 return true;
8249 case POST_DEC:
8250 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8251 return true;
8252 case PRE_MODIFY:
8253 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8254 INTVAL (addr.offset));
8255 return true;
8256 case POST_MODIFY:
8257 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8258 INTVAL (addr.offset));
8259 return true;
8260 default:
8261 break;
8262 }
8263 break;
8264
8265 case ADDRESS_LO_SUM:
8266 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8267 output_addr_const (f, addr.offset);
8268 asm_fprintf (f, "]");
8269 return true;
8270
8271 case ADDRESS_SYMBOLIC:
8272 output_addr_const (f, x);
8273 return true;
8274 }
8275
8276 return false;
8277 }
8278
8279 /* Print address 'x' of a memory access with mode 'mode'. */
8280 static void
8281 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8282 {
8283 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8284 output_addr_const (f, x);
8285 }
8286
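/* Return nonzero if X mentions a label via a LABEL_REF, recursing through
   its operands but ignoring the LABEL_REFs found in UNSPEC_TLS operands,
   which are constant offsets rather than symbols.  */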
8287 bool
8288 aarch64_label_mentioned_p (rtx x)
8289 {
8290 const char *fmt;
8291 int i;
8292
8293 if (GET_CODE (x) == LABEL_REF)
8294 return true;
8295
8296 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8297 referencing instruction, but they are constant offsets, not
8298 symbols. */
8299 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8300 return false;
8301
8302 fmt = GET_RTX_FORMAT (GET_CODE (x));
8303 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8304 {
8305 if (fmt[i] == 'E')
8306 {
8307 int j;
8308
8309 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8310 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8311 return 1;
8312 }
8313 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8314 return 1;
8315 }
8316
8317 return 0;
8318 }
8319
8320 /* Implement REGNO_REG_CLASS. */
8321
8322 enum reg_class
8323 aarch64_regno_regclass (unsigned regno)
8324 {
8325 if (GP_REGNUM_P (regno))
8326 return GENERAL_REGS;
8327
8328 if (regno == SP_REGNUM)
8329 return STACK_REG;
8330
8331 if (regno == FRAME_POINTER_REGNUM
8332 || regno == ARG_POINTER_REGNUM)
8333 return POINTER_REGS;
8334
8335 if (FP_REGNUM_P (regno))
8336 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8337 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8338
8339 if (PR_REGNUM_P (regno))
8340 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8341
8342 return NO_REGS;
8343 }
8344
8345 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8346 If OFFSET is out of range, return an offset of an anchor point
8347 that is in range. Return 0 otherwise. */
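/* Illustrative example: for an SImode access at OFFSET 0x12345 (not a
   multiple of the access size), the anchor returned is
   (0x12345 + 0x100) & ~0x1ff = 0x12400, leaving a residual offset of
   -0xbb, which fits the signed 9-bit unscaled range.  */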
8348
8349 static HOST_WIDE_INT
8350 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8351 machine_mode mode)
8352 {
8353 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8354 if (size > 16)
8355 return (offset + 0x400) & ~0x7f0;
8356
8357 /* For offsets that aren't a multiple of the access size, the limit is
8358 -256...255. */
8359 if (offset & (size - 1))
8360 {
8361 /* BLKmode typically uses LDP of X-registers. */
8362 if (mode == BLKmode)
8363 return (offset + 512) & ~0x3ff;
8364 return (offset + 0x100) & ~0x1ff;
8365 }
8366
8367 /* Small negative offsets are supported. */
8368 if (IN_RANGE (offset, -256, 0))
8369 return 0;
8370
8371 if (mode == TImode || mode == TFmode)
8372 return (offset + 0x100) & ~0x1ff;
8373
8374 /* Otherwise use the 12-bit unsigned offset range, scaled by the access size. */
8375 return offset & (~0xfff * size);
8376 }
8377
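/* Implement TARGET_LEGITIMIZE_ADDRESS.  */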
8378 static rtx
8379 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8380 {
8381 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8382 where mask is selected by alignment and size of the offset.
8383 We try to pick as large a range for the offset as possible to
8384 maximize the chance of a CSE. However, for aligned addresses
8385 we limit the range to 4k so that structures with different sized
8386 elements are likely to use the same base. We need to be careful
8387 not to split a CONST for some forms of address expression, otherwise
8388 it will generate sub-optimal code. */
8389
8390 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8391 {
8392 rtx base = XEXP (x, 0);
8393 rtx offset_rtx = XEXP (x, 1);
8394 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8395
8396 if (GET_CODE (base) == PLUS)
8397 {
8398 rtx op0 = XEXP (base, 0);
8399 rtx op1 = XEXP (base, 1);
8400
8401 /* Force any scaling into a temp for CSE. */
8402 op0 = force_reg (Pmode, op0);
8403 op1 = force_reg (Pmode, op1);
8404
8405 /* Let the pointer register be in op0. */
8406 if (REG_POINTER (op1))
8407 std::swap (op0, op1);
8408
8409 /* If the pointer is virtual or frame related, then we know that
8410 virtual register instantiation or register elimination is going
8411 to apply a second constant. We want the two constants folded
8412 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8413 if (virt_or_elim_regno_p (REGNO (op0)))
8414 {
8415 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8416 NULL_RTX, true, OPTAB_DIRECT);
8417 return gen_rtx_PLUS (Pmode, base, op1);
8418 }
8419
8420 /* Otherwise, in order to encourage CSE (and thence loop strength
8421 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8422 base = expand_binop (Pmode, add_optab, op0, op1,
8423 NULL_RTX, true, OPTAB_DIRECT);
8424 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8425 }
8426
8427 HOST_WIDE_INT size;
8428 if (GET_MODE_SIZE (mode).is_constant (&size))
8429 {
8430 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8431 mode);
8432 if (base_offset != 0)
8433 {
8434 base = plus_constant (Pmode, base, base_offset);
8435 base = force_operand (base, NULL_RTX);
8436 return plus_constant (Pmode, base, offset - base_offset);
8437 }
8438 }
8439 }
8440
8441 return x;
8442 }
8443
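/* Implement TARGET_SECONDARY_RELOAD.  Return the register class required as
   an intermediate (e.g. FP_REGS for a 16-byte memory access into
   GENERAL_REGS), or NO_REGS, setting SRI->icode when a dedicated reload
   pattern is needed instead.  */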
8444 static reg_class_t
8445 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8446 reg_class_t rclass,
8447 machine_mode mode,
8448 secondary_reload_info *sri)
8449 {
8450 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8451 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8452 comment at the head of aarch64-sve.md for more details about the
8453 big-endian handling. */
8454 if (BYTES_BIG_ENDIAN
8455 && reg_class_subset_p (rclass, FP_REGS)
8456 && !((REG_P (x) && HARD_REGISTER_P (x))
8457 || aarch64_simd_valid_immediate (x, NULL))
8458 && aarch64_sve_data_mode_p (mode))
8459 {
8460 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8461 return NO_REGS;
8462 }
8463
8464 /* If we have to disable direct literal pool loads and stores because the
8465 function is too big, then we need a scratch register. */
8466 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8467 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8468 || targetm.vector_mode_supported_p (GET_MODE (x)))
8469 && !aarch64_pcrelative_literal_loads)
8470 {
8471 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8472 return NO_REGS;
8473 }
8474
8475 /* Without the TARGET_SIMD instructions we cannot move a Q register
8476 to a Q register directly. We need a scratch. */
8477 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8478 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8479 && reg_class_subset_p (rclass, FP_REGS))
8480 {
8481 sri->icode = code_for_aarch64_reload_mov (mode);
8482 return NO_REGS;
8483 }
8484
8485 /* A TFmode or TImode memory access should be handled via an FP register
8486 because AArch64 has richer addressing modes for LDR/STR instructions
8487 than for LDP/STP instructions. */
8488 if (TARGET_FLOAT && rclass == GENERAL_REGS
8489 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8490 return FP_REGS;
8491
8492 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8493 return GENERAL_REGS;
8494
8495 return NO_REGS;
8496 }
8497
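/* Implement TARGET_CAN_ELIMINATE.  */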
8498 static bool
8499 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8500 {
8501 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8502
8503 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8504 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8505 if (frame_pointer_needed)
8506 return to == HARD_FRAME_POINTER_REGNUM;
8507 return true;
8508 }
8509
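/* Return the offset between eliminable registers FROM and TO, based on the
   frame layout recorded in cfun->machine->frame (used by
   INITIAL_ELIMINATION_OFFSET).  */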
8510 poly_int64
8511 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8512 {
8513 if (to == HARD_FRAME_POINTER_REGNUM)
8514 {
8515 if (from == ARG_POINTER_REGNUM)
8516 return cfun->machine->frame.hard_fp_offset;
8517
8518 if (from == FRAME_POINTER_REGNUM)
8519 return cfun->machine->frame.hard_fp_offset
8520 - cfun->machine->frame.locals_offset;
8521 }
8522
8523 if (to == STACK_POINTER_REGNUM)
8524 {
8525 if (from == FRAME_POINTER_REGNUM)
8526 return cfun->machine->frame.frame_size
8527 - cfun->machine->frame.locals_offset;
8528 }
8529
8530 return cfun->machine->frame.frame_size;
8531 }
8532
8533 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8534 previous frame. */
8535
8536 rtx
8537 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8538 {
8539 if (count != 0)
8540 return const0_rtx;
8541 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8542 }
8543
8544
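/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Output the trampoline code:
   an optional BTI landing pad, loads of IP1 and the static chain register
   from the literal words that follow, and an indirect branch through IP1.  */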
8545 static void
8546 aarch64_asm_trampoline_template (FILE *f)
8547 {
8548 int offset1 = 16;
8549 int offset2 = 20;
8550
8551 if (aarch64_bti_enabled ())
8552 {
8553 asm_fprintf (f, "\thint\t34 // bti c\n");
8554 offset1 -= 4;
8555 offset2 -= 4;
8556 }
8557
8558 if (TARGET_ILP32)
8559 {
8560 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8561 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8562 offset1);
8563 }
8564 else
8565 {
8566 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8567 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8568 offset2);
8569 }
8570 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8571
8572 /* The trampoline needs an extra padding instruction. If BTI is
8573 enabled, the padding instruction is replaced by the BTI instruction at
8574 the beginning. */
8575 if (!aarch64_bti_enabled ())
8576 assemble_aligned_integer (4, const0_rtx);
8577
8578 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8579 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8580 }
8581
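/* Implement TARGET_TRAMPOLINE_INIT.  Copy the code template into M_TRAMP,
   store FNADDR and CHAIN_VALUE in the two trailing pointer-sized slots, then
   call __clear_cache on the whole trampoline.  */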
8582 static void
8583 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8584 {
8585 rtx fnaddr, mem, a_tramp;
8586 const int tramp_code_sz = 16;
8587
8588 /* Don't need to copy the trailing D-words, we fill those in below. */
8589 emit_block_move (m_tramp, assemble_trampoline_template (),
8590 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8591 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8592 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8593 if (GET_MODE (fnaddr) != ptr_mode)
8594 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8595 emit_move_insn (mem, fnaddr);
8596
8597 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8598 emit_move_insn (mem, chain_value);
8599
8600 /* XXX We should really define a "clear_cache" pattern and use
8601 gen_clear_cache(). */
8602 a_tramp = XEXP (m_tramp, 0);
8603 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8604 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8605 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8606 ptr_mode);
8607 }
8608
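/* Implement TARGET_CLASS_MAX_NREGS: return the maximum number of registers
   of class REGCLASS needed to hold a value of mode MODE.  */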
8609 static unsigned char
8610 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8611 {
8612 /* ??? Logically we should only need to provide a value when
8613 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8614 can hold MODE, but at the moment we need to handle all modes.
8615 Just ignore any runtime parts for registers that can't store them. */
8616 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8617 unsigned int nregs;
8618 switch (regclass)
8619 {
8620 case TAILCALL_ADDR_REGS:
8621 case POINTER_REGS:
8622 case GENERAL_REGS:
8623 case ALL_REGS:
8624 case POINTER_AND_FP_REGS:
8625 case FP_REGS:
8626 case FP_LO_REGS:
8627 case FP_LO8_REGS:
8628 if (aarch64_sve_data_mode_p (mode)
8629 && constant_multiple_p (GET_MODE_SIZE (mode),
8630 BYTES_PER_SVE_VECTOR, &nregs))
8631 return nregs;
8632 return (aarch64_vector_data_mode_p (mode)
8633 ? CEIL (lowest_size, UNITS_PER_VREG)
8634 : CEIL (lowest_size, UNITS_PER_WORD));
8635 case STACK_REG:
8636 case PR_REGS:
8637 case PR_LO_REGS:
8638 case PR_HI_REGS:
8639 return 1;
8640
8641 case NO_REGS:
8642 return 0;
8643
8644 default:
8645 break;
8646 }
8647 gcc_unreachable ();
8648 }
8649
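/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */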
8650 static reg_class_t
8651 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8652 {
8653 if (regclass == POINTER_REGS)
8654 return GENERAL_REGS;
8655
8656 if (regclass == STACK_REG)
8657 {
8658 if (REG_P(x)
8659 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8660 return regclass;
8661
8662 return NO_REGS;
8663 }
8664
8665 /* Register elimination can result in a request for
8666 SP+constant->FP_REGS. We cannot support such operations, which
8667 use SP as source and an FP_REG as destination, so reject them
8668 right away. */
8669 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8670 {
8671 rtx lhs = XEXP (x, 0);
8672
8673 /* Look through a possible SUBREG introduced by ILP32. */
8674 if (GET_CODE (lhs) == SUBREG)
8675 lhs = SUBREG_REG (lhs);
8676
8677 gcc_assert (REG_P (lhs));
8678 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8679 POINTER_REGS));
8680 return NO_REGS;
8681 }
8682
8683 return regclass;
8684 }
8685
8686 void
8687 aarch64_asm_output_labelref (FILE* f, const char *name)
8688 {
8689 asm_fprintf (f, "%U%s", name);
8690 }
8691
8692 static void
8693 aarch64_elf_asm_constructor (rtx symbol, int priority)
8694 {
8695 if (priority == DEFAULT_INIT_PRIORITY)
8696 default_ctor_section_asm_out_constructor (symbol, priority);
8697 else
8698 {
8699 section *s;
8700 /* While priority is known to be in range [0, 65535], so 18 bytes
8701 would be enough, the compiler might not know that. To avoid
8702 -Wformat-truncation false positive, use a larger size. */
8703 char buf[23];
8704 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8705 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8706 switch_to_section (s);
8707 assemble_align (POINTER_SIZE);
8708 assemble_aligned_integer (POINTER_BYTES, symbol);
8709 }
8710 }
8711
8712 static void
8713 aarch64_elf_asm_destructor (rtx symbol, int priority)
8714 {
8715 if (priority == DEFAULT_INIT_PRIORITY)
8716 default_dtor_section_asm_out_destructor (symbol, priority);
8717 else
8718 {
8719 section *s;
8720 /* While priority is known to be in range [0, 65535], so 18 bytes
8721 would be enough, the compiler might not know that. To avoid
8722 -Wformat-truncation false positive, use a larger size. */
8723 char buf[23];
8724 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8725 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8726 switch_to_section (s);
8727 assemble_align (POINTER_SIZE);
8728 assemble_aligned_integer (POINTER_BYTES, symbol);
8729 }
8730 }
8731
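/* Output the assembly for a casesi jump-table dispatch: load the table entry
   selected by the index in operand 1, form the target address relative to an
   internal label placed at the table, and branch to it.  Illustrative output
   for a 1-byte table (register numbers depend on the operands):
       ldrb    w3, [x0, w1, uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3  */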
8732 const char*
8733 aarch64_output_casesi (rtx *operands)
8734 {
8735 char buf[100];
8736 char label[100];
8737 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8738 int index;
8739 static const char *const patterns[4][2] =
8740 {
8741 {
8742 "ldrb\t%w3, [%0,%w1,uxtw]",
8743 "add\t%3, %4, %w3, sxtb #2"
8744 },
8745 {
8746 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8747 "add\t%3, %4, %w3, sxth #2"
8748 },
8749 {
8750 "ldr\t%w3, [%0,%w1,uxtw #2]",
8751 "add\t%3, %4, %w3, sxtw #2"
8752 },
8753 /* We assume that DImode is only generated when not optimizing and
8754 that we don't really need 64-bit address offsets. That would
8755 imply an object file with 8GB of code in a single function! */
8756 {
8757 "ldr\t%w3, [%0,%w1,uxtw #2]",
8758 "add\t%3, %4, %w3, sxtw #2"
8759 }
8760 };
8761
8762 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8763
8764 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8765 index = exact_log2 (GET_MODE_SIZE (mode));
8766
8767 gcc_assert (index >= 0 && index <= 3);
8768
8769 /* Need to implement table size reduction, by changing the code below. */
8770 output_asm_insn (patterns[index][0], operands);
8771 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8772 snprintf (buf, sizeof (buf),
8773 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8774 output_asm_insn (buf, operands);
8775 output_asm_insn (patterns[index][1], operands);
8776 output_asm_insn ("br\t%3", operands);
8777 assemble_label (asm_out_file, label);
8778 return "";
8779 }
8780
8781
8782 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8783 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8784 operator. */
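/* For instance, aarch64_uxt_size (2, 0x3fc) returns 8, since 0xff << 2
   == 0x3fc describes a UXTB operand shifted left by 2.  */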
8785
8786 int
8787 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8788 {
8789 if (shift >= 0 && shift <= 3)
8790 {
8791 int size;
8792 for (size = 8; size <= 32; size *= 2)
8793 {
8794 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8795 if (mask == bits << shift)
8796 return size;
8797 }
8798 }
8799 return 0;
8800 }
8801
8802 /* Constant pools are per function only when PC-relative
8803 literal loads are enabled or we are in the large memory
8804 model. */
8805
8806 static inline bool
8807 aarch64_can_use_per_function_literal_pools_p (void)
8808 {
8809 return (aarch64_pcrelative_literal_loads
8810 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8811 }
8812
8813 static bool
8814 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8815 {
8816 /* We can't use blocks for constants when we're using a per-function
8817 constant pool. */
8818 return !aarch64_can_use_per_function_literal_pools_p ();
8819 }
8820
8821 /* Select appropriate section for constants depending
8822 on where we place literal pools. */
8823
8824 static section *
8825 aarch64_select_rtx_section (machine_mode mode,
8826 rtx x,
8827 unsigned HOST_WIDE_INT align)
8828 {
8829 if (aarch64_can_use_per_function_literal_pools_p ())
8830 return function_section (current_function_decl);
8831
8832 return default_elf_select_rtx_section (mode, x, align);
8833 }
8834
8835 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8836 void
8837 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8838 HOST_WIDE_INT offset)
8839 {
8840 /* When using per-function literal pools, we must ensure that any code
8841 section is aligned to the minimal instruction length, lest we get
8842 errors from the assembler about "unaligned instructions". */
8843 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8844 ASM_OUTPUT_ALIGN (f, 2);
8845 }
8846
8847 /* Costs. */
8848
8849 /* Helper function for rtx cost calculation. Strip a shift expression
8850 from X. Returns the inner operand if successful, or the original
8851 expression on failure. */
8852 static rtx
8853 aarch64_strip_shift (rtx x)
8854 {
8855 rtx op = x;
8856
8857 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8858 we can convert both to ROR during final output. */
8859 if ((GET_CODE (op) == ASHIFT
8860 || GET_CODE (op) == ASHIFTRT
8861 || GET_CODE (op) == LSHIFTRT
8862 || GET_CODE (op) == ROTATERT
8863 || GET_CODE (op) == ROTATE)
8864 && CONST_INT_P (XEXP (op, 1)))
8865 return XEXP (op, 0);
8866
8867 if (GET_CODE (op) == MULT
8868 && CONST_INT_P (XEXP (op, 1))
8869 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8870 return XEXP (op, 0);
8871
8872 return x;
8873 }
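/* For illustration (pseudo-RTL, hypothetical operands): both
     (ashift:DI (reg:DI x0) (const_int 3))
   and its canonical multiply form
     (mult:DI (reg:DI x0) (const_int 8))
   strip down to (reg:DI x0), while a shift by a register amount is
   returned unchanged. */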
8874
8875 /* Helper function for rtx cost calculation. Strip an extend
8876 expression from X. Returns the inner operand if successful, or the
8877 original expression on failure. We deal with a number of possible
8878 canonicalization variations here. If STRIP_SHIFT is true, then
8879 we can strip off a shift also. */
8880 static rtx
8881 aarch64_strip_extend (rtx x, bool strip_shift)
8882 {
8883 scalar_int_mode mode;
8884 rtx op = x;
8885
8886 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8887 return op;
8888
8889 /* Zero and sign extraction of a widened value. */
8890 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8891 && XEXP (op, 2) == const0_rtx
8892 && GET_CODE (XEXP (op, 0)) == MULT
8893 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8894 XEXP (op, 1)))
8895 return XEXP (XEXP (op, 0), 0);
8896
8897 /* It can also be represented (for zero-extend) as an AND with an
8898 immediate. */
8899 if (GET_CODE (op) == AND
8900 && GET_CODE (XEXP (op, 0)) == MULT
8901 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8902 && CONST_INT_P (XEXP (op, 1))
8903 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8904 INTVAL (XEXP (op, 1))) != 0)
8905 return XEXP (XEXP (op, 0), 0);
8906
8907 /* Now handle extended register, as this may also have an optional
8908 left shift by 1..4. */
8909 if (strip_shift
8910 && GET_CODE (op) == ASHIFT
8911 && CONST_INT_P (XEXP (op, 1))
8912 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8913 op = XEXP (op, 0);
8914
8915 if (GET_CODE (op) == ZERO_EXTEND
8916 || GET_CODE (op) == SIGN_EXTEND)
8917 op = XEXP (op, 0);
8918
8919 if (op != x)
8920 return op;
8921
8922 return x;
8923 }
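/* A sketch of the extended-register case (pseudo-RTL, hypothetical
   operands): with STRIP_SHIFT true,
     (ashift:DI (zero_extend:DI (reg:SI w1)) (const_int 2))
   strips down to (reg:SI w1), matching the "extended register, LSL #2"
   operand form of ADD/SUB; with STRIP_SHIFT false the shift is kept
   and the expression is returned unchanged. */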
8924
8925 /* Return true iff CODE is a shift supported in combination
8926 with arithmetic instructions. */
8927
8928 static bool
8929 aarch64_shift_p (enum rtx_code code)
8930 {
8931 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8932 }
8933
8934
8935 /* Return true iff X is a cheap shift without a sign extend. */
8936
8937 static bool
8938 aarch64_cheap_mult_shift_p (rtx x)
8939 {
8940 rtx op0, op1;
8941
8942 op0 = XEXP (x, 0);
8943 op1 = XEXP (x, 1);
8944
8945 if (!(aarch64_tune_params.extra_tuning_flags
8946 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8947 return false;
8948
8949 if (GET_CODE (op0) == SIGN_EXTEND)
8950 return false;
8951
8952 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8953 && UINTVAL (op1) <= 4)
8954 return true;
8955
8956 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8957 return false;
8958
8959 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8960
8961 if (l2 > 0 && l2 <= 4)
8962 return true;
8963
8964 return false;
8965 }
8966
8967 /* Helper function for rtx cost calculation. Calculate the cost of
8968 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8969 Return the calculated cost of the expression, recursing manually in to
8970 operands where needed. */
8971
8972 static int
8973 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8974 {
8975 rtx op0, op1;
8976 const struct cpu_cost_table *extra_cost
8977 = aarch64_tune_params.insn_extra_cost;
8978 int cost = 0;
8979 bool compound_p = (outer == PLUS || outer == MINUS);
8980 machine_mode mode = GET_MODE (x);
8981
8982 gcc_checking_assert (code == MULT);
8983
8984 op0 = XEXP (x, 0);
8985 op1 = XEXP (x, 1);
8986
8987 if (VECTOR_MODE_P (mode))
8988 mode = GET_MODE_INNER (mode);
8989
8990 /* Integer multiply/fma. */
8991 if (GET_MODE_CLASS (mode) == MODE_INT)
8992 {
8993 /* The multiply will be canonicalized as a shift, cost it as such. */
8994 if (aarch64_shift_p (GET_CODE (x))
8995 || (CONST_INT_P (op1)
8996 && exact_log2 (INTVAL (op1)) > 0))
8997 {
8998 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8999 || GET_CODE (op0) == SIGN_EXTEND;
9000 if (speed)
9001 {
9002 if (compound_p)
9003 {
9004 /* If the shift is considered cheap,
9005 then don't add any cost. */
9006 if (aarch64_cheap_mult_shift_p (x))
9007 ;
9008 else if (REG_P (op1))
9009 /* ARITH + shift-by-register. */
9010 cost += extra_cost->alu.arith_shift_reg;
9011 else if (is_extend)
9012 /* ARITH + extended register. We don't have a cost field
9013 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9014 cost += extra_cost->alu.extend_arith;
9015 else
9016 /* ARITH + shift-by-immediate. */
9017 cost += extra_cost->alu.arith_shift;
9018 }
9019 else
9020 /* LSL (immediate). */
9021 cost += extra_cost->alu.shift;
9022
9023 }
9024 /* Strip extends as we will have costed them in the case above. */
9025 if (is_extend)
9026 op0 = aarch64_strip_extend (op0, true);
9027
9028 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9029
9030 return cost;
9031 }
9032
9033 /* MNEG or [US]MNEGL. Extract the NEG operand, mark the operation as
9034 a compound and let the cases below handle it. After all, MNEG is a
9035 special-case alias of MSUB. */
9036 if (GET_CODE (op0) == NEG)
9037 {
9038 op0 = XEXP (op0, 0);
9039 compound_p = true;
9040 }
9041
9042 /* Integer multiplies or FMAs have zero/sign extending variants. */
9043 if ((GET_CODE (op0) == ZERO_EXTEND
9044 && GET_CODE (op1) == ZERO_EXTEND)
9045 || (GET_CODE (op0) == SIGN_EXTEND
9046 && GET_CODE (op1) == SIGN_EXTEND))
9047 {
9048 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9049 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9050
9051 if (speed)
9052 {
9053 if (compound_p)
9054 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9055 cost += extra_cost->mult[0].extend_add;
9056 else
9057 /* MUL/SMULL/UMULL. */
9058 cost += extra_cost->mult[0].extend;
9059 }
9060
9061 return cost;
9062 }
9063
9064 /* This is either an integer multiply or a MADD. In both cases
9065 we want to recurse and cost the operands. */
9066 cost += rtx_cost (op0, mode, MULT, 0, speed);
9067 cost += rtx_cost (op1, mode, MULT, 1, speed);
9068
9069 if (speed)
9070 {
9071 if (compound_p)
9072 /* MADD/MSUB. */
9073 cost += extra_cost->mult[mode == DImode].add;
9074 else
9075 /* MUL. */
9076 cost += extra_cost->mult[mode == DImode].simple;
9077 }
9078
9079 return cost;
9080 }
9081 else
9082 {
9083 if (speed)
9084 {
9085 /* Floating-point FMA/FMUL can also support negations of the
9086 operands, unless the rounding mode is upward or downward in
9087 which case FNMUL is different from FMUL with operand negation. */
9088 bool neg0 = GET_CODE (op0) == NEG;
9089 bool neg1 = GET_CODE (op1) == NEG;
9090 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9091 {
9092 if (neg0)
9093 op0 = XEXP (op0, 0);
9094 if (neg1)
9095 op1 = XEXP (op1, 0);
9096 }
9097
9098 if (compound_p)
9099 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9100 cost += extra_cost->fp[mode == DFmode].fma;
9101 else
9102 /* FMUL/FNMUL. */
9103 cost += extra_cost->fp[mode == DFmode].mult;
9104 }
9105
9106 cost += rtx_cost (op0, mode, MULT, 0, speed);
9107 cost += rtx_cost (op1, mode, MULT, 1, speed);
9108 return cost;
9109 }
9110 }
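/* A worked example of the integer path above (field names refer to the
   tuning's cost table, so the actual numbers depend on the selected
   CPU): for (plus (mult (reg) (const_int 4)) (reg)) the MULT is treated
   as a shift by two inside a compound PLUS, so when optimizing for
   speed it is charged extra_cost->alu.arith_shift (costed like
   ADD Rd, Rn, Rm, LSL #2) plus the cost of the unshifted operand,
   assuming the tuning does not mark such shifts as free via
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND. */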
9111
9112 static int
9113 aarch64_address_cost (rtx x,
9114 machine_mode mode,
9115 addr_space_t as ATTRIBUTE_UNUSED,
9116 bool speed)
9117 {
9118 enum rtx_code c = GET_CODE (x);
9119 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9120 struct aarch64_address_info info;
9121 int cost = 0;
9122 info.shift = 0;
9123
9124 if (!aarch64_classify_address (&info, x, mode, false))
9125 {
9126 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9127 {
9128 /* This is a CONST or SYMBOL ref which will be split
9129 in a different way depending on the code model in use.
9130 Cost it through the generic infrastructure. */
9131 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9132 /* Divide through by the cost of one instruction to
9133 bring it to the same units as the address costs. */
9134 cost_symbol_ref /= COSTS_N_INSNS (1);
9135 /* The cost is then the cost of preparing the address,
9136 followed by an immediate (possibly 0) offset. */
9137 return cost_symbol_ref + addr_cost->imm_offset;
9138 }
9139 else
9140 {
9141 /* This is most likely a jump table from a case
9142 statement. */
9143 return addr_cost->register_offset;
9144 }
9145 }
9146
9147 switch (info.type)
9148 {
9149 case ADDRESS_LO_SUM:
9150 case ADDRESS_SYMBOLIC:
9151 case ADDRESS_REG_IMM:
9152 cost += addr_cost->imm_offset;
9153 break;
9154
9155 case ADDRESS_REG_WB:
9156 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9157 cost += addr_cost->pre_modify;
9158 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9159 cost += addr_cost->post_modify;
9160 else
9161 gcc_unreachable ();
9162
9163 break;
9164
9165 case ADDRESS_REG_REG:
9166 cost += addr_cost->register_offset;
9167 break;
9168
9169 case ADDRESS_REG_SXTW:
9170 cost += addr_cost->register_sextend;
9171 break;
9172
9173 case ADDRESS_REG_UXTW:
9174 cost += addr_cost->register_zextend;
9175 break;
9176
9177 default:
9178 gcc_unreachable ();
9179 }
9180
9181
9182 if (info.shift > 0)
9183 {
9184 /* For the sake of calculating the cost of the shifted register
9185 component, we can treat same sized modes in the same way. */
9186 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9187 cost += addr_cost->addr_scale_costs.hi;
9188 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9189 cost += addr_cost->addr_scale_costs.si;
9190 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9191 cost += addr_cost->addr_scale_costs.di;
9192 else
9193 /* We can't tell, or this is a 128-bit vector. */
9194 cost += addr_cost->addr_scale_costs.ti;
9195 }
9196
9197 return cost;
9198 }
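/* Two illustrative cases (hypothetical addresses; the numbers come from
   the tuning's addr_cost table): a register plus small immediate such
   as (plus (reg:DI) (const_int 16)) classifies as ADDRESS_REG_IMM and
   costs imm_offset, while a scaled index such as
   (plus (mult (reg:DI) (const_int 4)) (reg:DI)) used for an SImode
   access classifies as ADDRESS_REG_REG with a shift of two and costs
   register_offset + addr_scale_costs.si. */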
9199
9200 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9201 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9202 to be taken. */
9203
9204 int
9205 aarch64_branch_cost (bool speed_p, bool predictable_p)
9206 {
9207 /* When optimizing for speed, use the cost of unpredictable branches. */
9208 const struct cpu_branch_cost *branch_costs =
9209 aarch64_tune_params.branch_costs;
9210
9211 if (!speed_p || predictable_p)
9212 return branch_costs->predictable;
9213 else
9214 return branch_costs->unpredictable;
9215 }
9216
9217 /* Return true if the RTX X in mode MODE is a zero or sign extract
9218 usable in an ADD or SUB (extended register) instruction. */
9219 static bool
9220 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9221 {
9222 /* Catch add with a sign extract.
9223 This is add_<optab><mode>_multp2. */
9224 if (GET_CODE (x) == SIGN_EXTRACT
9225 || GET_CODE (x) == ZERO_EXTRACT)
9226 {
9227 rtx op0 = XEXP (x, 0);
9228 rtx op1 = XEXP (x, 1);
9229 rtx op2 = XEXP (x, 2);
9230
9231 if (GET_CODE (op0) == MULT
9232 && CONST_INT_P (op1)
9233 && op2 == const0_rtx
9234 && CONST_INT_P (XEXP (op0, 1))
9235 && aarch64_is_extend_from_extract (mode,
9236 XEXP (op0, 1),
9237 op1))
9238 {
9239 return true;
9240 }
9241 }
9242 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9243 No shift. */
9244 else if (GET_CODE (x) == SIGN_EXTEND
9245 || GET_CODE (x) == ZERO_EXTEND)
9246 return REG_P (XEXP (x, 0));
9247
9248 return false;
9249 }
9250
9251 static bool
9252 aarch64_frint_unspec_p (unsigned int u)
9253 {
9254 switch (u)
9255 {
9256 case UNSPEC_FRINTZ:
9257 case UNSPEC_FRINTP:
9258 case UNSPEC_FRINTM:
9259 case UNSPEC_FRINTA:
9260 case UNSPEC_FRINTN:
9261 case UNSPEC_FRINTX:
9262 case UNSPEC_FRINTI:
9263 return true;
9264
9265 default:
9266 return false;
9267 }
9268 }
9269
9270 /* Return true iff X is an rtx that will match an extr instruction
9271 i.e. as described in the *extr<mode>5_insn family of patterns.
9272 OP0 and OP1 will be set to the operands of the shifts involved
9273 on success and will be NULL_RTX otherwise. */
9274
9275 static bool
9276 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9277 {
9278 rtx op0, op1;
9279 scalar_int_mode mode;
9280 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9281 return false;
9282
9283 *res_op0 = NULL_RTX;
9284 *res_op1 = NULL_RTX;
9285
9286 if (GET_CODE (x) != IOR)
9287 return false;
9288
9289 op0 = XEXP (x, 0);
9290 op1 = XEXP (x, 1);
9291
9292 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9293 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9294 {
9295 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9296 if (GET_CODE (op1) == ASHIFT)
9297 std::swap (op0, op1);
9298
9299 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9300 return false;
9301
9302 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9303 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9304
9305 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9306 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9307 {
9308 *res_op0 = XEXP (op0, 0);
9309 *res_op1 = XEXP (op1, 0);
9310 return true;
9311 }
9312 }
9313
9314 return false;
9315 }
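/* For example (pseudo-RTL, hypothetical registers): in DImode
     (ior (ashift (reg x0) (const_int 48))
	  (lshiftrt (reg x1) (const_int 16)))
   matches because 48 + 16 == 64; it is equivalent to
   EXTR Xd, X0, X1, #16, and *RES_OP0 and *RES_OP1 are set to x0 and
   x1 respectively. */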
9316
9317 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9318 storing it in *COST. Result is true if the total cost of the operation
9319 has now been calculated. */
9320 static bool
9321 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9322 {
9323 rtx inner;
9324 rtx comparator;
9325 enum rtx_code cmpcode;
9326
9327 if (COMPARISON_P (op0))
9328 {
9329 inner = XEXP (op0, 0);
9330 comparator = XEXP (op0, 1);
9331 cmpcode = GET_CODE (op0);
9332 }
9333 else
9334 {
9335 inner = op0;
9336 comparator = const0_rtx;
9337 cmpcode = NE;
9338 }
9339
9340 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9341 {
9342 /* Conditional branch. */
9343 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9344 return true;
9345 else
9346 {
9347 if (cmpcode == NE || cmpcode == EQ)
9348 {
9349 if (comparator == const0_rtx)
9350 {
9351 /* TBZ/TBNZ/CBZ/CBNZ. */
9352 if (GET_CODE (inner) == ZERO_EXTRACT)
9353 /* TBZ/TBNZ. */
9354 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9355 ZERO_EXTRACT, 0, speed);
9356 else
9357 /* CBZ/CBNZ. */
9358 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9359
9360 return true;
9361 }
9362 }
9363 else if (cmpcode == LT || cmpcode == GE)
9364 {
9365 /* TBZ/TBNZ. */
9366 if (comparator == const0_rtx)
9367 return true;
9368 }
9369 }
9370 }
9371 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9372 {
9373 /* CCMP. */
9374 if (GET_CODE (op1) == COMPARE)
9375 {
9376 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9377 if (XEXP (op1, 1) == const0_rtx)
9378 *cost += 1;
9379 if (speed)
9380 {
9381 machine_mode mode = GET_MODE (XEXP (op1, 0));
9382 const struct cpu_cost_table *extra_cost
9383 = aarch64_tune_params.insn_extra_cost;
9384
9385 if (GET_MODE_CLASS (mode) == MODE_INT)
9386 *cost += extra_cost->alu.arith;
9387 else
9388 *cost += extra_cost->fp[mode == DFmode].compare;
9389 }
9390 return true;
9391 }
9392
9393 /* It's a conditional operation based on the status flags,
9394 so it must be some flavor of CSEL. */
9395
9396 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9397 if (GET_CODE (op1) == NEG
9398 || GET_CODE (op1) == NOT
9399 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9400 op1 = XEXP (op1, 0);
9401 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9402 {
9403 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9404 op1 = XEXP (op1, 0);
9405 op2 = XEXP (op2, 0);
9406 }
9407
9408 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9409 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9410 return true;
9411 }
9412
9413 /* We don't know what this is, cost all operands. */
9414 return false;
9415 }
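/* Illustrative cases (pseudo-RTL): a branch such as
     (if_then_else (eq (zero_extract (reg) (const_int 1) (const_int 5))
		       (const_int 0))
		   (label_ref ...) (pc))
   is costed as a TBZ, so only the register inside the ZERO_EXTRACT is
   recursed into; for a conditional select on the CC flags whose first
   arm is (neg (reg)), (not (reg)) or (plus (reg) (const_int 1)), the
   wrapper is stripped because CSNEG/CSINV/CSINC cost the same as a
   plain CSEL. */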
9416
9417 /* Check whether X is a bitfield operation of the form shift + extend that
9418 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9419 operand to which the bitfield operation is applied. Otherwise return
9420 NULL_RTX. */
9421
9422 static rtx
9423 aarch64_extend_bitfield_pattern_p (rtx x)
9424 {
9425 rtx_code outer_code = GET_CODE (x);
9426 machine_mode outer_mode = GET_MODE (x);
9427
9428 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9429 && outer_mode != SImode && outer_mode != DImode)
9430 return NULL_RTX;
9431
9432 rtx inner = XEXP (x, 0);
9433 rtx_code inner_code = GET_CODE (inner);
9434 machine_mode inner_mode = GET_MODE (inner);
9435 rtx op = NULL_RTX;
9436
9437 switch (inner_code)
9438 {
9439 case ASHIFT:
9440 if (CONST_INT_P (XEXP (inner, 1))
9441 && (inner_mode == QImode || inner_mode == HImode))
9442 op = XEXP (inner, 0);
9443 break;
9444 case LSHIFTRT:
9445 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9446 && (inner_mode == QImode || inner_mode == HImode))
9447 op = XEXP (inner, 0);
9448 break;
9449 case ASHIFTRT:
9450 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9451 && (inner_mode == QImode || inner_mode == HImode))
9452 op = XEXP (inner, 0);
9453 break;
9454 default:
9455 break;
9456 }
9457
9458 return op;
9459 }
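/* For example (pseudo-RTL):
     (zero_extend:SI (ashift:HI (reg:HI) (const_int 3)))
   returns the inner register, since the whole expression can be a
   single UBFIZ Wd, Ws, #3, #13, and
     (sign_extend:SI (ashiftrt:QI (reg:QI) (const_int 2)))
   maps onto SBFX. If the inner shift is already in SImode or DImode,
   NULL_RTX is returned and the extend is costed separately. */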
9460
9461 /* Return true if the mask and a shift amount from an RTX of the form
9462 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9463 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9464
9465 bool
9466 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9467 rtx shft_amnt)
9468 {
9469 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9470 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9471 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9472 && (INTVAL (mask)
9473 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9474 }
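/* Worked example (SImode, illustrative values): mask = 0xff00 with
   shft_amnt = 8 is accepted, since (0xff00 >> 8) + 1 == 0x100 is a
   power of two and no mask bit lies below bit 8; (x << 8) & 0xff00 is
   then a single UBFIZ Wd, Ws, #8, #8. mask = 0xff80 with the same
   shift is rejected because bit 7 of the mask lies below the shift
   amount. */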
9475
9476 /* Return true if the masks and a shift amount from an RTX of the form
9477 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9478 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9479
9480 bool
9481 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9482 unsigned HOST_WIDE_INT mask1,
9483 unsigned HOST_WIDE_INT shft_amnt,
9484 unsigned HOST_WIDE_INT mask2)
9485 {
9486 unsigned HOST_WIDE_INT t;
9487
9488 /* Verify that there is no overlap in what bits are set in the two masks. */
9489 if (mask1 != ~mask2)
9490 return false;
9491
9492 /* Verify that mask2 is not all zeros or ones. */
9493 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9494 return false;
9495
9496 /* The shift amount should always be less than the mode size. */
9497 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9498
9499 /* Verify that the mask being shifted is contiguous and would be in the
9500 least significant bits after shifting by shft_amnt. */
9501 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9502 return (t == (t & -t));
9503 }
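/* Worked example (DImode, illustrative values): mask2 = 0xff00,
   shft_amnt = 8 and mask1 = ~0xff00 pass all three checks, and
   mask2 + (1 << 8) == 0x10000 is a power of two, so the masks describe
   a contiguous 8-bit field at bit 8 and the combination matches
   BFI Xd, Xn, #8, #8. A non-contiguous mask2 such as 0xf0f0 fails the
   final power-of-two test. */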
9504
9505 /* Calculate the cost of calculating X, storing it in *COST. Result
9506 is true if the total cost of the operation has now been calculated. */
9507 static bool
9508 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9509 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9510 {
9511 rtx op0, op1, op2;
9512 const struct cpu_cost_table *extra_cost
9513 = aarch64_tune_params.insn_extra_cost;
9514 int code = GET_CODE (x);
9515 scalar_int_mode int_mode;
9516
9517 /* By default, assume that everything has equivalent cost to the
9518 cheapest instruction. Any additional costs are applied as a delta
9519 above this default. */
9520 *cost = COSTS_N_INSNS (1);
9521
9522 switch (code)
9523 {
9524 case SET:
9525 /* The cost depends entirely on the operands to SET. */
9526 *cost = 0;
9527 op0 = SET_DEST (x);
9528 op1 = SET_SRC (x);
9529
9530 switch (GET_CODE (op0))
9531 {
9532 case MEM:
9533 if (speed)
9534 {
9535 rtx address = XEXP (op0, 0);
9536 if (VECTOR_MODE_P (mode))
9537 *cost += extra_cost->ldst.storev;
9538 else if (GET_MODE_CLASS (mode) == MODE_INT)
9539 *cost += extra_cost->ldst.store;
9540 else if (mode == SFmode)
9541 *cost += extra_cost->ldst.storef;
9542 else if (mode == DFmode)
9543 *cost += extra_cost->ldst.stored;
9544
9545 *cost +=
9546 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9547 0, speed));
9548 }
9549
9550 *cost += rtx_cost (op1, mode, SET, 1, speed);
9551 return true;
9552
9553 case SUBREG:
9554 if (! REG_P (SUBREG_REG (op0)))
9555 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9556
9557 /* Fall through. */
9558 case REG:
9559 /* The cost is one per vector-register copied. */
9560 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9561 {
9562 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9563 *cost = COSTS_N_INSNS (nregs);
9564 }
9565 /* const0_rtx is in general free, but we will use an
9566 instruction to set a register to 0. */
9567 else if (REG_P (op1) || op1 == const0_rtx)
9568 {
9569 /* The cost is 1 per register copied. */
9570 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9571 *cost = COSTS_N_INSNS (nregs);
9572 }
9573 else
9574 /* Cost is just the cost of the RHS of the set. */
9575 *cost += rtx_cost (op1, mode, SET, 1, speed);
9576 return true;
9577
9578 case ZERO_EXTRACT:
9579 case SIGN_EXTRACT:
9580 /* Bit-field insertion. Strip any redundant widening of
9581 the RHS to meet the width of the target. */
9582 if (GET_CODE (op1) == SUBREG)
9583 op1 = SUBREG_REG (op1);
9584 if ((GET_CODE (op1) == ZERO_EXTEND
9585 || GET_CODE (op1) == SIGN_EXTEND)
9586 && CONST_INT_P (XEXP (op0, 1))
9587 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9588 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9589 op1 = XEXP (op1, 0);
9590
9591 if (CONST_INT_P (op1))
9592 {
9593 /* MOV immediate is assumed to always be cheap. */
9594 *cost = COSTS_N_INSNS (1);
9595 }
9596 else
9597 {
9598 /* BFM. */
9599 if (speed)
9600 *cost += extra_cost->alu.bfi;
9601 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9602 }
9603
9604 return true;
9605
9606 default:
9607 /* We can't make sense of this, assume default cost. */
9608 *cost = COSTS_N_INSNS (1);
9609 return false;
9610 }
9611 return false;
9612
9613 case CONST_INT:
9614 /* If an instruction can incorporate a constant within the
9615 instruction, the instruction's expression avoids calling
9616 rtx_cost() on the constant. If rtx_cost() is called on a
9617 constant, then it is usually because the constant must be
9618 moved into a register by one or more instructions.
9619
9620 The exception is constant 0, which can be expressed
9621 as XZR/WZR and is therefore free. The exception to this is
9622 if we have (set (reg) (const0_rtx)) in which case we must cost
9623 the move. However, we can catch that when we cost the SET, so
9624 we don't need to consider that here. */
9625 if (x == const0_rtx)
9626 *cost = 0;
9627 else
9628 {
9629 /* To an approximation, building any other constant is
9630 proportionally expensive to the number of instructions
9631 required to build that constant. This is true whether we
9632 are compiling for SPEED or otherwise. */
9633 if (!is_a <scalar_int_mode> (mode, &int_mode))
9634 int_mode = word_mode;
9635 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9636 (NULL_RTX, x, false, int_mode));
9637 }
9638 return true;
9639
9640 case CONST_DOUBLE:
9641
9642 /* First determine number of instructions to do the move
9643 as an integer constant. */
9644 if (!aarch64_float_const_representable_p (x)
9645 && !aarch64_can_const_movi_rtx_p (x, mode)
9646 && aarch64_float_const_rtx_p (x))
9647 {
9648 unsigned HOST_WIDE_INT ival;
9649 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9650 gcc_assert (succeed);
9651
9652 scalar_int_mode imode = (mode == HFmode
9653 ? SImode
9654 : int_mode_for_mode (mode).require ());
9655 int ncost = aarch64_internal_mov_immediate
9656 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9657 *cost += COSTS_N_INSNS (ncost);
9658 return true;
9659 }
9660
9661 if (speed)
9662 {
9663 /* mov[df,sf]_aarch64. */
9664 if (aarch64_float_const_representable_p (x))
9665 /* FMOV (scalar immediate). */
9666 *cost += extra_cost->fp[mode == DFmode].fpconst;
9667 else if (!aarch64_float_const_zero_rtx_p (x))
9668 {
9669 /* This will be a load from memory. */
9670 if (mode == DFmode)
9671 *cost += extra_cost->ldst.loadd;
9672 else
9673 *cost += extra_cost->ldst.loadf;
9674 }
9675 else
9676 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9677 or MOV v0.s[0], wzr - neither of which are modeled by the
9678 cost tables. Just use the default cost. */
9679 {
9680 }
9681 }
9682
9683 return true;
9684
9685 case MEM:
9686 if (speed)
9687 {
9688 /* For loads we want the base cost of a load, plus an
9689 approximation for the additional cost of the addressing
9690 mode. */
9691 rtx address = XEXP (x, 0);
9692 if (VECTOR_MODE_P (mode))
9693 *cost += extra_cost->ldst.loadv;
9694 else if (GET_MODE_CLASS (mode) == MODE_INT)
9695 *cost += extra_cost->ldst.load;
9696 else if (mode == SFmode)
9697 *cost += extra_cost->ldst.loadf;
9698 else if (mode == DFmode)
9699 *cost += extra_cost->ldst.loadd;
9700
9701 *cost +=
9702 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9703 0, speed));
9704 }
9705
9706 return true;
9707
9708 case NEG:
9709 op0 = XEXP (x, 0);
9710
9711 if (VECTOR_MODE_P (mode))
9712 {
9713 if (speed)
9714 {
9715 /* FNEG. */
9716 *cost += extra_cost->vect.alu;
9717 }
9718 return false;
9719 }
9720
9721 if (GET_MODE_CLASS (mode) == MODE_INT)
9722 {
9723 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9724 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9725 {
9726 /* CSETM. */
9727 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9728 return true;
9729 }
9730
9731 /* Cost this as SUB wzr, X. */
9732 op0 = CONST0_RTX (mode);
9733 op1 = XEXP (x, 0);
9734 goto cost_minus;
9735 }
9736
9737 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9738 {
9739 /* Support (neg(fma...)) as a single instruction only if
9740 sign of zeros is unimportant. This matches the decision
9741 making in aarch64.md. */
9742 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9743 {
9744 /* FNMADD. */
9745 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9746 return true;
9747 }
9748 if (GET_CODE (op0) == MULT)
9749 {
9750 /* FNMUL. */
9751 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9752 return true;
9753 }
9754 if (speed)
9755 /* FNEG. */
9756 *cost += extra_cost->fp[mode == DFmode].neg;
9757 return false;
9758 }
9759
9760 return false;
9761
9762 case CLRSB:
9763 case CLZ:
9764 if (speed)
9765 {
9766 if (VECTOR_MODE_P (mode))
9767 *cost += extra_cost->vect.alu;
9768 else
9769 *cost += extra_cost->alu.clz;
9770 }
9771
9772 return false;
9773
9774 case COMPARE:
9775 op0 = XEXP (x, 0);
9776 op1 = XEXP (x, 1);
9777
9778 if (op1 == const0_rtx
9779 && GET_CODE (op0) == AND)
9780 {
9781 x = op0;
9782 mode = GET_MODE (op0);
9783 goto cost_logic;
9784 }
9785
9786 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9787 {
9788 /* TODO: A write to the CC flags possibly costs extra, this
9789 needs encoding in the cost tables. */
9790
9791 mode = GET_MODE (op0);
9792 /* ANDS. */
9793 if (GET_CODE (op0) == AND)
9794 {
9795 x = op0;
9796 goto cost_logic;
9797 }
9798
9799 if (GET_CODE (op0) == PLUS)
9800 {
9801 /* ADDS (and CMN alias). */
9802 x = op0;
9803 goto cost_plus;
9804 }
9805
9806 if (GET_CODE (op0) == MINUS)
9807 {
9808 /* SUBS. */
9809 x = op0;
9810 goto cost_minus;
9811 }
9812
9813 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9814 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9815 && CONST_INT_P (XEXP (op0, 2)))
9816 {
9817 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9818 Handle it here directly rather than going to cost_logic
9819 since we know the immediate generated for the TST is valid
9820 so we can avoid creating an intermediate rtx for it only
9821 for costing purposes. */
9822 if (speed)
9823 *cost += extra_cost->alu.logical;
9824
9825 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9826 ZERO_EXTRACT, 0, speed);
9827 return true;
9828 }
9829
9830 if (GET_CODE (op1) == NEG)
9831 {
9832 /* CMN. */
9833 if (speed)
9834 *cost += extra_cost->alu.arith;
9835
9836 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9837 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9838 return true;
9839 }
9840
9841 /* CMP.
9842
9843 Compare can freely swap the order of operands, and
9844 canonicalization puts the more complex operation first.
9845 But the integer MINUS logic expects the shift/extend
9846 operation in op1. */
9847 if (! (REG_P (op0)
9848 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9849 {
9850 op0 = XEXP (x, 1);
9851 op1 = XEXP (x, 0);
9852 }
9853 goto cost_minus;
9854 }
9855
9856 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9857 {
9858 /* FCMP. */
9859 if (speed)
9860 *cost += extra_cost->fp[mode == DFmode].compare;
9861
9862 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9863 {
9864 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9865 /* FCMP supports constant 0.0 for no extra cost. */
9866 return true;
9867 }
9868 return false;
9869 }
9870
9871 if (VECTOR_MODE_P (mode))
9872 {
9873 /* Vector compare. */
9874 if (speed)
9875 *cost += extra_cost->vect.alu;
9876
9877 if (aarch64_float_const_zero_rtx_p (op1))
9878 {
9879 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9880 cost. */
9881 return true;
9882 }
9883 return false;
9884 }
9885 return false;
9886
9887 case MINUS:
9888 {
9889 op0 = XEXP (x, 0);
9890 op1 = XEXP (x, 1);
9891
9892 cost_minus:
9893 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9894
9895 /* Detect valid immediates. */
9896 if ((GET_MODE_CLASS (mode) == MODE_INT
9897 || (GET_MODE_CLASS (mode) == MODE_CC
9898 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9899 && CONST_INT_P (op1)
9900 && aarch64_uimm12_shift (INTVAL (op1)))
9901 {
9902 if (speed)
9903 /* SUB(S) (immediate). */
9904 *cost += extra_cost->alu.arith;
9905 return true;
9906 }
9907
9908 /* Look for SUB (extended register). */
9909 if (is_a <scalar_int_mode> (mode, &int_mode)
9910 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9911 {
9912 if (speed)
9913 *cost += extra_cost->alu.extend_arith;
9914
9915 op1 = aarch64_strip_extend (op1, true);
9916 *cost += rtx_cost (op1, VOIDmode,
9917 (enum rtx_code) GET_CODE (op1), 0, speed);
9918 return true;
9919 }
9920
9921 rtx new_op1 = aarch64_strip_extend (op1, false);
9922
9923 /* Cost this as an FMA-alike operation. */
9924 if ((GET_CODE (new_op1) == MULT
9925 || aarch64_shift_p (GET_CODE (new_op1)))
9926 && code != COMPARE)
9927 {
9928 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9929 (enum rtx_code) code,
9930 speed);
9931 return true;
9932 }
9933
9934 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9935
9936 if (speed)
9937 {
9938 if (VECTOR_MODE_P (mode))
9939 {
9940 /* Vector SUB. */
9941 *cost += extra_cost->vect.alu;
9942 }
9943 else if (GET_MODE_CLASS (mode) == MODE_INT)
9944 {
9945 /* SUB(S). */
9946 *cost += extra_cost->alu.arith;
9947 }
9948 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9949 {
9950 /* FSUB. */
9951 *cost += extra_cost->fp[mode == DFmode].addsub;
9952 }
9953 }
9954 return true;
9955 }
9956
9957 case PLUS:
9958 {
9959 rtx new_op0;
9960
9961 op0 = XEXP (x, 0);
9962 op1 = XEXP (x, 1);
9963
9964 cost_plus:
9965 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9966 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9967 {
9968 /* CSINC. */
9969 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9970 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9971 return true;
9972 }
9973
9974 if (GET_MODE_CLASS (mode) == MODE_INT
9975 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9976 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9977 {
9978 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9979
9980 if (speed)
9981 /* ADD (immediate). */
9982 *cost += extra_cost->alu.arith;
9983 return true;
9984 }
9985
9986 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9987
9988 /* Look for ADD (extended register). */
9989 if (is_a <scalar_int_mode> (mode, &int_mode)
9990 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9991 {
9992 if (speed)
9993 *cost += extra_cost->alu.extend_arith;
9994
9995 op0 = aarch64_strip_extend (op0, true);
9996 *cost += rtx_cost (op0, VOIDmode,
9997 (enum rtx_code) GET_CODE (op0), 0, speed);
9998 return true;
9999 }
10000
10001 /* Strip any extend, leave shifts behind as we will
10002 cost them through mult_cost. */
10003 new_op0 = aarch64_strip_extend (op0, false);
10004
10005 if (GET_CODE (new_op0) == MULT
10006 || aarch64_shift_p (GET_CODE (new_op0)))
10007 {
10008 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10009 speed);
10010 return true;
10011 }
10012
10013 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10014
10015 if (speed)
10016 {
10017 if (VECTOR_MODE_P (mode))
10018 {
10019 /* Vector ADD. */
10020 *cost += extra_cost->vect.alu;
10021 }
10022 else if (GET_MODE_CLASS (mode) == MODE_INT)
10023 {
10024 /* ADD. */
10025 *cost += extra_cost->alu.arith;
10026 }
10027 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10028 {
10029 /* FADD. */
10030 *cost += extra_cost->fp[mode == DFmode].addsub;
10031 }
10032 }
10033 return true;
10034 }
10035
10036 case BSWAP:
10037 *cost = COSTS_N_INSNS (1);
10038
10039 if (speed)
10040 {
10041 if (VECTOR_MODE_P (mode))
10042 *cost += extra_cost->vect.alu;
10043 else
10044 *cost += extra_cost->alu.rev;
10045 }
10046 return false;
10047
10048 case IOR:
10049 if (aarch_rev16_p (x))
10050 {
10051 *cost = COSTS_N_INSNS (1);
10052
10053 if (speed)
10054 {
10055 if (VECTOR_MODE_P (mode))
10056 *cost += extra_cost->vect.alu;
10057 else
10058 *cost += extra_cost->alu.rev;
10059 }
10060 return true;
10061 }
10062
10063 if (aarch64_extr_rtx_p (x, &op0, &op1))
10064 {
10065 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10066 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10067 if (speed)
10068 *cost += extra_cost->alu.shift;
10069
10070 return true;
10071 }
10072 /* Fall through. */
10073 case XOR:
10074 case AND:
10075 cost_logic:
10076 op0 = XEXP (x, 0);
10077 op1 = XEXP (x, 1);
10078
10079 if (VECTOR_MODE_P (mode))
10080 {
10081 if (speed)
10082 *cost += extra_cost->vect.alu;
10083 return true;
10084 }
10085
10086 if (code == AND
10087 && GET_CODE (op0) == MULT
10088 && CONST_INT_P (XEXP (op0, 1))
10089 && CONST_INT_P (op1)
10090 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10091 INTVAL (op1)) != 0)
10092 {
10093 /* This is a UBFM/SBFM. */
10094 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10095 if (speed)
10096 *cost += extra_cost->alu.bfx;
10097 return true;
10098 }
10099
10100 if (is_int_mode (mode, &int_mode))
10101 {
10102 if (CONST_INT_P (op1))
10103 {
10104 /* We have a mask + shift version of a UBFIZ
10105 i.e. the *andim_ashift<mode>_bfiz pattern. */
10106 if (GET_CODE (op0) == ASHIFT
10107 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10108 XEXP (op0, 1)))
10109 {
10110 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10111 (enum rtx_code) code, 0, speed);
10112 if (speed)
10113 *cost += extra_cost->alu.bfx;
10114
10115 return true;
10116 }
10117 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10118 {
10119 /* We possibly get the immediate for free; this is not
10120 modelled. */
10121 *cost += rtx_cost (op0, int_mode,
10122 (enum rtx_code) code, 0, speed);
10123 if (speed)
10124 *cost += extra_cost->alu.logical;
10125
10126 return true;
10127 }
10128 }
10129 else
10130 {
10131 rtx new_op0 = op0;
10132
10133 /* Handle ORN, EON, or BIC. */
10134 if (GET_CODE (op0) == NOT)
10135 op0 = XEXP (op0, 0);
10136
10137 new_op0 = aarch64_strip_shift (op0);
10138
10139 /* If we had a shift on op0 then this is a logical-shift-
10140 by-register/immediate operation. Otherwise, this is just
10141 a logical operation. */
10142 if (speed)
10143 {
10144 if (new_op0 != op0)
10145 {
10146 /* Shift by immediate. */
10147 if (CONST_INT_P (XEXP (op0, 1)))
10148 *cost += extra_cost->alu.log_shift;
10149 else
10150 *cost += extra_cost->alu.log_shift_reg;
10151 }
10152 else
10153 *cost += extra_cost->alu.logical;
10154 }
10155
10156 /* In both cases we want to cost both operands. */
10157 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10158 0, speed);
10159 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10160 1, speed);
10161
10162 return true;
10163 }
10164 }
10165 return false;
10166
10167 case NOT:
10168 x = XEXP (x, 0);
10169 op0 = aarch64_strip_shift (x);
10170
10171 if (VECTOR_MODE_P (mode))
10172 {
10173 /* Vector NOT. */
10174 *cost += extra_cost->vect.alu;
10175 return false;
10176 }
10177
10178 /* MVN-shifted-reg. */
10179 if (op0 != x)
10180 {
10181 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10182
10183 if (speed)
10184 *cost += extra_cost->alu.log_shift;
10185
10186 return true;
10187 }
10188 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10189 Handle the second form here taking care that 'a' in the above can
10190 be a shift. */
10191 else if (GET_CODE (op0) == XOR)
10192 {
10193 rtx newop0 = XEXP (op0, 0);
10194 rtx newop1 = XEXP (op0, 1);
10195 rtx op0_stripped = aarch64_strip_shift (newop0);
10196
10197 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10198 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10199
10200 if (speed)
10201 {
10202 if (op0_stripped != newop0)
10203 *cost += extra_cost->alu.log_shift;
10204 else
10205 *cost += extra_cost->alu.logical;
10206 }
10207
10208 return true;
10209 }
10210 /* MVN. */
10211 if (speed)
10212 *cost += extra_cost->alu.logical;
10213
10214 return false;
10215
10216 case ZERO_EXTEND:
10217
10218 op0 = XEXP (x, 0);
10219 /* If a value is written in SI mode, then zero extended to DI
10220 mode, the operation will in general be free as a write to
10221 a 'w' register implicitly zeroes the upper bits of an 'x'
10222 register. However, if this is
10223
10224 (set (reg) (zero_extend (reg)))
10225
10226 we must cost the explicit register move. */
10227 if (mode == DImode
10228 && GET_MODE (op0) == SImode
10229 && outer == SET)
10230 {
10231 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10232
10233 /* If OP_COST is non-zero, then the cost of the zero extend
10234 is effectively the cost of the inner operation. Otherwise
10235 we have a MOV instruction and we take the cost from the MOV
10236 itself. This is true independently of whether we are
10237 optimizing for space or time. */
10238 if (op_cost)
10239 *cost = op_cost;
10240
10241 return true;
10242 }
10243 else if (MEM_P (op0))
10244 {
10245 /* All loads can zero extend to any size for free. */
10246 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10247 return true;
10248 }
10249
10250 op0 = aarch64_extend_bitfield_pattern_p (x);
10251 if (op0)
10252 {
10253 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10254 if (speed)
10255 *cost += extra_cost->alu.bfx;
10256 return true;
10257 }
10258
10259 if (speed)
10260 {
10261 if (VECTOR_MODE_P (mode))
10262 {
10263 /* UMOV. */
10264 *cost += extra_cost->vect.alu;
10265 }
10266 else
10267 {
10268 /* We generate an AND instead of UXTB/UXTH. */
10269 *cost += extra_cost->alu.logical;
10270 }
10271 }
10272 return false;
10273
10274 case SIGN_EXTEND:
10275 if (MEM_P (XEXP (x, 0)))
10276 {
10277 /* LDRSH. */
10278 if (speed)
10279 {
10280 rtx address = XEXP (XEXP (x, 0), 0);
10281 *cost += extra_cost->ldst.load_sign_extend;
10282
10283 *cost +=
10284 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10285 0, speed));
10286 }
10287 return true;
10288 }
10289
10290 op0 = aarch64_extend_bitfield_pattern_p (x);
10291 if (op0)
10292 {
10293 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10294 if (speed)
10295 *cost += extra_cost->alu.bfx;
10296 return true;
10297 }
10298
10299 if (speed)
10300 {
10301 if (VECTOR_MODE_P (mode))
10302 *cost += extra_cost->vect.alu;
10303 else
10304 *cost += extra_cost->alu.extend;
10305 }
10306 return false;
10307
10308 case ASHIFT:
10309 op0 = XEXP (x, 0);
10310 op1 = XEXP (x, 1);
10311
10312 if (CONST_INT_P (op1))
10313 {
10314 if (speed)
10315 {
10316 if (VECTOR_MODE_P (mode))
10317 {
10318 /* Vector shift (immediate). */
10319 *cost += extra_cost->vect.alu;
10320 }
10321 else
10322 {
10323 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10324 aliases. */
10325 *cost += extra_cost->alu.shift;
10326 }
10327 }
10328
10329 /* We can incorporate zero/sign extend for free. */
10330 if (GET_CODE (op0) == ZERO_EXTEND
10331 || GET_CODE (op0) == SIGN_EXTEND)
10332 op0 = XEXP (op0, 0);
10333
10334 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10335 return true;
10336 }
10337 else
10338 {
10339 if (VECTOR_MODE_P (mode))
10340 {
10341 if (speed)
10342 /* Vector shift (register). */
10343 *cost += extra_cost->vect.alu;
10344 }
10345 else
10346 {
10347 if (speed)
10348 /* LSLV. */
10349 *cost += extra_cost->alu.shift_reg;
10350
10351 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10352 && CONST_INT_P (XEXP (op1, 1))
10353 && known_eq (INTVAL (XEXP (op1, 1)),
10354 GET_MODE_BITSIZE (mode) - 1))
10355 {
10356 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10357 /* We already demanded XEXP (op1, 0) to be REG_P, so
10358 don't recurse into it. */
10359 return true;
10360 }
10361 }
10362 return false; /* All arguments need to be in registers. */
10363 }
10364
10365 case ROTATE:
10366 case ROTATERT:
10367 case LSHIFTRT:
10368 case ASHIFTRT:
10369 op0 = XEXP (x, 0);
10370 op1 = XEXP (x, 1);
10371
10372 if (CONST_INT_P (op1))
10373 {
10374 /* ASR (immediate) and friends. */
10375 if (speed)
10376 {
10377 if (VECTOR_MODE_P (mode))
10378 *cost += extra_cost->vect.alu;
10379 else
10380 *cost += extra_cost->alu.shift;
10381 }
10382
10383 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10384 return true;
10385 }
10386 else
10387 {
10388 if (VECTOR_MODE_P (mode))
10389 {
10390 if (speed)
10391 /* Vector shift (register). */
10392 *cost += extra_cost->vect.alu;
10393 }
10394 else
10395 {
10396 if (speed)
10397 /* ASR (register) and friends. */
10398 *cost += extra_cost->alu.shift_reg;
10399
10400 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10401 && CONST_INT_P (XEXP (op1, 1))
10402 && known_eq (INTVAL (XEXP (op1, 1)),
10403 GET_MODE_BITSIZE (mode) - 1))
10404 {
10405 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10406 /* We already demanded XEXP (op1, 0) to be REG_P, so
10407 don't recurse into it. */
10408 return true;
10409 }
10410 }
10411 return false; /* All arguments need to be in registers. */
10412 }
10413
10414 case SYMBOL_REF:
10415
10416 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10417 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10418 {
10419 /* LDR. */
10420 if (speed)
10421 *cost += extra_cost->ldst.load;
10422 }
10423 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10424 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10425 {
10426 /* ADRP, followed by ADD. */
10427 *cost += COSTS_N_INSNS (1);
10428 if (speed)
10429 *cost += 2 * extra_cost->alu.arith;
10430 }
10431 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10432 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10433 {
10434 /* ADR. */
10435 if (speed)
10436 *cost += extra_cost->alu.arith;
10437 }
10438
10439 if (flag_pic)
10440 {
10441 /* One extra load instruction, after accessing the GOT. */
10442 *cost += COSTS_N_INSNS (1);
10443 if (speed)
10444 *cost += extra_cost->ldst.load;
10445 }
10446 return true;
10447
10448 case HIGH:
10449 case LO_SUM:
10450 /* ADRP/ADD (immediate). */
10451 if (speed)
10452 *cost += extra_cost->alu.arith;
10453 return true;
10454
10455 case ZERO_EXTRACT:
10456 case SIGN_EXTRACT:
10457 /* UBFX/SBFX. */
10458 if (speed)
10459 {
10460 if (VECTOR_MODE_P (mode))
10461 *cost += extra_cost->vect.alu;
10462 else
10463 *cost += extra_cost->alu.bfx;
10464 }
10465
10466 /* We can trust that the immediates used will be correct (there
10467 are no by-register forms), so we need only cost op0. */
10468 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10469 return true;
10470
10471 case MULT:
10472 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10473 /* aarch64_rtx_mult_cost always handles recursion to its
10474 operands. */
10475 return true;
10476
10477 case MOD:
10478 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10479 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10480 an unconditional negate. This case should only ever be reached through
10481 the set_smod_pow2_cheap check in expmed.c. */
10482 if (CONST_INT_P (XEXP (x, 1))
10483 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10484 && (mode == SImode || mode == DImode))
10485 {
10486 /* We expand to 4 instructions. Reset the baseline. */
10487 *cost = COSTS_N_INSNS (4);
10488
10489 if (speed)
10490 *cost += 2 * extra_cost->alu.logical
10491 + 2 * extra_cost->alu.arith;
10492
10493 return true;
10494 }
10495
10496 /* Fall through. */
10497 case UMOD:
10498 if (speed)
10499 {
10500 /* Slightly prefer UMOD over SMOD. */
10501 if (VECTOR_MODE_P (mode))
10502 *cost += extra_cost->vect.alu;
10503 else if (GET_MODE_CLASS (mode) == MODE_INT)
10504 *cost += (extra_cost->mult[mode == DImode].add
10505 + extra_cost->mult[mode == DImode].idiv
10506 + (code == MOD ? 1 : 0));
10507 }
10508 return false; /* All arguments need to be in registers. */
10509
10510 case DIV:
10511 case UDIV:
10512 case SQRT:
10513 if (speed)
10514 {
10515 if (VECTOR_MODE_P (mode))
10516 *cost += extra_cost->vect.alu;
10517 else if (GET_MODE_CLASS (mode) == MODE_INT)
10518 /* There is no integer SQRT, so only DIV and UDIV can get
10519 here. */
10520 *cost += (extra_cost->mult[mode == DImode].idiv
10521 /* Slightly prefer UDIV over SDIV. */
10522 + (code == DIV ? 1 : 0));
10523 else
10524 *cost += extra_cost->fp[mode == DFmode].div;
10525 }
10526 return false; /* All arguments need to be in registers. */
10527
10528 case IF_THEN_ELSE:
10529 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10530 XEXP (x, 2), cost, speed);
10531
10532 case EQ:
10533 case NE:
10534 case GT:
10535 case GTU:
10536 case LT:
10537 case LTU:
10538 case GE:
10539 case GEU:
10540 case LE:
10541 case LEU:
10542
10543 return false; /* All arguments must be in registers. */
10544
10545 case FMA:
10546 op0 = XEXP (x, 0);
10547 op1 = XEXP (x, 1);
10548 op2 = XEXP (x, 2);
10549
10550 if (speed)
10551 {
10552 if (VECTOR_MODE_P (mode))
10553 *cost += extra_cost->vect.alu;
10554 else
10555 *cost += extra_cost->fp[mode == DFmode].fma;
10556 }
10557
10558 /* FMSUB, FNMADD, and FNMSUB are free. */
10559 if (GET_CODE (op0) == NEG)
10560 op0 = XEXP (op0, 0);
10561
10562 if (GET_CODE (op2) == NEG)
10563 op2 = XEXP (op2, 0);
10564
10565 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10566 and the by-element operand as operand 0. */
10567 if (GET_CODE (op1) == NEG)
10568 op1 = XEXP (op1, 0);
10569
10570 /* Catch vector-by-element operations. The by-element operand can
10571 either be (vec_duplicate (vec_select (x))) or just
10572 (vec_select (x)), depending on whether we are multiplying by
10573 a vector or a scalar.
10574
10575 Canonicalization is not very good in these cases, FMA4 will put the
10576 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10577 if (GET_CODE (op0) == VEC_DUPLICATE)
10578 op0 = XEXP (op0, 0);
10579 else if (GET_CODE (op1) == VEC_DUPLICATE)
10580 op1 = XEXP (op1, 0);
10581
10582 if (GET_CODE (op0) == VEC_SELECT)
10583 op0 = XEXP (op0, 0);
10584 else if (GET_CODE (op1) == VEC_SELECT)
10585 op1 = XEXP (op1, 0);
10586
10587 /* If the remaining parameters are not registers,
10588 get the cost to put them into registers. */
10589 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10590 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10591 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10592 return true;
10593
10594 case FLOAT:
10595 case UNSIGNED_FLOAT:
10596 if (speed)
10597 *cost += extra_cost->fp[mode == DFmode].fromint;
10598 return false;
10599
10600 case FLOAT_EXTEND:
10601 if (speed)
10602 {
10603 if (VECTOR_MODE_P (mode))
10604 {
10605 /* Vector widening conversion. */
10606 *cost += extra_cost->vect.alu;
10607 }
10608 else
10609 *cost += extra_cost->fp[mode == DFmode].widen;
10610 }
10611 return false;
10612
10613 case FLOAT_TRUNCATE:
10614 if (speed)
10615 {
10616 if (VECTOR_MODE_P (mode))
10617 {
10618 /* Vector conversion. */
10619 *cost += extra_cost->vect.alu;
10620 }
10621 else
10622 *cost += extra_cost->fp[mode == DFmode].narrow;
10623 }
10624 return false;
10625
10626 case FIX:
10627 case UNSIGNED_FIX:
10628 x = XEXP (x, 0);
10629 /* Strip the rounding part. They will all be implemented
10630 by the fcvt* family of instructions anyway. */
10631 if (GET_CODE (x) == UNSPEC)
10632 {
10633 unsigned int uns_code = XINT (x, 1);
10634
10635 if (uns_code == UNSPEC_FRINTA
10636 || uns_code == UNSPEC_FRINTM
10637 || uns_code == UNSPEC_FRINTN
10638 || uns_code == UNSPEC_FRINTP
10639 || uns_code == UNSPEC_FRINTZ)
10640 x = XVECEXP (x, 0, 0);
10641 }
10642
10643 if (speed)
10644 {
10645 if (VECTOR_MODE_P (mode))
10646 *cost += extra_cost->vect.alu;
10647 else
10648 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10649 }
10650
10651 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10652 fixed-point fcvt. */
10653 if (GET_CODE (x) == MULT
10654 && ((VECTOR_MODE_P (mode)
10655 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10656 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10657 {
10658 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10659 0, speed);
10660 return true;
10661 }
10662
10663 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10664 return true;
10665
10666 case ABS:
10667 if (VECTOR_MODE_P (mode))
10668 {
10669 /* ABS (vector). */
10670 if (speed)
10671 *cost += extra_cost->vect.alu;
10672 }
10673 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10674 {
10675 op0 = XEXP (x, 0);
10676
10677 /* FABD, which is analogous to FADD. */
10678 if (GET_CODE (op0) == MINUS)
10679 {
10680 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10681 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10682 if (speed)
10683 *cost += extra_cost->fp[mode == DFmode].addsub;
10684
10685 return true;
10686 }
10687 /* Simple FABS is analogous to FNEG. */
10688 if (speed)
10689 *cost += extra_cost->fp[mode == DFmode].neg;
10690 }
10691 else
10692 {
10693 /* Integer ABS will either be split to
10694 two arithmetic instructions, or will be an ABS
10695 (scalar), which we don't model. */
10696 *cost = COSTS_N_INSNS (2);
10697 if (speed)
10698 *cost += 2 * extra_cost->alu.arith;
10699 }
10700 return false;
10701
10702 case SMAX:
10703 case SMIN:
10704 if (speed)
10705 {
10706 if (VECTOR_MODE_P (mode))
10707 *cost += extra_cost->vect.alu;
10708 else
10709 {
10710 /* FMAXNM/FMINNM/FMAX/FMIN.
10711 TODO: This may not be accurate for all implementations, but
10712 we do not model this in the cost tables. */
10713 *cost += extra_cost->fp[mode == DFmode].addsub;
10714 }
10715 }
10716 return false;
10717
10718 case UNSPEC:
10719 /* The floating point round to integer frint* instructions. */
10720 if (aarch64_frint_unspec_p (XINT (x, 1)))
10721 {
10722 if (speed)
10723 *cost += extra_cost->fp[mode == DFmode].roundint;
10724
10725 return false;
10726 }
10727
10728 if (XINT (x, 1) == UNSPEC_RBIT)
10729 {
10730 if (speed)
10731 *cost += extra_cost->alu.rev;
10732
10733 return false;
10734 }
10735 break;
10736
10737 case TRUNCATE:
10738
10739 /* Decompose <su>muldi3_highpart. */
10740 if (/* (truncate:DI */
10741 mode == DImode
10742 /* (lshiftrt:TI */
10743 && GET_MODE (XEXP (x, 0)) == TImode
10744 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10745 /* (mult:TI */
10746 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10747 /* (ANY_EXTEND:TI (reg:DI))
10748 (ANY_EXTEND:TI (reg:DI))) */
10749 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10750 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10751 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10752 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10753 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10754 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10755 /* (const_int 64) */
10756 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10757 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10758 {
10759 /* UMULH/SMULH. */
10760 if (speed)
10761 *cost += extra_cost->mult[mode == DImode].extend;
10762 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10763 mode, MULT, 0, speed);
10764 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10765 mode, MULT, 1, speed);
10766 return true;
10767 }
10768
10769 /* Fall through. */
10770 default:
10771 break;
10772 }
10773
10774 if (dump_file
10775 && flag_aarch64_verbose_cost)
10776 fprintf (dump_file,
10777 "\nFailed to cost RTX. Assuming default cost.\n");
10778
10779 return true;
10780 }
10781
10782 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10783 calculated for X. This cost is stored in *COST. Returns true
10784 if the total cost of X was calculated. */
10785 static bool
10786 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10787 int param, int *cost, bool speed)
10788 {
10789 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10790
10791 if (dump_file
10792 && flag_aarch64_verbose_cost)
10793 {
10794 print_rtl_single (dump_file, x);
10795 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10796 speed ? "Hot" : "Cold",
10797 *cost, result ? "final" : "partial");
10798 }
10799
10800 return result;
10801 }
10802
10803 static int
10804 aarch64_register_move_cost (machine_mode mode,
10805 reg_class_t from_i, reg_class_t to_i)
10806 {
10807 enum reg_class from = (enum reg_class) from_i;
10808 enum reg_class to = (enum reg_class) to_i;
10809 const struct cpu_regmove_cost *regmove_cost
10810 = aarch64_tune_params.regmove_cost;
10811
10812 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10813 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10814 to = GENERAL_REGS;
10815
10816 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10817 from = GENERAL_REGS;
10818
10819 /* Moving between GPR and stack cost is the same as GP2GP. */
10820 if ((from == GENERAL_REGS && to == STACK_REG)
10821 || (to == GENERAL_REGS && from == STACK_REG))
10822 return regmove_cost->GP2GP;
10823
10824 /* To/From the stack register, we move via the gprs. */
10825 if (to == STACK_REG || from == STACK_REG)
10826 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10827 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10828
10829 if (known_eq (GET_MODE_SIZE (mode), 16))
10830 {
10831 /* 128-bit operations on general registers require 2 instructions. */
10832 if (from == GENERAL_REGS && to == GENERAL_REGS)
10833 return regmove_cost->GP2GP * 2;
10834 else if (from == GENERAL_REGS)
10835 return regmove_cost->GP2FP * 2;
10836 else if (to == GENERAL_REGS)
10837 return regmove_cost->FP2GP * 2;
10838
10839 /* When AdvSIMD instructions are disabled it is not possible to move
10840 a 128-bit value directly between Q registers. This is handled in
10841 secondary reload. A general register is used as a scratch to move
10842 the upper DI value and the lower DI value is moved directly,
10843 hence the cost is the sum of three moves. */
10844 if (! TARGET_SIMD)
10845 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10846
10847 return regmove_cost->FP2FP;
10848 }
10849
10850 if (from == GENERAL_REGS && to == GENERAL_REGS)
10851 return regmove_cost->GP2GP;
10852 else if (from == GENERAL_REGS)
10853 return regmove_cost->GP2FP;
10854 else if (to == GENERAL_REGS)
10855 return regmove_cost->FP2GP;
10856
10857 return regmove_cost->FP2FP;
10858 }
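/* For instance (the values come from the tuning's regmove_cost table):
   a TImode copy between two general registers is costed as two GP2GP
   moves, since it needs a pair of X-register MOVs, while a TImode copy
   between two FP/SIMD registers with AdvSIMD disabled is costed
   GP2FP + FP2GP + FP2FP, matching the secondary-reload sequence that
   bounces half of the value through a general register. */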
10859
10860 static int
10861 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10862 reg_class_t rclass ATTRIBUTE_UNUSED,
10863 bool in ATTRIBUTE_UNUSED)
10864 {
10865 return aarch64_tune_params.memmov_cost;
10866 }
10867
10868 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10869 to optimize 1.0/sqrt. */
10870
10871 static bool
10872 use_rsqrt_p (machine_mode mode)
10873 {
10874 return (!flag_trapping_math
10875 && flag_unsafe_math_optimizations
10876 && ((aarch64_tune_params.approx_modes->recip_sqrt
10877 & AARCH64_APPROX_MODE (mode))
10878 || flag_mrecip_low_precision_sqrt));
10879 }
10880
10881 /* Function to decide when to use the approximate reciprocal square root
10882 builtin. */
10883
10884 static tree
10885 aarch64_builtin_reciprocal (tree fndecl)
10886 {
10887 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10888
10889 if (!use_rsqrt_p (mode))
10890 return NULL_TREE;
10891 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10892 }
10893
10894 /* Emit instruction sequence to compute either the approximate square root
10895 or its approximate reciprocal, depending on the flag RECP, and return
10896 whether the sequence was emitted or not. */
10897
10898 bool
10899 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10900 {
10901 machine_mode mode = GET_MODE (dst);
10902
10903 if (GET_MODE_INNER (mode) == HFmode)
10904 {
10905 gcc_assert (!recp);
10906 return false;
10907 }
10908
10909 if (!recp)
10910 {
10911 if (!(flag_mlow_precision_sqrt
10912 || (aarch64_tune_params.approx_modes->sqrt
10913 & AARCH64_APPROX_MODE (mode))))
10914 return false;
10915
10916 if (flag_finite_math_only
10917 || flag_trapping_math
10918 || !flag_unsafe_math_optimizations
10919 || optimize_function_for_size_p (cfun))
10920 return false;
10921 }
10922 else
10923 /* Caller assumes we cannot fail. */
10924 gcc_assert (use_rsqrt_p (mode));
10925
10926 machine_mode mmsk = mode_for_int_vector (mode).require ();
10927 rtx xmsk = gen_reg_rtx (mmsk);
10928 if (!recp)
10929 /* When calculating the approximate square root, compare the
10930 argument with 0.0 and create a mask. */
10931 emit_insn (gen_rtx_SET (xmsk,
10932 gen_rtx_NEG (mmsk,
10933 gen_rtx_EQ (mmsk, src,
10934 CONST0_RTX (mode)))));
10935
10936 /* Estimate the approximate reciprocal square root. */
10937 rtx xdst = gen_reg_rtx (mode);
10938 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10939
10940 /* Iterate over the series twice for SF and thrice for DF. */
10941 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10942
10943 /* Optionally iterate over the series once less for faster performance,
10944 at the cost of some accuracy. */
10945 if ((recp && flag_mrecip_low_precision_sqrt)
10946 || (!recp && flag_mlow_precision_sqrt))
10947 iterations--;
10948
10949 /* Iterate over the series to calculate the approximate reciprocal square
10950 root. */
10951 rtx x1 = gen_reg_rtx (mode);
10952 while (iterations--)
10953 {
10954 rtx x2 = gen_reg_rtx (mode);
10955 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10956
10957 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10958
10959 if (iterations > 0)
10960 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10961 }
10962
10963 if (!recp)
10964 {
10965 /* Qualify the approximate reciprocal square root when the argument is
10966 0.0 by squashing the intermediary result to 0.0. */
10967 rtx xtmp = gen_reg_rtx (mmsk);
10968 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10969 gen_rtx_SUBREG (mmsk, xdst, 0)));
10970 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10971
10972 /* Calculate the approximate square root. */
10973 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10974 }
10975
10976 /* Finalize the approximation. */
10977 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10978
10979 return true;
10980 }
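/* Illustrative, standalone sketch (not part of this file): a scalar C model
   of the Newton-Raphson scheme that aarch64_emit_approx_sqrt builds out of
   FRSQRTE/FRSQRTS above.  FRSQRTS computes (3 - d * x * x) / 2, giving the
   refinement x' = x * (3 - d * x * x) / 2; sqrt (d) is then recovered as
   d * rsqrt (d).  The seed below is deliberately crude -- FRSQRTE would
   supply roughly 8 significant bits -- so this only models the recurrence,
   not the emitted instructions.  */
#if 0
#include <math.h>
#include <stdio.h>

static double
rsqrt_step (double d, double x)
{
  /* Scalar equivalent of one FRSQRTS refinement folded into the product.  */
  return x * (3.0 - d * x * x) / 2.0;
}

int
main (void)
{
  double d = 2.0;
  double x = 0.5;		/* Crude seed for 1/sqrt (2).  */
  for (int i = 0; i < 3; i++)	/* Two steps for SF, three for DF.  */
    x = rsqrt_step (d, x);
  printf ("rsqrt (2) ~= %.9f (exact %.9f)\n", x, 1.0 / sqrt (d));
  printf ("sqrt (2)  ~= %.9f\n", d * x);	/* sqrt (d) = d * rsqrt (d).  */
  return 0;
}
#endif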
10981
10982 /* Emit the instruction sequence to compute the approximation for the division
10983 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10984
10985 bool
10986 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10987 {
10988 machine_mode mode = GET_MODE (quo);
10989
10990 if (GET_MODE_INNER (mode) == HFmode)
10991 return false;
10992
10993 bool use_approx_division_p = (flag_mlow_precision_div
10994 || (aarch64_tune_params.approx_modes->division
10995 & AARCH64_APPROX_MODE (mode)));
10996
10997 if (!flag_finite_math_only
10998 || flag_trapping_math
10999 || !flag_unsafe_math_optimizations
11000 || optimize_function_for_size_p (cfun)
11001 || !use_approx_division_p)
11002 return false;
11003
11004 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11005 return false;
11006
11007 /* Estimate the approximate reciprocal. */
11008 rtx xrcp = gen_reg_rtx (mode);
11009 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11010
11011 /* Iterate over the series twice for SF and thrice for DF. */
11012 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11013
11014 /* Optionally iterate over the series once less for faster performance,
11015 at the cost of some accuracy. */
11016 if (flag_mlow_precision_div)
11017 iterations--;
11018
11019 /* Iterate over the series to calculate the approximate reciprocal. */
11020 rtx xtmp = gen_reg_rtx (mode);
11021 while (iterations--)
11022 {
11023 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11024
11025 if (iterations > 0)
11026 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11027 }
11028
11029 if (num != CONST1_RTX (mode))
11030 {
11031 /* As the approximate reciprocal of DEN is already calculated, only
11032 calculate the approximate division when NUM is not 1.0. */
11033 rtx xnum = force_reg (mode, num);
11034 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11035 }
11036
11037 /* Finalize the approximation. */
11038 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11039 return true;
11040 }
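/* Illustrative, standalone sketch (not part of this file): the scalar
   recurrence behind aarch64_emit_approx_div above.  FRECPS computes
   2 - d * x, so each refinement is x' = x * (2 - d * x), and NUM / DEN is
   approximated as NUM * (1 / DEN).  The seed here is made up; FRECPE would
   supply the initial estimate.  */
#if 0
#include <stdio.h>

static double
recip_step (double d, double x)
{
  /* One FRECPS-style refinement folded into the product.  */
  return x * (2.0 - d * x);
}

int
main (void)
{
  double den = 3.0, num = 1.5;
  double x = 0.3;		/* Crude seed for 1/3.  */
  for (int i = 0; i < 3; i++)	/* Two steps for SF, three for DF.  */
    x = recip_step (den, x);
  printf ("1/3 ~= %.9f, 1.5/3 ~= %.9f\n", x, num * x);
  return 0;
}
#endif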
11041
11042 /* Return the number of instructions that can be issued per cycle. */
11043 static int
11044 aarch64_sched_issue_rate (void)
11045 {
11046 return aarch64_tune_params.issue_rate;
11047 }
11048
11049 static int
11050 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11051 {
11052 int issue_rate = aarch64_sched_issue_rate ();
11053
11054 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11055 }
11056
11057
11058 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11059 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11060 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11061
11062 static int
11063 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11064 int ready_index)
11065 {
11066 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11067 }
11068
11069
11070 /* Vectorizer cost model target hooks. */
11071
11072 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11073 static int
11074 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11075 tree vectype,
11076 int misalign ATTRIBUTE_UNUSED)
11077 {
11078 unsigned elements;
11079 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11080 bool fp = false;
11081
11082 if (vectype != NULL)
11083 fp = FLOAT_TYPE_P (vectype);
11084
11085 switch (type_of_cost)
11086 {
11087 case scalar_stmt:
11088 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11089
11090 case scalar_load:
11091 return costs->scalar_load_cost;
11092
11093 case scalar_store:
11094 return costs->scalar_store_cost;
11095
11096 case vector_stmt:
11097 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11098
11099 case vector_load:
11100 return costs->vec_align_load_cost;
11101
11102 case vector_store:
11103 return costs->vec_store_cost;
11104
11105 case vec_to_scalar:
11106 return costs->vec_to_scalar_cost;
11107
11108 case scalar_to_vec:
11109 return costs->scalar_to_vec_cost;
11110
11111 case unaligned_load:
11112 case vector_gather_load:
11113 return costs->vec_unalign_load_cost;
11114
11115 case unaligned_store:
11116 case vector_scatter_store:
11117 return costs->vec_unalign_store_cost;
11118
11119 case cond_branch_taken:
11120 return costs->cond_taken_branch_cost;
11121
11122 case cond_branch_not_taken:
11123 return costs->cond_not_taken_branch_cost;
11124
11125 case vec_perm:
11126 return costs->vec_permute_cost;
11127
11128 case vec_promote_demote:
11129 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11130
11131 case vec_construct:
11132 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11133 return elements / 2 + 1;
11134
11135 default:
11136 gcc_unreachable ();
11137 }
11138 }
11139
11140 /* Implement targetm.vectorize.add_stmt_cost. */
11141 static unsigned
11142 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11143 struct _stmt_vec_info *stmt_info, int misalign,
11144 enum vect_cost_model_location where)
11145 {
11146 unsigned *cost = (unsigned *) data;
11147 unsigned retval = 0;
11148
11149 if (flag_vect_cost_model)
11150 {
11151 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11152 int stmt_cost =
11153 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11154
11155 /* Statements in an inner loop relative to the loop being
11156 vectorized are weighted more heavily. The value here is
11157 arbitrary and could potentially be improved with analysis. */
11158 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11159 count *= 50; /* FIXME */
11160
11161 retval = (unsigned) (count * stmt_cost);
11162 cost[where] += retval;
11163 }
11164
11165 return retval;
11166 }
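/* Illustrative, standalone sketch (not part of this file) of the
   accumulation done by aarch64_add_stmt_cost above: costs are totalled per
   location, and statements in an inner loop are weighted by the same
   arbitrary factor of 50.  The enum and array names here are hypothetical
   stand-ins for the vectorizer's bookkeeping.  */
#if 0
#include <stdio.h>

enum cost_location { PROLOGUE, BODY, EPILOGUE, NUM_LOCATIONS };

static unsigned costs[NUM_LOCATIONS];

static unsigned
add_stmt_cost (enum cost_location where, int count, int stmt_cost,
	       int in_inner_loop)
{
  if (where == BODY && in_inner_loop)
    count *= 50;	/* Same arbitrary inner-loop weight as above.  */
  unsigned retval = (unsigned) (count * stmt_cost);
  costs[where] += retval;
  return retval;
}

int
main (void)
{
  add_stmt_cost (BODY, 1, 4, 0);	/* Ordinary vector statement.  */
  add_stmt_cost (BODY, 1, 4, 1);	/* Same statement in an inner loop.  */
  printf ("body cost: %u\n", costs[BODY]);	/* 4 + 200.  */
  return 0;
}
#endif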
11167
11168 static void initialize_aarch64_code_model (struct gcc_options *);
11169
11170 /* Parse the TO_PARSE string and put the architecture struct that it
11171 selects into RES and the architectural features into ISA_FLAGS.
11172 Return an aarch64_parse_opt_result describing the parse result.
11173 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11174 When the TO_PARSE string contains an invalid extension,
11175 a copy of the string is created and stored to INVALID_EXTENSION. */
11176
11177 static enum aarch64_parse_opt_result
11178 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11179 uint64_t *isa_flags, std::string *invalid_extension)
11180 {
11181 const char *ext;
11182 const struct processor *arch;
11183 size_t len;
11184
11185 ext = strchr (to_parse, '+');
11186
11187 if (ext != NULL)
11188 len = ext - to_parse;
11189 else
11190 len = strlen (to_parse);
11191
11192 if (len == 0)
11193 return AARCH64_PARSE_MISSING_ARG;
11194
11195
11196 /* Loop through the list of supported ARCHes to find a match. */
11197 for (arch = all_architectures; arch->name != NULL; arch++)
11198 {
11199 if (strlen (arch->name) == len
11200 && strncmp (arch->name, to_parse, len) == 0)
11201 {
11202 uint64_t isa_temp = arch->flags;
11203
11204 if (ext != NULL)
11205 {
11206 /* TO_PARSE string contains at least one extension. */
11207 enum aarch64_parse_opt_result ext_res
11208 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11209
11210 if (ext_res != AARCH64_PARSE_OK)
11211 return ext_res;
11212 }
11213 /* Extension parsing was successful. Confirm the result
11214 arch and ISA flags. */
11215 *res = arch;
11216 *isa_flags = isa_temp;
11217 return AARCH64_PARSE_OK;
11218 }
11219 }
11220
11221 /* ARCH name not found in list. */
11222 return AARCH64_PARSE_INVALID_ARG;
11223 }
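/* Illustrative, standalone sketch (not part of this file) of the string
   split performed by aarch64_parse_arch and aarch64_parse_cpu above: the
   name before the first '+' is matched against the table, and everything
   from the '+' onwards is handed to the extension parser.  The example
   string is hypothetical.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *to_parse = "armv8.2-a+crypto+nofp";
  const char *ext = strchr (to_parse, '+');
  size_t len = ext ? (size_t) (ext - to_parse) : strlen (to_parse);
  printf ("arch name:  %.*s\n", (int) len, to_parse);
  printf ("extensions: %s\n", ext ? ext : "(none)");
  return 0;
}
#endif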
11224
11225 /* Parse the TO_PARSE string and put the CPU that it selects into RES and
11226 the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result
11227 describing the parse result. If there is an error parsing, RES and
11228 ISA_FLAGS are left unchanged.
11229 When the TO_PARSE string contains an invalid extension,
11230 a copy of the string is created and stored to INVALID_EXTENSION. */
11231
11232 static enum aarch64_parse_opt_result
11233 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11234 uint64_t *isa_flags, std::string *invalid_extension)
11235 {
11236 const char *ext;
11237 const struct processor *cpu;
11238 size_t len;
11239
11240 ext = strchr (to_parse, '+');
11241
11242 if (ext != NULL)
11243 len = ext - to_parse;
11244 else
11245 len = strlen (to_parse);
11246
11247 if (len == 0)
11248 return AARCH64_PARSE_MISSING_ARG;
11249
11250
11251 /* Loop through the list of supported CPUs to find a match. */
11252 for (cpu = all_cores; cpu->name != NULL; cpu++)
11253 {
11254 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11255 {
11256 uint64_t isa_temp = cpu->flags;
11257
11258
11259 if (ext != NULL)
11260 {
11261 /* TO_PARSE string contains at least one extension. */
11262 enum aarch64_parse_opt_result ext_res
11263 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11264
11265 if (ext_res != AARCH64_PARSE_OK)
11266 return ext_res;
11267 }
11268 /* Extension parsing was successful. Confirm the result
11269 cpu and ISA flags. */
11270 *res = cpu;
11271 *isa_flags = isa_temp;
11272 return AARCH64_PARSE_OK;
11273 }
11274 }
11275
11276 /* CPU name not found in list. */
11277 return AARCH64_PARSE_INVALID_ARG;
11278 }
11279
11280 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11281 Return an aarch64_parse_opt_result describing the parse result.
11282 If the parsing fails, RES does not change. */
11283
11284 static enum aarch64_parse_opt_result
11285 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11286 {
11287 const struct processor *cpu;
11288
11289 /* Loop through the list of supported CPUs to find a match. */
11290 for (cpu = all_cores; cpu->name != NULL; cpu++)
11291 {
11292 if (strcmp (cpu->name, to_parse) == 0)
11293 {
11294 *res = cpu;
11295 return AARCH64_PARSE_OK;
11296 }
11297 }
11298
11299 /* CPU name not found in list. */
11300 return AARCH64_PARSE_INVALID_ARG;
11301 }
11302
11303 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11304 described in FLAG. If it is, return the index bit for that fusion type.
11305 If not, error (printing OPTION_NAME) and return zero. */
11306
11307 static unsigned int
11308 aarch64_parse_one_option_token (const char *token,
11309 size_t length,
11310 const struct aarch64_flag_desc *flag,
11311 const char *option_name)
11312 {
11313 for (; flag->name != NULL; flag++)
11314 {
11315 if (length == strlen (flag->name)
11316 && !strncmp (flag->name, token, length))
11317 return flag->flag;
11318 }
11319
11320 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11321 return 0;
11322 }
11323
11324 /* Parse OPTION which is a comma-separated list of flags to enable.
11325 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11326 default state we inherit from the CPU tuning structures. OPTION_NAME
11327 gives the top-level option we are parsing in the -moverride string,
11328 for use in error messages. */
11329
11330 static unsigned int
11331 aarch64_parse_boolean_options (const char *option,
11332 const struct aarch64_flag_desc *flags,
11333 unsigned int initial_state,
11334 const char *option_name)
11335 {
11336 const char separator = '.';
11337 const char* specs = option;
11338 const char* ntoken = option;
11339 unsigned int found_flags = initial_state;
11340
11341 while ((ntoken = strchr (specs, separator)))
11342 {
11343 size_t token_length = ntoken - specs;
11344 unsigned token_ops = aarch64_parse_one_option_token (specs,
11345 token_length,
11346 flags,
11347 option_name);
11348 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11349 in the token stream, reset the supported operations. So:
11350
11351 adrp+add.cmp+branch.none.adrp+add
11352
11353 would have the result of turning on only adrp+add fusion. */
11354 if (!token_ops)
11355 found_flags = 0;
11356
11357 found_flags |= token_ops;
11358 specs = ++ntoken;
11359 }
11360
11361 /* The string ended with a trailing separator; report the ill-formed input. */
11362 if (!(*specs))
11363 {
11364 error ("%s string ill-formed\n", option_name);
11365 return 0;
11366 }
11367
11368 /* We still have one more token to parse. */
11369 size_t token_length = strlen (specs);
11370 unsigned token_ops = aarch64_parse_one_option_token (specs,
11371 token_length,
11372 flags,
11373 option_name);
11374 if (!token_ops)
11375 found_flags = 0;
11376
11377 found_flags |= token_ops;
11378 return found_flags;
11379 }
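/* Illustrative, standalone sketch (not part of this file) of the
   '.'-separated parsing above, reproducing the documented behaviour of
   "adrp+add.cmp+branch.none.adrp+add": "none" (or an unknown token) resets
   the accumulated set, so only adrp+add fusion survives.  The flag bits and
   token table are hypothetical stand-ins for aarch64_fusible_pairs.  */
#if 0
#include <stdio.h>
#include <string.h>

#define FUSE_ADRP_ADD	(1u << 0)
#define FUSE_CMP_BRANCH	(1u << 1)

static unsigned
token_to_flag (const char *tok, size_t len)
{
  if (len == strlen ("adrp+add") && !strncmp (tok, "adrp+add", len))
    return FUSE_ADRP_ADD;
  if (len == strlen ("cmp+branch") && !strncmp (tok, "cmp+branch", len))
    return FUSE_CMP_BRANCH;
  return 0;	/* "none" or an unknown token.  */
}

int
main (void)
{
  const char *option = "adrp+add.cmp+branch.none.adrp+add";
  const char *specs = option, *ntoken;
  unsigned found = 0, ops;
  while ((ntoken = strchr (specs, '.')))
    {
      ops = token_to_flag (specs, (size_t) (ntoken - specs));
      found = ops ? (found | ops) : 0;	/* A zero token resets the set.  */
      specs = ntoken + 1;
    }
  ops = token_to_flag (specs, strlen (specs));
  found = ops ? (found | ops) : 0;
  printf ("final flags: %#x\n", found);	/* 0x1: only adrp+add remains.  */
  return 0;
}
#endif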
11380
11381 /* Support for overriding instruction fusion. */
11382
11383 static void
11384 aarch64_parse_fuse_string (const char *fuse_string,
11385 struct tune_params *tune)
11386 {
11387 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11388 aarch64_fusible_pairs,
11389 tune->fusible_ops,
11390 "fuse=");
11391 }
11392
11393 /* Support for overriding other tuning flags. */
11394
11395 static void
11396 aarch64_parse_tune_string (const char *tune_string,
11397 struct tune_params *tune)
11398 {
11399 tune->extra_tuning_flags
11400 = aarch64_parse_boolean_options (tune_string,
11401 aarch64_tuning_flags,
11402 tune->extra_tuning_flags,
11403 "tune=");
11404 }
11405
11406 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11407 Accept the valid SVE vector widths allowed by
11408 aarch64_sve_vector_bits_enum and use it to override sve_width
11409 in TUNE. */
11410
11411 static void
11412 aarch64_parse_sve_width_string (const char *tune_string,
11413 struct tune_params *tune)
11414 {
11415 int width = -1;
11416
11417 int n = sscanf (tune_string, "%d", &width);
11418 if (n == EOF)
11419 {
11420 error ("invalid format for sve_width");
11421 return;
11422 }
11423 switch (width)
11424 {
11425 case SVE_128:
11426 case SVE_256:
11427 case SVE_512:
11428 case SVE_1024:
11429 case SVE_2048:
11430 break;
11431 default:
11432 error ("invalid sve_width value: %d", width);
11433 }
11434 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11435 }
11436
11437 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11438 we understand. If it is, extract the option string and hand it off to
11439 the appropriate function. */
11440
11441 void
11442 aarch64_parse_one_override_token (const char* token,
11443 size_t length,
11444 struct tune_params *tune)
11445 {
11446 const struct aarch64_tuning_override_function *fn
11447 = aarch64_tuning_override_functions;
11448
11449 const char *option_part = strchr (token, '=');
11450 if (!option_part)
11451 {
11452 error ("tuning string missing in option (%s)", token);
11453 return;
11454 }
11455
11456 /* Get the length of the option name. */
11457 length = option_part - token;
11458 /* Skip the '=' to get to the option string. */
11459 option_part++;
11460
11461 for (; fn->name != NULL; fn++)
11462 {
11463 if (!strncmp (fn->name, token, length))
11464 {
11465 fn->parse_override (option_part, tune);
11466 return;
11467 }
11468 }
11469
11470 error ("unknown tuning option (%s)",token);
11471 return;
11472 }
11473
11474 /* Validate and clamp the TLS size according to the selected code model. */
11475
11476 static void
11477 initialize_aarch64_tls_size (struct gcc_options *opts)
11478 {
11479 if (aarch64_tls_size == 0)
11480 aarch64_tls_size = 24;
11481
11482 switch (opts->x_aarch64_cmodel_var)
11483 {
11484 case AARCH64_CMODEL_TINY:
11485 /* Both the default and the maximum TLS size allowed under tiny are 1M,
11486 which needs two instructions to address, so we clamp the size to 24. */
11487 if (aarch64_tls_size > 24)
11488 aarch64_tls_size = 24;
11489 break;
11490 case AARCH64_CMODEL_SMALL:
11491 /* The maximum TLS size allowed under small is 4G. */
11492 if (aarch64_tls_size > 32)
11493 aarch64_tls_size = 32;
11494 break;
11495 case AARCH64_CMODEL_LARGE:
11496 /* The maximum TLS size allowed under large is 16E.
11497 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
11498 if (aarch64_tls_size > 48)
11499 aarch64_tls_size = 48;
11500 break;
11501 default:
11502 gcc_unreachable ();
11503 }
11504
11505 return;
11506 }
11507
11508 /* Parse STRING looking for options in the format:
11509 string :: option:string
11510 option :: name=substring
11511 name :: {a-z}
11512 substring :: defined by option. */
11513
11514 static void
11515 aarch64_parse_override_string (const char* input_string,
11516 struct tune_params* tune)
11517 {
11518 const char separator = ':';
11519 size_t string_length = strlen (input_string) + 1;
11520 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11521 char *string = string_root;
11522 strncpy (string, input_string, string_length);
11523 string[string_length - 1] = '\0';
11524
11525 char* ntoken = string;
11526
11527 while ((ntoken = strchr (string, separator)))
11528 {
11529 size_t token_length = ntoken - string;
11530 /* Make this substring look like a string. */
11531 *ntoken = '\0';
11532 aarch64_parse_one_override_token (string, token_length, tune);
11533 string = ++ntoken;
11534 }
11535
11536 /* One last option to parse. */
11537 aarch64_parse_one_override_token (string, strlen (string), tune);
11538 free (string_root);
11539 }
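/* Illustrative, standalone sketch (not part of this file) of the
   ':'-separated grammar described above, splitting a hypothetical
   -moverride value into its name=value tokens.  Each token would then be
   dispatched through aarch64_tuning_override_functions.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char buf[] = "sve_width=256:fuse=adrp+add.cmp+branch";
  char *string = buf, *ntoken;
  while ((ntoken = strchr (string, ':')))
    {
      *ntoken = '\0';	/* Make this substring look like a string.  */
      printf ("override token: %s\n", string);
      string = ntoken + 1;
    }
  printf ("override token: %s\n", string);	/* One last token to parse.  */
  return 0;
}
#endif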
11540
11541
11542 static void
11543 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11544 {
11545 if (accepted_branch_protection_string)
11546 {
11547 opts->x_aarch64_branch_protection_string
11548 = xstrdup (accepted_branch_protection_string);
11549 }
11550
11551 /* PR 70044: We have to be careful about being called multiple times for the
11552 same function. This means all changes should be repeatable. */
11553
11554 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11555 Disable the frame pointer flag so the mid-end will not use a frame
11556 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11557 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11558 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11559 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11560 if (opts->x_flag_omit_frame_pointer == 0)
11561 opts->x_flag_omit_frame_pointer = 2;
11562
11563 /* If not optimizing for size, set the default
11564 alignment to what the target wants. */
11565 if (!opts->x_optimize_size)
11566 {
11567 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11568 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11569 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11570 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11571 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11572 opts->x_str_align_functions = aarch64_tune_params.function_align;
11573 }
11574
11575 /* We default to no pc-relative literal loads. */
11576
11577 aarch64_pcrelative_literal_loads = false;
11578
11579 /* If -mpc-relative-literal-loads is set on the command line, this
11580 implies that the user asked for PC relative literal loads. */
11581 if (opts->x_pcrelative_literal_loads == 1)
11582 aarch64_pcrelative_literal_loads = true;
11583
11584 /* In the tiny memory model it makes no sense to disallow PC relative
11585 literal pool loads. */
11586 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11587 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11588 aarch64_pcrelative_literal_loads = true;
11589
11590 /* When enabling the lower precision Newton series for the square root, also
11591 enable it for the reciprocal square root, since the latter is an
11592 intermediary step for the former. */
11593 if (flag_mlow_precision_sqrt)
11594 flag_mrecip_low_precision_sqrt = true;
11595 }
11596
11597 /* 'Unpack' the internal tuning structs and update the options
11598 in OPTS. The caller must have set up selected_tune and selected_arch
11599 as all the other target-specific codegen decisions are
11600 derived from them. */
11601
11602 void
11603 aarch64_override_options_internal (struct gcc_options *opts)
11604 {
11605 aarch64_tune_flags = selected_tune->flags;
11606 aarch64_tune = selected_tune->sched_core;
11607 /* Make a copy of the tuning parameters attached to the core, which
11608 we may later overwrite. */
11609 aarch64_tune_params = *(selected_tune->tune);
11610 aarch64_architecture_version = selected_arch->architecture_version;
11611
11612 if (opts->x_aarch64_override_tune_string)
11613 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11614 &aarch64_tune_params);
11615
11616 /* This target defaults to strict volatile bitfields. */
11617 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11618 opts->x_flag_strict_volatile_bitfields = 1;
11619
11620 if (aarch64_stack_protector_guard == SSP_GLOBAL
11621 && opts->x_aarch64_stack_protector_guard_offset_str)
11622 {
11623 error ("incompatible options %<-mstack-protector-guard=global%> and "
11624 "%<-mstack-protector-guard-offset=%s%>",
11625 aarch64_stack_protector_guard_offset_str);
11626 }
11627
11628 if (aarch64_stack_protector_guard == SSP_SYSREG
11629 && !(opts->x_aarch64_stack_protector_guard_offset_str
11630 && opts->x_aarch64_stack_protector_guard_reg_str))
11631 {
11632 error ("both %<-mstack-protector-guard-offset%> and "
11633 "%<-mstack-protector-guard-reg%> must be used "
11634 "with %<-mstack-protector-guard=sysreg%>");
11635 }
11636
11637 if (opts->x_aarch64_stack_protector_guard_reg_str)
11638 {
11639 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11640 error ("specify a system register with a small string length.");
11641 }
11642
11643 if (opts->x_aarch64_stack_protector_guard_offset_str)
11644 {
11645 char *end;
11646 const char *str = aarch64_stack_protector_guard_offset_str;
11647 errno = 0;
11648 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11649 if (!*str || *end || errno)
11650 error ("%qs is not a valid offset in %qs", str,
11651 "-mstack-protector-guard-offset=");
11652 aarch64_stack_protector_guard_offset = offs;
11653 }
11654
11655 initialize_aarch64_code_model (opts);
11656 initialize_aarch64_tls_size (opts);
11657
11658 int queue_depth = 0;
11659 switch (aarch64_tune_params.autoprefetcher_model)
11660 {
11661 case tune_params::AUTOPREFETCHER_OFF:
11662 queue_depth = -1;
11663 break;
11664 case tune_params::AUTOPREFETCHER_WEAK:
11665 queue_depth = 0;
11666 break;
11667 case tune_params::AUTOPREFETCHER_STRONG:
11668 queue_depth = max_insn_queue_index + 1;
11669 break;
11670 default:
11671 gcc_unreachable ();
11672 }
11673
11674 /* We don't mind passing in global_options_set here as we don't use
11675 the *options_set structs anyway. */
11676 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11677 queue_depth,
11678 opts->x_param_values,
11679 global_options_set.x_param_values);
11680
11681 /* Set up parameters to be used in the prefetching algorithm. Do not
11682 override the defaults unless we are tuning for a core we have
11683 researched values for. */
11684 if (aarch64_tune_params.prefetch->num_slots > 0)
11685 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11686 aarch64_tune_params.prefetch->num_slots,
11687 opts->x_param_values,
11688 global_options_set.x_param_values);
11689 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11690 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11691 aarch64_tune_params.prefetch->l1_cache_size,
11692 opts->x_param_values,
11693 global_options_set.x_param_values);
11694 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11695 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11696 aarch64_tune_params.prefetch->l1_cache_line_size,
11697 opts->x_param_values,
11698 global_options_set.x_param_values);
11699 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11700 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11701 aarch64_tune_params.prefetch->l2_cache_size,
11702 opts->x_param_values,
11703 global_options_set.x_param_values);
11704 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11705 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11706 0,
11707 opts->x_param_values,
11708 global_options_set.x_param_values);
11709 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11710 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11711 aarch64_tune_params.prefetch->minimum_stride,
11712 opts->x_param_values,
11713 global_options_set.x_param_values);
11714
11715 /* Use the alternative scheduling-pressure algorithm by default. */
11716 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11717 opts->x_param_values,
11718 global_options_set.x_param_values);
11719
11720 /* If the user hasn't changed it via configure then set the default to 64 KB
11721 for the backend. */
11722 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11723 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11724 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11725 opts->x_param_values,
11726 global_options_set.x_param_values);
11727
11728 /* Validate the guard size. */
11729 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11730
11731 /* Enforce that the probing interval is the same as the guard size so the
11732 mid-end does the right thing. */
11733 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11734 guard_size,
11735 opts->x_param_values,
11736 global_options_set.x_param_values);
11737
11738 /* The maybe_set calls won't update the value if the user has explicitly set
11739 one, which means we need to validate that the probing interval and guard size
11740 are equal. */
11741 int probe_interval
11742 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11743 if (guard_size != probe_interval)
11744 error ("stack clash guard size %<%d%> must be equal to probing interval "
11745 "%<%d%>", guard_size, probe_interval);
11746
11747 /* Enable software prefetching at the specified optimization level for
11748 CPUs that have prefetch. Lower the optimization level threshold by 1
11749 when profiling is enabled. */
11750 if (opts->x_flag_prefetch_loop_arrays < 0
11751 && !opts->x_optimize_size
11752 && aarch64_tune_params.prefetch->default_opt_level >= 0
11753 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11754 opts->x_flag_prefetch_loop_arrays = 1;
11755
11756 if (opts->x_aarch64_arch_string == NULL)
11757 opts->x_aarch64_arch_string = selected_arch->name;
11758 if (opts->x_aarch64_cpu_string == NULL)
11759 opts->x_aarch64_cpu_string = selected_cpu->name;
11760 if (opts->x_aarch64_tune_string == NULL)
11761 opts->x_aarch64_tune_string = selected_tune->name;
11762
11763 aarch64_override_options_after_change_1 (opts);
11764 }
11765
11766 /* Print a hint with a suggestion for a core or architecture name that
11767 most closely resembles what the user passed in STR. ARCH is true if
11768 the user is asking for an architecture name. ARCH is false if the user
11769 is asking for a core name. */
11770
11771 static void
11772 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11773 {
11774 auto_vec<const char *> candidates;
11775 const struct processor *entry = arch ? all_architectures : all_cores;
11776 for (; entry->name != NULL; entry++)
11777 candidates.safe_push (entry->name);
11778
11779 #ifdef HAVE_LOCAL_CPU_DETECT
11780 /* Add also "native" as possible value. */
11781 if (arch)
11782 candidates.safe_push ("native");
11783 #endif
11784
11785 char *s;
11786 const char *hint = candidates_list_and_hint (str, s, candidates);
11787 if (hint)
11788 inform (input_location, "valid arguments are: %s;"
11789 " did you mean %qs?", s, hint);
11790 else
11791 inform (input_location, "valid arguments are: %s", s);
11792
11793 XDELETEVEC (s);
11794 }
11795
11796 /* Print a hint with a suggestion for a core name that most closely resembles
11797 what the user passed in STR. */
11798
11799 inline static void
11800 aarch64_print_hint_for_core (const char *str)
11801 {
11802 aarch64_print_hint_for_core_or_arch (str, false);
11803 }
11804
11805 /* Print a hint with a suggestion for an architecture name that most closely
11806 resembles what the user passed in STR. */
11807
11808 inline static void
11809 aarch64_print_hint_for_arch (const char *str)
11810 {
11811 aarch64_print_hint_for_core_or_arch (str, true);
11812 }
11813
11814
11815 /* Print a hint with a suggestion for an extension name
11816 that most closely resembles what the user passed in STR. */
11817
11818 void
11819 aarch64_print_hint_for_extensions (const std::string &str)
11820 {
11821 auto_vec<const char *> candidates;
11822 aarch64_get_all_extension_candidates (&candidates);
11823 char *s;
11824 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11825 if (hint)
11826 inform (input_location, "valid arguments are: %s;"
11827 " did you mean %qs?", s, hint);
11828 else
11829 inform (input_location, "valid arguments are: %s", s);
11830
11831 XDELETEVEC (s);
11832 }
11833
11834 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11835 specified in STR and throw errors if appropriate. Put the results, if
11836 they are valid, in RES and ISA_FLAGS. Return whether the option is
11837 valid. */
11838
11839 static bool
11840 aarch64_validate_mcpu (const char *str, const struct processor **res,
11841 uint64_t *isa_flags)
11842 {
11843 std::string invalid_extension;
11844 enum aarch64_parse_opt_result parse_res
11845 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11846
11847 if (parse_res == AARCH64_PARSE_OK)
11848 return true;
11849
11850 switch (parse_res)
11851 {
11852 case AARCH64_PARSE_MISSING_ARG:
11853 error ("missing cpu name in %<-mcpu=%s%>", str);
11854 break;
11855 case AARCH64_PARSE_INVALID_ARG:
11856 error ("unknown value %qs for %<-mcpu%>", str);
11857 aarch64_print_hint_for_core (str);
11858 break;
11859 case AARCH64_PARSE_INVALID_FEATURE:
11860 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11861 invalid_extension.c_str (), str);
11862 aarch64_print_hint_for_extensions (invalid_extension);
11863 break;
11864 default:
11865 gcc_unreachable ();
11866 }
11867
11868 return false;
11869 }
11870
11871 /* Parses CONST_STR for branch protection features specified in
11872 aarch64_branch_protect_types, and sets any global variables required. Returns
11873 the parsing result and assigns LAST_STR to the last processed token from
11874 CONST_STR so that it can be used for error reporting. */
11875
11876 static enum aarch64_parse_opt_result
11877 aarch64_parse_branch_protection (const char *const_str,
11878 char **last_str)
11879 {
11880 char *str_root = xstrdup (const_str);
11881 char* token_save = NULL;
11882 char *str = strtok_r (str_root, "+", &token_save);
11883 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11884 if (!str)
11885 res = AARCH64_PARSE_MISSING_ARG;
11886 else
11887 {
11888 char *next_str = strtok_r (NULL, "+", &token_save);
11889 /* Reset the branch protection features to their defaults. */
11890 aarch64_handle_no_branch_protection (NULL, NULL);
11891
11892 while (str && res == AARCH64_PARSE_OK)
11893 {
11894 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11895 bool found = false;
11896 /* Search for this type. */
11897 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11898 {
11899 if (strcmp (str, type->name) == 0)
11900 {
11901 found = true;
11902 res = type->handler (str, next_str);
11903 str = next_str;
11904 next_str = strtok_r (NULL, "+", &token_save);
11905 }
11906 else
11907 type++;
11908 }
11909 if (found && res == AARCH64_PARSE_OK)
11910 {
11911 bool found_subtype = true;
11912 /* Loop through each token until we find one that isn't a
11913 subtype. */
11914 while (found_subtype)
11915 {
11916 found_subtype = false;
11917 const aarch64_branch_protect_type *subtype = type->subtypes;
11918 /* Search for the subtype. */
11919 while (str && subtype && subtype->name && !found_subtype
11920 && res == AARCH64_PARSE_OK)
11921 {
11922 if (strcmp (str, subtype->name) == 0)
11923 {
11924 found_subtype = true;
11925 res = subtype->handler (str, next_str);
11926 str = next_str;
11927 next_str = strtok_r (NULL, "+", &token_save);
11928 }
11929 else
11930 subtype++;
11931 }
11932 }
11933 }
11934 else if (!found)
11935 res = AARCH64_PARSE_INVALID_ARG;
11936 }
11937 }
11938 /* Copy the last processed token into the argument to pass it back.
11939 Used by option and attribute validation to print the offending token. */
11940 if (last_str)
11941 {
11942 if (str) strcpy (*last_str, str);
11943 else *last_str = NULL;
11944 }
11945 if (res == AARCH64_PARSE_OK)
11946 {
11947 /* If needed, alloc the accepted string then copy in const_str.
11948 Used by override_option_after_change_1. */
11949 if (!accepted_branch_protection_string)
11950 accepted_branch_protection_string = (char *) xmalloc (
11951 BRANCH_PROTECT_STR_MAX
11952 + 1);
11953 strncpy (accepted_branch_protection_string, const_str,
11954 BRANCH_PROTECT_STR_MAX + 1);
11955 /* Forcibly null-terminate. */
11956 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11957 }
11958 return res;
11959 }
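/* Illustrative, standalone sketch (not part of this file) of the
   strtok_r-based tokenisation above: a '+'-separated -mbranch-protection
   value is consumed token by token, and each token is looked up first as a
   type and then as a subtype of the previous type.  The example value is
   hypothetical.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char str[] = "pac-ret+leaf+bti";
  char *save = NULL;
  for (char *tok = strtok_r (str, "+", &save); tok;
       tok = strtok_r (NULL, "+", &save))
    printf ("token: %s\n", tok);
  return 0;
}
#endif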
11960
11961 static bool
11962 aarch64_validate_mbranch_protection (const char *const_str)
11963 {
11964 char *str = (char *) xmalloc (strlen (const_str) + 1);
11965 enum aarch64_parse_opt_result res =
11966 aarch64_parse_branch_protection (const_str, &str);
11967 if (res == AARCH64_PARSE_INVALID_ARG)
11968 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11969 else if (res == AARCH64_PARSE_MISSING_ARG)
11970 error ("missing argument for %<-mbranch-protection=%>");
11971 free (str);
11972 return res == AARCH64_PARSE_OK;
11973 }
11974
11975 /* Validate a command-line -march option. Parse the arch and extensions
11976 (if any) specified in STR and throw errors if appropriate. Put the
11977 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11978 option is valid. */
11979
11980 static bool
11981 aarch64_validate_march (const char *str, const struct processor **res,
11982 uint64_t *isa_flags)
11983 {
11984 std::string invalid_extension;
11985 enum aarch64_parse_opt_result parse_res
11986 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11987
11988 if (parse_res == AARCH64_PARSE_OK)
11989 return true;
11990
11991 switch (parse_res)
11992 {
11993 case AARCH64_PARSE_MISSING_ARG:
11994 error ("missing arch name in %<-march=%s%>", str);
11995 break;
11996 case AARCH64_PARSE_INVALID_ARG:
11997 error ("unknown value %qs for %<-march%>", str);
11998 aarch64_print_hint_for_arch (str);
11999 break;
12000 case AARCH64_PARSE_INVALID_FEATURE:
12001 error ("invalid feature modifier %qs in %<-march=%s%>",
12002 invalid_extension.c_str (), str);
12003 aarch64_print_hint_for_extensions (invalid_extension);
12004 break;
12005 default:
12006 gcc_unreachable ();
12007 }
12008
12009 return false;
12010 }
12011
12012 /* Validate a command-line -mtune option. Parse the cpu
12013 specified in STR and throw errors if appropriate. Put the
12014 result, if it is valid, in RES. Return whether the option is
12015 valid. */
12016
12017 static bool
12018 aarch64_validate_mtune (const char *str, const struct processor **res)
12019 {
12020 enum aarch64_parse_opt_result parse_res
12021 = aarch64_parse_tune (str, res);
12022
12023 if (parse_res == AARCH64_PARSE_OK)
12024 return true;
12025
12026 switch (parse_res)
12027 {
12028 case AARCH64_PARSE_MISSING_ARG:
12029 error ("missing cpu name in %<-mtune=%s%>", str);
12030 break;
12031 case AARCH64_PARSE_INVALID_ARG:
12032 error ("unknown value %qs for %<-mtune%>", str);
12033 aarch64_print_hint_for_core (str);
12034 break;
12035 default:
12036 gcc_unreachable ();
12037 }
12038 return false;
12039 }
12040
12041 /* Return the CPU corresponding to the enum CPU.
12042 If it doesn't specify a cpu, return the default. */
12043
12044 static const struct processor *
12045 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12046 {
12047 if (cpu != aarch64_none)
12048 return &all_cores[cpu];
12049
12050 /* The & 0x3f is to extract the bottom 6 bits that encode the
12051 default cpu as selected by the --with-cpu GCC configure option
12052 in config.gcc.
12053 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12054 flags mechanism should be reworked to make it more sane. */
12055 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12056 }
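/* Illustrative, standalone sketch (not part of this file) of the
   TARGET_CPU_DEFAULT encoding relied on above and at the
   "TARGET_CPU_DEFAULT >> 6" site in aarch64_override_options: the
   configure-time default CPU index sits in the low 6 bits and the default
   ISA flags in the bits above it.  The values below are made up.  */
#if 0
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint64_t cpu_index = 5;	/* Hypothetical index into all_cores.  */
  uint64_t isa_flags = 0x30;	/* Hypothetical default ISA flag bits.  */
  uint64_t packed = (isa_flags << 6) | (cpu_index & 0x3f);

  printf ("cpu index: %u\n", (unsigned) (packed & 0x3f));
  printf ("isa flags: %#x\n", (unsigned) (packed >> 6));
  return 0;
}
#endif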
12057
12058 /* Return the architecture corresponding to the enum ARCH.
12059 If it doesn't specify a valid architecture, return the default. */
12060
12061 static const struct processor *
12062 aarch64_get_arch (enum aarch64_arch arch)
12063 {
12064 if (arch != aarch64_no_arch)
12065 return &all_architectures[arch];
12066
12067 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12068
12069 return &all_architectures[cpu->arch];
12070 }
12071
12072 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12073
12074 static poly_uint16
12075 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12076 {
12077 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12078 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12079 deciding which .md file patterns to use and when deciding whether
12080 something is a legitimate address or constant. */
12081 if (value == SVE_SCALABLE || value == SVE_128)
12082 return poly_uint16 (2, 2);
12083 else
12084 return (int) value / 64;
12085 }
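/* Illustrative, standalone sketch (not part of this file) of the mapping
   above: a fixed -msve-vector-bits value becomes a granule count of
   bits / 64 (e.g. 256 -> VG 4, 2048 -> VG 32), while SVE_SCALABLE and
   SVE_128 both become the runtime-sized poly_uint16 (2, 2), i.e. 2 + 2 * x
   where x counts 128-bit blocks beyond the minimum vector length.  */
#if 0
#include <stdio.h>

int
main (void)
{
  static const int widths[] = { 256, 512, 1024, 2048 };
  for (unsigned i = 0; i < sizeof widths / sizeof widths[0]; i++)
    printf ("-msve-vector-bits=%d -> VG %d\n", widths[i], widths[i] / 64);
  return 0;
}
#endif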
12086
12087 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12088 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12089 tuning structs. In particular it must set selected_tune and
12090 aarch64_isa_flags that define the available ISA features and tuning
12091 decisions. It must also set selected_arch as this will be used to
12092 output the .arch asm tags for each function. */
12093
12094 static void
12095 aarch64_override_options (void)
12096 {
12097 uint64_t cpu_isa = 0;
12098 uint64_t arch_isa = 0;
12099 aarch64_isa_flags = 0;
12100
12101 bool valid_cpu = true;
12102 bool valid_tune = true;
12103 bool valid_arch = true;
12104
12105 selected_cpu = NULL;
12106 selected_arch = NULL;
12107 selected_tune = NULL;
12108
12109 if (aarch64_branch_protection_string)
12110 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12111
12112 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12113 If either of -march or -mtune is given, they override their
12114 respective component of -mcpu. */
12115 if (aarch64_cpu_string)
12116 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12117 &cpu_isa);
12118
12119 if (aarch64_arch_string)
12120 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12121 &arch_isa);
12122
12123 if (aarch64_tune_string)
12124 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12125
12126 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12127 SUBTARGET_OVERRIDE_OPTIONS;
12128 #endif
12129
12130 /* If the user did not specify a processor, choose the default
12131 one for them. This will be the CPU set during configuration using
12132 --with-cpu, otherwise it is "generic". */
12133 if (!selected_cpu)
12134 {
12135 if (selected_arch)
12136 {
12137 selected_cpu = &all_cores[selected_arch->ident];
12138 aarch64_isa_flags = arch_isa;
12139 explicit_arch = selected_arch->arch;
12140 }
12141 else
12142 {
12143 /* Get default configure-time CPU. */
12144 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12145 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12146 }
12147
12148 if (selected_tune)
12149 explicit_tune_core = selected_tune->ident;
12150 }
12151 /* If both -mcpu and -march are specified, check that they are architecturally
12152 compatible, warn if they're not and prefer the -march ISA flags. */
12153 else if (selected_arch)
12154 {
12155 if (selected_arch->arch != selected_cpu->arch)
12156 {
12157 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12158 all_architectures[selected_cpu->arch].name,
12159 selected_arch->name);
12160 }
12161 aarch64_isa_flags = arch_isa;
12162 explicit_arch = selected_arch->arch;
12163 explicit_tune_core = selected_tune ? selected_tune->ident
12164 : selected_cpu->ident;
12165 }
12166 else
12167 {
12168 /* -mcpu but no -march. */
12169 aarch64_isa_flags = cpu_isa;
12170 explicit_tune_core = selected_tune ? selected_tune->ident
12171 : selected_cpu->ident;
12172 gcc_assert (selected_cpu);
12173 selected_arch = &all_architectures[selected_cpu->arch];
12174 explicit_arch = selected_arch->arch;
12175 }
12176
12177 /* Set the arch as well, as we will need it when outputting
12178 the .arch directive in assembly. */
12179 if (!selected_arch)
12180 {
12181 gcc_assert (selected_cpu);
12182 selected_arch = &all_architectures[selected_cpu->arch];
12183 }
12184
12185 if (!selected_tune)
12186 selected_tune = selected_cpu;
12187
12188 if (aarch64_enable_bti == 2)
12189 {
12190 #ifdef TARGET_ENABLE_BTI
12191 aarch64_enable_bti = 1;
12192 #else
12193 aarch64_enable_bti = 0;
12194 #endif
12195 }
12196
12197 /* Return address signing is currently not supported for ILP32 targets. For
12198 LP64 targets use the configured option in the absence of a command-line
12199 option for -mbranch-protection. */
12200 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12201 {
12202 #ifdef TARGET_ENABLE_PAC_RET
12203 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12204 #else
12205 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12206 #endif
12207 }
12208
12209 #ifndef HAVE_AS_MABI_OPTION
12210 /* The compiler may have been configured with 2.23.* binutils, which does
12211 not have support for ILP32. */
12212 if (TARGET_ILP32)
12213 error ("assembler does not support %<-mabi=ilp32%>");
12214 #endif
12215
12216 /* Convert -msve-vector-bits to a VG count. */
12217 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12218
12219 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12220 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12221
12222 /* Make sure we properly set up the explicit options. */
12223 if ((aarch64_cpu_string && valid_cpu)
12224 || (aarch64_tune_string && valid_tune))
12225 gcc_assert (explicit_tune_core != aarch64_none);
12226
12227 if ((aarch64_cpu_string && valid_cpu)
12228 || (aarch64_arch_string && valid_arch))
12229 gcc_assert (explicit_arch != aarch64_no_arch);
12230
12231 /* The pass to insert speculation tracking runs before
12232 shrink-wrapping and the latter does not know how to update the
12233 tracking status. So disable it in this case. */
12234 if (aarch64_track_speculation)
12235 flag_shrink_wrap = 0;
12236
12237 aarch64_override_options_internal (&global_options);
12238
12239 /* Save these options as the default ones in case we push and pop them later
12240 while processing functions with potential target attributes. */
12241 target_option_default_node = target_option_current_node
12242 = build_target_option_node (&global_options);
12243 }
12244
12245 /* Implement targetm.override_options_after_change. */
12246
12247 static void
12248 aarch64_override_options_after_change (void)
12249 {
12250 aarch64_override_options_after_change_1 (&global_options);
12251 }
12252
12253 static struct machine_function *
12254 aarch64_init_machine_status (void)
12255 {
12256 struct machine_function *machine;
12257 machine = ggc_cleared_alloc<machine_function> ();
12258 return machine;
12259 }
12260
12261 void
12262 aarch64_init_expanders (void)
12263 {
12264 init_machine_status = aarch64_init_machine_status;
12265 }
12266
12267 /* Set up the effective code model, taking -fpic/-fPIC into account. */
12268 static void
12269 initialize_aarch64_code_model (struct gcc_options *opts)
12270 {
12271 if (opts->x_flag_pic)
12272 {
12273 switch (opts->x_aarch64_cmodel_var)
12274 {
12275 case AARCH64_CMODEL_TINY:
12276 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12277 break;
12278 case AARCH64_CMODEL_SMALL:
12279 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12280 aarch64_cmodel = (flag_pic == 2
12281 ? AARCH64_CMODEL_SMALL_PIC
12282 : AARCH64_CMODEL_SMALL_SPIC);
12283 #else
12284 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12285 #endif
12286 break;
12287 case AARCH64_CMODEL_LARGE:
12288 sorry ("code model %qs with %<-f%s%>", "large",
12289 opts->x_flag_pic > 1 ? "PIC" : "pic");
12290 break;
12291 default:
12292 gcc_unreachable ();
12293 }
12294 }
12295 else
12296 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12297 }
12298
12299 /* Implement TARGET_OPTION_SAVE. */
12300
12301 static void
12302 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12303 {
12304 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12305 ptr->x_aarch64_branch_protection_string
12306 = opts->x_aarch64_branch_protection_string;
12307 }
12308
12309 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12310 using the information saved in PTR. */
12311
12312 static void
12313 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12314 {
12315 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12316 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12317 opts->x_explicit_arch = ptr->x_explicit_arch;
12318 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12319 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12320 opts->x_aarch64_branch_protection_string
12321 = ptr->x_aarch64_branch_protection_string;
12322 if (opts->x_aarch64_branch_protection_string)
12323 {
12324 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12325 NULL);
12326 }
12327
12328 aarch64_override_options_internal (opts);
12329 }
12330
12331 /* Implement TARGET_OPTION_PRINT. */
12332
12333 static void
12334 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12335 {
12336 const struct processor *cpu
12337 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12338 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12339 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12340 std::string extension
12341 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12342
12343 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12344 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12345 arch->name, extension.c_str ());
12346 }
12347
12348 static GTY(()) tree aarch64_previous_fndecl;
12349
12350 void
12351 aarch64_reset_previous_fndecl (void)
12352 {
12353 aarch64_previous_fndecl = NULL;
12354 }
12355
12356 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12357 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12358 make sure optab availability predicates are recomputed when necessary. */
12359
12360 void
12361 aarch64_save_restore_target_globals (tree new_tree)
12362 {
12363 if (TREE_TARGET_GLOBALS (new_tree))
12364 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12365 else if (new_tree == target_option_default_node)
12366 restore_target_globals (&default_target_globals);
12367 else
12368 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12369 }
12370
12371 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12372 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12373 of the function, if such exists. This function may be called multiple
12374 times on a single function so use aarch64_previous_fndecl to avoid
12375 setting up identical state. */
12376
12377 static void
12378 aarch64_set_current_function (tree fndecl)
12379 {
12380 if (!fndecl || fndecl == aarch64_previous_fndecl)
12381 return;
12382
12383 tree old_tree = (aarch64_previous_fndecl
12384 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12385 : NULL_TREE);
12386
12387 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12388
12389 /* If current function has no attributes but the previous one did,
12390 use the default node. */
12391 if (!new_tree && old_tree)
12392 new_tree = target_option_default_node;
12393
12394 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12395 the default have been handled by aarch64_save_restore_target_globals from
12396 aarch64_pragma_target_parse. */
12397 if (old_tree == new_tree)
12398 return;
12399
12400 aarch64_previous_fndecl = fndecl;
12401
12402 /* First set the target options. */
12403 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12404
12405 aarch64_save_restore_target_globals (new_tree);
12406 }
12407
12408 /* Enum describing the various ways we can handle attributes.
12409 In many cases we can reuse the generic option handling machinery. */
12410
12411 enum aarch64_attr_opt_type
12412 {
12413 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12414 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12415 aarch64_attr_enum, /* Attribute sets an enum variable. */
12416 aarch64_attr_custom /* Attribute requires a custom handling function. */
12417 };
12418
12419 /* All the information needed to handle a target attribute.
12420 NAME is the name of the attribute.
12421 ATTR_TYPE specifies the type of behavior of the attribute as described
12422 in the definition of enum aarch64_attr_opt_type.
12423 ALLOW_NEG is true if the attribute supports a "no-" form.
12424 HANDLER is the function that takes the attribute string as an argument.
12425 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12426 OPT_NUM is the enum specifying the option that the attribute modifies.
12427 This is needed for attributes that mirror the behavior of a command-line
12428 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12429 aarch64_attr_enum. */
12430
12431 struct aarch64_attribute_info
12432 {
12433 const char *name;
12434 enum aarch64_attr_opt_type attr_type;
12435 bool allow_neg;
12436 bool (*handler) (const char *);
12437 enum opt_code opt_num;
12438 };
12439
12440 /* Handle the ARCH_STR argument to the arch= target attribute. */
12441
12442 static bool
12443 aarch64_handle_attr_arch (const char *str)
12444 {
12445 const struct processor *tmp_arch = NULL;
12446 std::string invalid_extension;
12447 enum aarch64_parse_opt_result parse_res
12448 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12449
12450 if (parse_res == AARCH64_PARSE_OK)
12451 {
12452 gcc_assert (tmp_arch);
12453 selected_arch = tmp_arch;
12454 explicit_arch = selected_arch->arch;
12455 return true;
12456 }
12457
12458 switch (parse_res)
12459 {
12460 case AARCH64_PARSE_MISSING_ARG:
12461 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12462 break;
12463 case AARCH64_PARSE_INVALID_ARG:
12464 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12465 aarch64_print_hint_for_arch (str);
12466 break;
12467 case AARCH64_PARSE_INVALID_FEATURE:
12468 error ("invalid feature modifier %s of value (\"%s\") in "
12469 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12470 aarch64_print_hint_for_extensions (invalid_extension);
12471 break;
12472 default:
12473 gcc_unreachable ();
12474 }
12475
12476 return false;
12477 }
12478
12479 /* Handle the argument CPU_STR to the cpu= target attribute. */
12480
12481 static bool
12482 aarch64_handle_attr_cpu (const char *str)
12483 {
12484 const struct processor *tmp_cpu = NULL;
12485 std::string invalid_extension;
12486 enum aarch64_parse_opt_result parse_res
12487 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12488
12489 if (parse_res == AARCH64_PARSE_OK)
12490 {
12491 gcc_assert (tmp_cpu);
12492 selected_tune = tmp_cpu;
12493 explicit_tune_core = selected_tune->ident;
12494
12495 selected_arch = &all_architectures[tmp_cpu->arch];
12496 explicit_arch = selected_arch->arch;
12497 return true;
12498 }
12499
12500 switch (parse_res)
12501 {
12502 case AARCH64_PARSE_MISSING_ARG:
12503 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12504 break;
12505 case AARCH64_PARSE_INVALID_ARG:
12506 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12507 aarch64_print_hint_for_core (str);
12508 break;
12509 case AARCH64_PARSE_INVALID_FEATURE:
12510 error ("invalid feature modifier %s of value (\"%s\") in "
12511 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12512 aarch64_print_hint_for_extensions (invalid_extension);
12513 break;
12514 default:
12515 gcc_unreachable ();
12516 }
12517
12518 return false;
12519 }
12520
12521 /* Handle the argument STR to the branch-protection= attribute. */
12522
12523 static bool
12524 aarch64_handle_attr_branch_protection (const char* str)
12525 {
12526 char *err_str = (char *) xmalloc (strlen (str) + 1);
12527 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12528 &err_str);
12529 bool success = false;
12530 switch (res)
12531 {
12532 case AARCH64_PARSE_MISSING_ARG:
12533 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12534 " attribute");
12535 break;
12536 case AARCH64_PARSE_INVALID_ARG:
12537 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12538 "=\")%> pragma or attribute", err_str);
12539 break;
12540 case AARCH64_PARSE_OK:
12541 success = true;
12542 /* Fall through. */
12543 case AARCH64_PARSE_INVALID_FEATURE:
12544 break;
12545 default:
12546 gcc_unreachable ();
12547 }
12548 free (err_str);
12549 return success;
12550 }
12551
12552 /* Handle the argument STR to the tune= target attribute. */
12553
12554 static bool
12555 aarch64_handle_attr_tune (const char *str)
12556 {
12557 const struct processor *tmp_tune = NULL;
12558 enum aarch64_parse_opt_result parse_res
12559 = aarch64_parse_tune (str, &tmp_tune);
12560
12561 if (parse_res == AARCH64_PARSE_OK)
12562 {
12563 gcc_assert (tmp_tune);
12564 selected_tune = tmp_tune;
12565 explicit_tune_core = selected_tune->ident;
12566 return true;
12567 }
12568
12569 switch (parse_res)
12570 {
12571 case AARCH64_PARSE_INVALID_ARG:
12572 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12573 aarch64_print_hint_for_core (str);
12574 break;
12575 default:
12576 gcc_unreachable ();
12577 }
12578
12579 return false;
12580 }
12581
12582 /* Parse an architecture extension target attribute string specified in STR.
12583 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12584 if successful. Update aarch64_isa_flags to reflect the ISA features
12585 modified. */
12586
12587 static bool
12588 aarch64_handle_attr_isa_flags (char *str)
12589 {
12590 enum aarch64_parse_opt_result parse_res;
12591 uint64_t isa_flags = aarch64_isa_flags;
12592
12593 /* We allow "+nothing" in the beginning to clear out all architectural
12594 features if the user wants to handpick specific features. */
12595 if (strncmp ("+nothing", str, 8) == 0)
12596 {
12597 isa_flags = 0;
12598 str += 8;
12599 }
12600
12601 std::string invalid_extension;
12602 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12603
12604 if (parse_res == AARCH64_PARSE_OK)
12605 {
12606 aarch64_isa_flags = isa_flags;
12607 return true;
12608 }
12609
12610 switch (parse_res)
12611 {
12612 case AARCH64_PARSE_MISSING_ARG:
12613 error ("missing value in %<target()%> pragma or attribute");
12614 break;
12615
12616 case AARCH64_PARSE_INVALID_FEATURE:
12617 error ("invalid feature modifier %s of value (\"%s\") in "
12618 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12619 break;
12620
12621 default:
12622 gcc_unreachable ();
12623 }
12624
12625 return false;
12626 }
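/* Editorial sketch, not part of the original source: the "+nothing" prefix
   handled above lets the user hand-pick features from a clean slate, e.g.

     #pragma GCC target ("+nothing+simd")

   first clears the accumulated ISA flags and then re-enables only the SIMD
   extension (plus whatever it implies).  The feature name is illustrative.  */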
12627
12628 /* The target attributes that we support. On top of these we also support just
12629 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12630 handled explicitly in aarch64_process_one_target_attr. */
12631
12632 static const struct aarch64_attribute_info aarch64_attributes[] =
12633 {
12634 { "general-regs-only", aarch64_attr_mask, false, NULL,
12635 OPT_mgeneral_regs_only },
12636 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12637 OPT_mfix_cortex_a53_835769 },
12638 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12639 OPT_mfix_cortex_a53_843419 },
12640 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12641 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12642 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12643 OPT_momit_leaf_frame_pointer },
12644 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12645 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12646 OPT_march_ },
12647 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12648 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12649 OPT_mtune_ },
12650 { "branch-protection", aarch64_attr_custom, false,
12651 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12652 { "sign-return-address", aarch64_attr_enum, false, NULL,
12653 OPT_msign_return_address_ },
12654 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12655 };
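/* Editorial note, not part of the original source: reading the table above,
   the "fix-cortex-a53-835769" entry is an aarch64_attr_bool with allow_neg
   set, so both target ("fix-cortex-a53-835769") and
   target ("no-fix-cortex-a53-835769") are accepted and behave like the
   corresponding -mfix-cortex-a53-835769 command-line option; "cmodel", by
   contrast, is an aarch64_attr_enum with allow_neg clear, so it requires an
   argument and has no "no-" form.  */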
12656
12657 /* Parse ARG_STR which contains the definition of one target attribute.
12658 Show appropriate errors if any or return true if the attribute is valid. */
12659
12660 static bool
12661 aarch64_process_one_target_attr (char *arg_str)
12662 {
12663 bool invert = false;
12664
12665 size_t len = strlen (arg_str);
12666
12667 if (len == 0)
12668 {
12669 error ("malformed %<target()%> pragma or attribute");
12670 return false;
12671 }
12672
12673 char *str_to_check = (char *) alloca (len + 1);
12674 strcpy (str_to_check, arg_str);
12675
12676 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12677 It is easier to detect and handle it explicitly here rather than going
12678 through the machinery for the rest of the target attributes in this
12679 function. */
12680 if (*str_to_check == '+')
12681 return aarch64_handle_attr_isa_flags (str_to_check);
12682
12683 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12684 {
12685 invert = true;
12686 str_to_check += 3;
12687 }
12688 char *arg = strchr (str_to_check, '=');
12689
12690 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12691 and point ARG to "foo". */
12692 if (arg)
12693 {
12694 *arg = '\0';
12695 arg++;
12696 }
12697 const struct aarch64_attribute_info *p_attr;
12698 bool found = false;
12699 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12700 {
12701 /* If the names don't match up, or the user has given an argument
12702 to an attribute that doesn't accept one, or didn't give an argument
12703 to an attribute that expects one, fail to match. */
12704 if (strcmp (str_to_check, p_attr->name) != 0)
12705 continue;
12706
12707 found = true;
12708 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12709 || p_attr->attr_type == aarch64_attr_enum;
12710
12711 if (attr_need_arg_p ^ (arg != NULL))
12712 {
12713 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12714 return false;
12715 }
12716
12717 /* If the name matches but the attribute does not allow "no-" versions
12718 then we can't match. */
12719 if (invert && !p_attr->allow_neg)
12720 {
12721 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12722 return false;
12723 }
12724
12725 switch (p_attr->attr_type)
12726 {
12727 /* Has a custom handler registered.
12728 For example, cpu=, arch=, tune=. */
12729 case aarch64_attr_custom:
12730 gcc_assert (p_attr->handler);
12731 if (!p_attr->handler (arg))
12732 return false;
12733 break;
12734
12735 /* Either set or unset a boolean option. */
12736 case aarch64_attr_bool:
12737 {
12738 struct cl_decoded_option decoded;
12739
12740 generate_option (p_attr->opt_num, NULL, !invert,
12741 CL_TARGET, &decoded);
12742 aarch64_handle_option (&global_options, &global_options_set,
12743 &decoded, input_location);
12744 break;
12745 }
12746 /* Set or unset a bit in the target_flags. aarch64_handle_option
12747 should know what mask to apply given the option number. */
12748 case aarch64_attr_mask:
12749 {
12750 struct cl_decoded_option decoded;
12751 /* We only need to specify the option number.
12752 aarch64_handle_option will know which mask to apply. */
12753 decoded.opt_index = p_attr->opt_num;
12754 decoded.value = !invert;
12755 aarch64_handle_option (&global_options, &global_options_set,
12756 &decoded, input_location);
12757 break;
12758 }
12759 /* Use the option setting machinery to set an option to an enum. */
12760 case aarch64_attr_enum:
12761 {
12762 gcc_assert (arg);
12763 bool valid;
12764 int value;
12765 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12766 &value, CL_TARGET);
12767 if (valid)
12768 {
12769 set_option (&global_options, NULL, p_attr->opt_num, value,
12770 NULL, DK_UNSPECIFIED, input_location,
12771 global_dc);
12772 }
12773 else
12774 {
12775 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12776 }
12777 break;
12778 }
12779 default:
12780 gcc_unreachable ();
12781 }
12782 }
12783
12784 /* If we reached here we either have found an attribute and validated
12785 it or didn't match any. If we matched an attribute but its arguments
12786 were malformed we will have returned false already. */
12787 return found;
12788 }
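/* Editorial sketch, not part of the original source: examples of single
   attribute strings the routine above accepts (all names illustrative):

     "cpu=cortex-a57"              -> aarch64_attr_custom, handled by
                                      aarch64_handle_attr_cpu
     "no-omit-leaf-frame-pointer"  -> aarch64_attr_bool, negated form
     "tls-dialect=desc"            -> aarch64_attr_enum, argument required
     "+crc+crypto"                 -> ISA-flag shortcut, dispatched to
                                      aarch64_handle_attr_isa_flags before
                                      the table is consulted.  */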
12789
12790 /* Count how many times the character C appears in
12791 NULL-terminated string STR. */
12792
12793 static unsigned int
12794 num_occurences_in_str (char c, char *str)
12795 {
12796 unsigned int res = 0;
12797 while (*str != '\0')
12798 {
12799 if (*str == c)
12800 res++;
12801
12802 str++;
12803 }
12804
12805 return res;
12806 }
12807
12808 /* Parse the tree in ARGS that contains the target attribute information
12809 and update the global target options space. */
12810
12811 bool
12812 aarch64_process_target_attr (tree args)
12813 {
12814 if (TREE_CODE (args) == TREE_LIST)
12815 {
12816 do
12817 {
12818 tree head = TREE_VALUE (args);
12819 if (head)
12820 {
12821 if (!aarch64_process_target_attr (head))
12822 return false;
12823 }
12824 args = TREE_CHAIN (args);
12825 } while (args);
12826
12827 return true;
12828 }
12829
12830 if (TREE_CODE (args) != STRING_CST)
12831 {
12832 error ("attribute %<target%> argument not a string");
12833 return false;
12834 }
12835
12836 size_t len = strlen (TREE_STRING_POINTER (args));
12837 char *str_to_check = (char *) alloca (len + 1);
12838 strcpy (str_to_check, TREE_STRING_POINTER (args));
12839
12840 if (len == 0)
12841 {
12842 error ("malformed %<target()%> pragma or attribute");
12843 return false;
12844 }
12845
12846   /* Used to catch empty strings between commas, i.e.
12847 attribute ((target ("attr1,,attr2"))). */
12848 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12849
12850 /* Handle multiple target attributes separated by ','. */
12851 char *token = strtok_r (str_to_check, ",", &str_to_check);
12852
12853 unsigned int num_attrs = 0;
12854 while (token)
12855 {
12856 num_attrs++;
12857 if (!aarch64_process_one_target_attr (token))
12858 {
12859 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12860 return false;
12861 }
12862
12863 token = strtok_r (NULL, ",", &str_to_check);
12864 }
12865
12866 if (num_attrs != num_commas + 1)
12867 {
12868 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12869 return false;
12870 }
12871
12872 return true;
12873 }
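/* Editorial note, not part of the original source: the num_commas check
   above is what catches an empty entry such as
   target ("arch=armv8-a,,no-strict-align"): strtok_r silently skips the
   empty token, so without the check the malformed string would parse as if
   the stray comma were not there.  The attribute names are illustrative.  */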
12874
12875 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12876 process attribute ((target ("..."))). */
12877
12878 static bool
12879 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12880 {
12881 struct cl_target_option cur_target;
12882 bool ret;
12883 tree old_optimize;
12884 tree new_target, new_optimize;
12885 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12886
12887 /* If what we're processing is the current pragma string then the
12888 target option node is already stored in target_option_current_node
12889 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12890 having to re-parse the string. This is especially useful to keep
12891 arm_neon.h compile times down since that header contains a lot
12892 of intrinsics enclosed in pragmas. */
12893 if (!existing_target && args == current_target_pragma)
12894 {
12895 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12896 return true;
12897 }
12898 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12899
12900 old_optimize = build_optimization_node (&global_options);
12901 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12902
12903 /* If the function changed the optimization levels as well as setting
12904 target options, start with the optimizations specified. */
12905 if (func_optimize && func_optimize != old_optimize)
12906 cl_optimization_restore (&global_options,
12907 TREE_OPTIMIZATION (func_optimize));
12908
12909 /* Save the current target options to restore at the end. */
12910 cl_target_option_save (&cur_target, &global_options);
12911
12912 /* If fndecl already has some target attributes applied to it, unpack
12913 them so that we add this attribute on top of them, rather than
12914 overwriting them. */
12915 if (existing_target)
12916 {
12917 struct cl_target_option *existing_options
12918 = TREE_TARGET_OPTION (existing_target);
12919
12920 if (existing_options)
12921 cl_target_option_restore (&global_options, existing_options);
12922 }
12923 else
12924 cl_target_option_restore (&global_options,
12925 TREE_TARGET_OPTION (target_option_current_node));
12926
12927 ret = aarch64_process_target_attr (args);
12928
12929 /* Set up any additional state. */
12930 if (ret)
12931 {
12932 aarch64_override_options_internal (&global_options);
12933 /* Initialize SIMD builtins if we haven't already.
12934 Set current_target_pragma to NULL for the duration so that
12935 the builtin initialization code doesn't try to tag the functions
12936 being built with the attributes specified by any current pragma, thus
12937 going into an infinite recursion. */
12938 if (TARGET_SIMD)
12939 {
12940 tree saved_current_target_pragma = current_target_pragma;
12941 current_target_pragma = NULL;
12942 aarch64_init_simd_builtins ();
12943 current_target_pragma = saved_current_target_pragma;
12944 }
12945 new_target = build_target_option_node (&global_options);
12946 }
12947 else
12948 new_target = NULL;
12949
12950 new_optimize = build_optimization_node (&global_options);
12951
12952 if (fndecl && ret)
12953 {
12954 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12955
12956 if (old_optimize != new_optimize)
12957 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12958 }
12959
12960 cl_target_option_restore (&global_options, &cur_target);
12961
12962 if (old_optimize != new_optimize)
12963 cl_optimization_restore (&global_options,
12964 TREE_OPTIMIZATION (old_optimize));
12965 return ret;
12966 }
12967
12968 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
12969 the values of a tri-bool option (yes, no, don't care) and the default
12970 value is DEF, determine whether to reject inlining.  */
12971
12972 static bool
12973 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12974 int dont_care, int def)
12975 {
12976 /* If the callee doesn't care, always allow inlining. */
12977 if (callee == dont_care)
12978 return true;
12979
12980 /* If the caller doesn't care, always allow inlining. */
12981 if (caller == dont_care)
12982 return true;
12983
12984 /* Otherwise, allow inlining if either the callee and caller values
12985 agree, or if the callee is using the default value. */
12986 return (callee == caller || callee == def);
12987 }
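/* Editorial sketch, not part of the original source: for the
   -momit-leaf-frame-pointer check below, DONT_CARE is 2 and DEF is 1, so a
   callee with no explicit setting (2) never blocks inlining, a callee using
   the default (1) is always acceptable, and only an explicit mismatch
   between caller and callee rejects the inline.  */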
12988
12989 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12990 to inline CALLEE into CALLER based on target-specific info.
12991 Make sure that the caller and callee have compatible architectural
12992 features. Then go through the other possible target attributes
12993 and see if they can block inlining. Try not to reject always_inline
12994 callees unless they are incompatible architecturally. */
12995
12996 static bool
12997 aarch64_can_inline_p (tree caller, tree callee)
12998 {
12999 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13000 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13001
13002 struct cl_target_option *caller_opts
13003 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13004 : target_option_default_node);
13005
13006 struct cl_target_option *callee_opts
13007 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13008 : target_option_default_node);
13009
13010 /* Callee's ISA flags should be a subset of the caller's. */
13011 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13012 != callee_opts->x_aarch64_isa_flags)
13013 return false;
13014
13015   /* Allow non-strict-align functions to be inlined into strict-align
13016      ones.  */
13017 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13018 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13019 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13020 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13021 return false;
13022
13023 bool always_inline = lookup_attribute ("always_inline",
13024 DECL_ATTRIBUTES (callee));
13025
13026 /* If the architectural features match up and the callee is always_inline
13027 then the other attributes don't matter. */
13028 if (always_inline)
13029 return true;
13030
13031 if (caller_opts->x_aarch64_cmodel_var
13032 != callee_opts->x_aarch64_cmodel_var)
13033 return false;
13034
13035 if (caller_opts->x_aarch64_tls_dialect
13036 != callee_opts->x_aarch64_tls_dialect)
13037 return false;
13038
13039   /* Honour explicit requests to work around errata.  */
13040 if (!aarch64_tribools_ok_for_inlining_p (
13041 caller_opts->x_aarch64_fix_a53_err835769,
13042 callee_opts->x_aarch64_fix_a53_err835769,
13043 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13044 return false;
13045
13046 if (!aarch64_tribools_ok_for_inlining_p (
13047 caller_opts->x_aarch64_fix_a53_err843419,
13048 callee_opts->x_aarch64_fix_a53_err843419,
13049 2, TARGET_FIX_ERR_A53_843419))
13050 return false;
13051
13052 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13053      caller and callee and they don't match up, reject inlining.  */
13054 if (!aarch64_tribools_ok_for_inlining_p (
13055 caller_opts->x_flag_omit_leaf_frame_pointer,
13056 callee_opts->x_flag_omit_leaf_frame_pointer,
13057 2, 1))
13058 return false;
13059
13060 /* If the callee has specific tuning overrides, respect them. */
13061 if (callee_opts->x_aarch64_override_tune_string != NULL
13062 && caller_opts->x_aarch64_override_tune_string == NULL)
13063 return false;
13064
13065 /* If the user specified tuning override strings for the
13066 caller and callee and they don't match up, reject inlining.
13067 We just do a string compare here, we don't analyze the meaning
13068 of the string, as it would be too costly for little gain. */
13069 if (callee_opts->x_aarch64_override_tune_string
13070 && caller_opts->x_aarch64_override_tune_string
13071 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13072 caller_opts->x_aarch64_override_tune_string) != 0))
13073 return false;
13074
13075 return true;
13076 }
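/* Editorial note, not part of the original source: the ISA-subset test at
   the top of aarch64_can_inline_p means, for example, that a callee built
   with target ("+crc") can be inlined into a caller built with
   target ("+crc+crypto"), but not vice versa, since the caller must provide
   every feature the callee was compiled to assume.  Feature names are
   illustrative.  */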
13077
13078 /* Return true if SYMBOL_REF X binds locally. */
13079
13080 static bool
13081 aarch64_symbol_binds_local_p (const_rtx x)
13082 {
13083 return (SYMBOL_REF_DECL (x)
13084 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13085 : SYMBOL_REF_LOCAL_P (x));
13086 }
13087
13088 /* Return true if SYMBOL_REF X is thread-local.  */
13089 static bool
13090 aarch64_tls_symbol_p (rtx x)
13091 {
13092 if (! TARGET_HAVE_TLS)
13093 return false;
13094
13095 if (GET_CODE (x) != SYMBOL_REF)
13096 return false;
13097
13098 return SYMBOL_REF_TLS_MODEL (x) != 0;
13099 }
13100
13101 /* Classify a TLS symbol into one of the TLS kinds. */
13102 enum aarch64_symbol_type
13103 aarch64_classify_tls_symbol (rtx x)
13104 {
13105 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13106
13107 switch (tls_kind)
13108 {
13109 case TLS_MODEL_GLOBAL_DYNAMIC:
13110 case TLS_MODEL_LOCAL_DYNAMIC:
13111 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13112
13113 case TLS_MODEL_INITIAL_EXEC:
13114 switch (aarch64_cmodel)
13115 {
13116 case AARCH64_CMODEL_TINY:
13117 case AARCH64_CMODEL_TINY_PIC:
13118 return SYMBOL_TINY_TLSIE;
13119 default:
13120 return SYMBOL_SMALL_TLSIE;
13121 }
13122
13123 case TLS_MODEL_LOCAL_EXEC:
13124 if (aarch64_tls_size == 12)
13125 return SYMBOL_TLSLE12;
13126 else if (aarch64_tls_size == 24)
13127 return SYMBOL_TLSLE24;
13128 else if (aarch64_tls_size == 32)
13129 return SYMBOL_TLSLE32;
13130 else if (aarch64_tls_size == 48)
13131 return SYMBOL_TLSLE48;
13132 else
13133 gcc_unreachable ();
13134
13135 case TLS_MODEL_EMULATED:
13136 case TLS_MODEL_NONE:
13137 return SYMBOL_FORCE_TO_MEM;
13138
13139 default:
13140 gcc_unreachable ();
13141 }
13142 }
13143
13144 /* Return the correct method for accessing X + OFFSET, where X is either
13145 a SYMBOL_REF or LABEL_REF. */
13146
13147 enum aarch64_symbol_type
13148 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13149 {
13150 if (GET_CODE (x) == LABEL_REF)
13151 {
13152 switch (aarch64_cmodel)
13153 {
13154 case AARCH64_CMODEL_LARGE:
13155 return SYMBOL_FORCE_TO_MEM;
13156
13157 case AARCH64_CMODEL_TINY_PIC:
13158 case AARCH64_CMODEL_TINY:
13159 return SYMBOL_TINY_ABSOLUTE;
13160
13161 case AARCH64_CMODEL_SMALL_SPIC:
13162 case AARCH64_CMODEL_SMALL_PIC:
13163 case AARCH64_CMODEL_SMALL:
13164 return SYMBOL_SMALL_ABSOLUTE;
13165
13166 default:
13167 gcc_unreachable ();
13168 }
13169 }
13170
13171 if (GET_CODE (x) == SYMBOL_REF)
13172 {
13173 if (aarch64_tls_symbol_p (x))
13174 return aarch64_classify_tls_symbol (x);
13175
13176 switch (aarch64_cmodel)
13177 {
13178 case AARCH64_CMODEL_TINY:
13179 /* When we retrieve symbol + offset address, we have to make sure
13180 the offset does not cause overflow of the final address. But
13181 we have no way of knowing the address of symbol at compile time
13182 so we can't accurately say if the distance between the PC and
13183 	     symbol + offset is outside the addressable range of +/-1M in the
13184 TINY code model. So we rely on images not being greater than
13185 1M and cap the offset at 1M and anything beyond 1M will have to
13186 be loaded using an alternative mechanism. Furthermore if the
13187 symbol is a weak reference to something that isn't known to
13188 resolve to a symbol in this module, then force to memory. */
13189 if ((SYMBOL_REF_WEAK (x)
13190 && !aarch64_symbol_binds_local_p (x))
13191 || !IN_RANGE (offset, -1048575, 1048575))
13192 return SYMBOL_FORCE_TO_MEM;
13193 return SYMBOL_TINY_ABSOLUTE;
13194
13195 case AARCH64_CMODEL_SMALL:
13196 /* Same reasoning as the tiny code model, but the offset cap here is
13197 4G. */
13198 if ((SYMBOL_REF_WEAK (x)
13199 && !aarch64_symbol_binds_local_p (x))
13200 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13201 HOST_WIDE_INT_C (4294967264)))
13202 return SYMBOL_FORCE_TO_MEM;
13203 return SYMBOL_SMALL_ABSOLUTE;
13204
13205 case AARCH64_CMODEL_TINY_PIC:
13206 if (!aarch64_symbol_binds_local_p (x))
13207 return SYMBOL_TINY_GOT;
13208 return SYMBOL_TINY_ABSOLUTE;
13209
13210 case AARCH64_CMODEL_SMALL_SPIC:
13211 case AARCH64_CMODEL_SMALL_PIC:
13212 if (!aarch64_symbol_binds_local_p (x))
13213 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13214 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13215 return SYMBOL_SMALL_ABSOLUTE;
13216
13217 case AARCH64_CMODEL_LARGE:
13218 /* This is alright even in PIC code as the constant
13219 pool reference is always PC relative and within
13220 the same translation unit. */
13221 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13222 return SYMBOL_SMALL_ABSOLUTE;
13223 else
13224 return SYMBOL_FORCE_TO_MEM;
13225
13226 default:
13227 gcc_unreachable ();
13228 }
13229 }
13230
13231 /* By default push everything into the constant pool. */
13232 return SYMBOL_FORCE_TO_MEM;
13233 }
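/* Editorial note, not part of the original source: the offset caps above
   roughly mirror the reach of the addressing sequences used by each code
   model: +/-1M for the ADR-based tiny model and about 4G for the
   ADRP+ADD-based small model.  Symbols whose offsets fall outside those
   windows, and weak symbols that may resolve outside the module, are
   accessed through the constant pool or GOT instead.  */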
13234
13235 bool
13236 aarch64_constant_address_p (rtx x)
13237 {
13238 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13239 }
13240
13241 bool
13242 aarch64_legitimate_pic_operand_p (rtx x)
13243 {
13244 if (GET_CODE (x) == SYMBOL_REF
13245 || (GET_CODE (x) == CONST
13246 && GET_CODE (XEXP (x, 0)) == PLUS
13247 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13248 return false;
13249
13250 return true;
13251 }
13252
13253 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13254 that should be rematerialized rather than spilled. */
13255
13256 static bool
13257 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13258 {
13259 /* Support CSE and rematerialization of common constants. */
13260 if (CONST_INT_P (x)
13261 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13262 || GET_CODE (x) == CONST_VECTOR)
13263 return true;
13264
13265 /* Do not allow vector struct mode constants for Advanced SIMD.
13266 We could support 0 and -1 easily, but they need support in
13267 aarch64-simd.md. */
13268 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13269 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13270 return false;
13271
13272 /* Only accept variable-length vector constants if they can be
13273 handled directly.
13274
13275 ??? It would be possible to handle rematerialization of other
13276 constants via secondary reloads. */
13277 if (vec_flags & VEC_ANY_SVE)
13278 return aarch64_simd_valid_immediate (x, NULL);
13279
13280 if (GET_CODE (x) == HIGH)
13281 x = XEXP (x, 0);
13282
13283 /* Accept polynomial constants that can be calculated by using the
13284 destination of a move as the sole temporary. Constants that
13285 require a second temporary cannot be rematerialized (they can't be
13286 forced to memory and also aren't legitimate constants). */
13287 poly_int64 offset;
13288 if (poly_int_rtx_p (x, &offset))
13289 return aarch64_offset_temporaries (false, offset) <= 1;
13290
13291 /* If an offset is being added to something else, we need to allow the
13292 base to be moved into the destination register, meaning that there
13293 are no free temporaries for the offset. */
13294 x = strip_offset (x, &offset);
13295 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13296 return false;
13297
13298 /* Do not allow const (plus (anchor_symbol, const_int)). */
13299 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13300 return false;
13301
13302 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13303 so spilling them is better than rematerialization. */
13304 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13305 return true;
13306
13307 /* Label references are always constant. */
13308 if (GET_CODE (x) == LABEL_REF)
13309 return true;
13310
13311 return false;
13312 }
13313
13314 rtx
13315 aarch64_load_tp (rtx target)
13316 {
13317 if (!target
13318 || GET_MODE (target) != Pmode
13319 || !register_operand (target, Pmode))
13320 target = gen_reg_rtx (Pmode);
13321
13322 /* Can return in any reg. */
13323 emit_insn (gen_aarch64_load_tp_hard (target));
13324 return target;
13325 }
13326
13327 /* On AAPCS systems, this is the "struct __va_list". */
13328 static GTY(()) tree va_list_type;
13329
13330 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13331 Return the type to use as __builtin_va_list.
13332
13333 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13334
13335 struct __va_list
13336 {
13337 void *__stack;
13338 void *__gr_top;
13339 void *__vr_top;
13340 int __gr_offs;
13341 int __vr_offs;
13342 }; */
13343
13344 static tree
13345 aarch64_build_builtin_va_list (void)
13346 {
13347 tree va_list_name;
13348 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13349
13350 /* Create the type. */
13351 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13352 /* Give it the required name. */
13353 va_list_name = build_decl (BUILTINS_LOCATION,
13354 TYPE_DECL,
13355 get_identifier ("__va_list"),
13356 va_list_type);
13357 DECL_ARTIFICIAL (va_list_name) = 1;
13358 TYPE_NAME (va_list_type) = va_list_name;
13359 TYPE_STUB_DECL (va_list_type) = va_list_name;
13360
13361 /* Create the fields. */
13362 f_stack = build_decl (BUILTINS_LOCATION,
13363 FIELD_DECL, get_identifier ("__stack"),
13364 ptr_type_node);
13365 f_grtop = build_decl (BUILTINS_LOCATION,
13366 FIELD_DECL, get_identifier ("__gr_top"),
13367 ptr_type_node);
13368 f_vrtop = build_decl (BUILTINS_LOCATION,
13369 FIELD_DECL, get_identifier ("__vr_top"),
13370 ptr_type_node);
13371 f_groff = build_decl (BUILTINS_LOCATION,
13372 FIELD_DECL, get_identifier ("__gr_offs"),
13373 integer_type_node);
13374 f_vroff = build_decl (BUILTINS_LOCATION,
13375 FIELD_DECL, get_identifier ("__vr_offs"),
13376 integer_type_node);
13377
13378 /* Tell tree-stdarg pass about our internal offset fields.
13379      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13380      purposes, to identify whether the code is updating the va_list internal
13381      offset fields in an irregular way.  */
13382 va_list_gpr_counter_field = f_groff;
13383 va_list_fpr_counter_field = f_vroff;
13384
13385 DECL_ARTIFICIAL (f_stack) = 1;
13386 DECL_ARTIFICIAL (f_grtop) = 1;
13387 DECL_ARTIFICIAL (f_vrtop) = 1;
13388 DECL_ARTIFICIAL (f_groff) = 1;
13389 DECL_ARTIFICIAL (f_vroff) = 1;
13390
13391 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13392 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13393 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13394 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13395 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13396
13397 TYPE_FIELDS (va_list_type) = f_stack;
13398 DECL_CHAIN (f_stack) = f_grtop;
13399 DECL_CHAIN (f_grtop) = f_vrtop;
13400 DECL_CHAIN (f_vrtop) = f_groff;
13401 DECL_CHAIN (f_groff) = f_vroff;
13402
13403 /* Compute its layout. */
13404 layout_type (va_list_type);
13405
13406 return va_list_type;
13407 }
13408
13409 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13410 static void
13411 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13412 {
13413 const CUMULATIVE_ARGS *cum;
13414 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13415 tree stack, grtop, vrtop, groff, vroff;
13416 tree t;
13417 int gr_save_area_size = cfun->va_list_gpr_size;
13418 int vr_save_area_size = cfun->va_list_fpr_size;
13419 int vr_offset;
13420
13421 cum = &crtl->args.info;
13422 if (cfun->va_list_gpr_size)
13423 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13424 cfun->va_list_gpr_size);
13425 if (cfun->va_list_fpr_size)
13426 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13427 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13428
13429 if (!TARGET_FLOAT)
13430 {
13431 gcc_assert (cum->aapcs_nvrn == 0);
13432 vr_save_area_size = 0;
13433 }
13434
13435 f_stack = TYPE_FIELDS (va_list_type_node);
13436 f_grtop = DECL_CHAIN (f_stack);
13437 f_vrtop = DECL_CHAIN (f_grtop);
13438 f_groff = DECL_CHAIN (f_vrtop);
13439 f_vroff = DECL_CHAIN (f_groff);
13440
13441 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13442 NULL_TREE);
13443 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13444 NULL_TREE);
13445 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13446 NULL_TREE);
13447 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13448 NULL_TREE);
13449 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13450 NULL_TREE);
13451
13452 /* Emit code to initialize STACK, which points to the next varargs stack
13453 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13454 by named arguments. STACK is 8-byte aligned. */
13455 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13456 if (cum->aapcs_stack_size > 0)
13457 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13458 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13459 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13460
13461 /* Emit code to initialize GRTOP, the top of the GR save area.
13462 virtual_incoming_args_rtx should have been 16 byte aligned. */
13463 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13464 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13465 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13466
13467 /* Emit code to initialize VRTOP, the top of the VR save area.
13468 This address is gr_save_area_bytes below GRTOP, rounded
13469 down to the next 16-byte boundary. */
13470 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13471 vr_offset = ROUND_UP (gr_save_area_size,
13472 STACK_BOUNDARY / BITS_PER_UNIT);
13473
13474 if (vr_offset)
13475 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13476 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13477 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13478
13479 /* Emit code to initialize GROFF, the offset from GRTOP of the
13480 next GPR argument. */
13481 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13482 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13483 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13484
13485   /* Likewise emit code to initialize VROFF, the offset from VRTOP
13486 of the next VR argument. */
13487 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13488 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13489 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13490 }
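/* Editorial sketch, not part of the original source: the va_start expansion
   above initializes the __va_list record roughly as follows (C-like
   pseudocode, names illustrative):

     ap.__stack   = incoming_args + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */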
13491
13492 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13493
13494 static tree
13495 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13496 gimple_seq *post_p ATTRIBUTE_UNUSED)
13497 {
13498 tree addr;
13499 bool indirect_p;
13500 bool is_ha; /* is HFA or HVA. */
13501 bool dw_align; /* double-word align. */
13502 machine_mode ag_mode = VOIDmode;
13503 int nregs;
13504 machine_mode mode;
13505
13506 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13507 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13508 HOST_WIDE_INT size, rsize, adjust, align;
13509 tree t, u, cond1, cond2;
13510
13511 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13512 if (indirect_p)
13513 type = build_pointer_type (type);
13514
13515 mode = TYPE_MODE (type);
13516
13517 f_stack = TYPE_FIELDS (va_list_type_node);
13518 f_grtop = DECL_CHAIN (f_stack);
13519 f_vrtop = DECL_CHAIN (f_grtop);
13520 f_groff = DECL_CHAIN (f_vrtop);
13521 f_vroff = DECL_CHAIN (f_groff);
13522
13523 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13524 f_stack, NULL_TREE);
13525 size = int_size_in_bytes (type);
13526
13527 bool abi_break;
13528 align
13529 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13530
13531 dw_align = false;
13532 adjust = 0;
13533 if (aarch64_vfp_is_call_or_return_candidate (mode,
13534 type,
13535 &ag_mode,
13536 &nregs,
13537 &is_ha))
13538 {
13539 /* No frontends can create types with variable-sized modes, so we
13540 shouldn't be asked to pass or return them. */
13541 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13542
13543 /* TYPE passed in fp/simd registers. */
13544 if (!TARGET_FLOAT)
13545 aarch64_err_no_fpadvsimd (mode);
13546
13547 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13548 unshare_expr (valist), f_vrtop, NULL_TREE);
13549 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13550 unshare_expr (valist), f_vroff, NULL_TREE);
13551
13552 rsize = nregs * UNITS_PER_VREG;
13553
13554 if (is_ha)
13555 {
13556 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13557 adjust = UNITS_PER_VREG - ag_size;
13558 }
13559 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13560 && size < UNITS_PER_VREG)
13561 {
13562 adjust = UNITS_PER_VREG - size;
13563 }
13564 }
13565 else
13566 {
13567 /* TYPE passed in general registers. */
13568 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13569 unshare_expr (valist), f_grtop, NULL_TREE);
13570 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13571 unshare_expr (valist), f_groff, NULL_TREE);
13572 rsize = ROUND_UP (size, UNITS_PER_WORD);
13573 nregs = rsize / UNITS_PER_WORD;
13574
13575 if (align > 8)
13576 {
13577 if (abi_break && warn_psabi)
13578 inform (input_location, "parameter passing for argument of type "
13579 "%qT changed in GCC 9.1", type);
13580 dw_align = true;
13581 }
13582
13583 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13584 && size < UNITS_PER_WORD)
13585 {
13586 adjust = UNITS_PER_WORD - size;
13587 }
13588 }
13589
13590 /* Get a local temporary for the field value. */
13591 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13592
13593 /* Emit code to branch if off >= 0. */
13594 t = build2 (GE_EXPR, boolean_type_node, off,
13595 build_int_cst (TREE_TYPE (off), 0));
13596 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13597
13598 if (dw_align)
13599 {
13600 /* Emit: offs = (offs + 15) & -16. */
13601 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13602 build_int_cst (TREE_TYPE (off), 15));
13603 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13604 build_int_cst (TREE_TYPE (off), -16));
13605 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13606 }
13607 else
13608 roundup = NULL;
13609
13610 /* Update ap.__[g|v]r_offs */
13611 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13612 build_int_cst (TREE_TYPE (off), rsize));
13613 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13614
13615 /* String up. */
13616 if (roundup)
13617 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13618
13619 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13620 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13621 build_int_cst (TREE_TYPE (f_off), 0));
13622 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13623
13624 /* String up: make sure the assignment happens before the use. */
13625 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13626 COND_EXPR_ELSE (cond1) = t;
13627
13628 /* Prepare the trees handling the argument that is passed on the stack;
13629      the top-level node will be stored in ON_STACK.  */
13630 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13631 if (align > 8)
13632 {
13633 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13634 t = fold_build_pointer_plus_hwi (arg, 15);
13635 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13636 build_int_cst (TREE_TYPE (t), -16));
13637 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13638 }
13639 else
13640 roundup = NULL;
13641 /* Advance ap.__stack */
13642 t = fold_build_pointer_plus_hwi (arg, size + 7);
13643 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13644 build_int_cst (TREE_TYPE (t), -8));
13645 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13646 /* String up roundup and advance. */
13647 if (roundup)
13648 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13649 /* String up with arg */
13650 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13651 /* Big-endianness related address adjustment. */
13652 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13653 && size < UNITS_PER_WORD)
13654 {
13655 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13656 size_int (UNITS_PER_WORD - size));
13657 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13658 }
13659
13660 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13661 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13662
13663 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13664 t = off;
13665 if (adjust)
13666 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13667 build_int_cst (TREE_TYPE (off), adjust));
13668
13669 t = fold_convert (sizetype, t);
13670 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13671
13672 if (is_ha)
13673 {
13674 /* type ha; // treat as "struct {ftype field[n];}"
13675 ... [computing offs]
13676 	 for (i = 0; i < nregs; ++i, offs += 16)
13677 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13678 return ha; */
13679 int i;
13680 tree tmp_ha, field_t, field_ptr_t;
13681
13682 /* Declare a local variable. */
13683 tmp_ha = create_tmp_var_raw (type, "ha");
13684 gimple_add_tmp_var (tmp_ha);
13685
13686 /* Establish the base type. */
13687 switch (ag_mode)
13688 {
13689 case E_SFmode:
13690 field_t = float_type_node;
13691 field_ptr_t = float_ptr_type_node;
13692 break;
13693 case E_DFmode:
13694 field_t = double_type_node;
13695 field_ptr_t = double_ptr_type_node;
13696 break;
13697 case E_TFmode:
13698 field_t = long_double_type_node;
13699 field_ptr_t = long_double_ptr_type_node;
13700 break;
13701 case E_HFmode:
13702 field_t = aarch64_fp16_type_node;
13703 field_ptr_t = aarch64_fp16_ptr_type_node;
13704 break;
13705 case E_V2SImode:
13706 case E_V4SImode:
13707 {
13708 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13709 field_t = build_vector_type_for_mode (innertype, ag_mode);
13710 field_ptr_t = build_pointer_type (field_t);
13711 }
13712 break;
13713 default:
13714 gcc_assert (0);
13715 }
13716
13717       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
13718 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13719 addr = t;
13720 t = fold_convert (field_ptr_t, addr);
13721 t = build2 (MODIFY_EXPR, field_t,
13722 build1 (INDIRECT_REF, field_t, tmp_ha),
13723 build1 (INDIRECT_REF, field_t, t));
13724
13725 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13726 for (i = 1; i < nregs; ++i)
13727 {
13728 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13729 u = fold_convert (field_ptr_t, addr);
13730 u = build2 (MODIFY_EXPR, field_t,
13731 build2 (MEM_REF, field_t, tmp_ha,
13732 build_int_cst (field_ptr_t,
13733 (i *
13734 int_size_in_bytes (field_t)))),
13735 build1 (INDIRECT_REF, field_t, u));
13736 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13737 }
13738
13739 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13740 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13741 }
13742
13743 COND_EXPR_ELSE (cond2) = t;
13744 addr = fold_convert (build_pointer_type (type), cond1);
13745 addr = build_va_arg_indirect_ref (addr);
13746
13747 if (indirect_p)
13748 addr = build_va_arg_indirect_ref (addr);
13749
13750 return addr;
13751 }
13752
13753 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13754
13755 static void
13756 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13757 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13758 int no_rtl)
13759 {
13760 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13761 CUMULATIVE_ARGS local_cum;
13762 int gr_saved = cfun->va_list_gpr_size;
13763 int vr_saved = cfun->va_list_fpr_size;
13764
13765 /* The caller has advanced CUM up to, but not beyond, the last named
13766 argument. Advance a local copy of CUM past the last "real" named
13767 argument, to find out how many registers are left over. */
13768 local_cum = *cum;
13769   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13770
13771   /* Find out how many registers we need to save.
13772      Honor the tree-stdarg analysis results.  */
13773 if (cfun->va_list_gpr_size)
13774 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13775 cfun->va_list_gpr_size / UNITS_PER_WORD);
13776 if (cfun->va_list_fpr_size)
13777 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13778 cfun->va_list_fpr_size / UNITS_PER_VREG);
13779
13780 if (!TARGET_FLOAT)
13781 {
13782 gcc_assert (local_cum.aapcs_nvrn == 0);
13783 vr_saved = 0;
13784 }
13785
13786 if (!no_rtl)
13787 {
13788 if (gr_saved > 0)
13789 {
13790 rtx ptr, mem;
13791
13792 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13793 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13794 - gr_saved * UNITS_PER_WORD);
13795 mem = gen_frame_mem (BLKmode, ptr);
13796 set_mem_alias_set (mem, get_varargs_alias_set ());
13797
13798 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13799 mem, gr_saved);
13800 }
13801 if (vr_saved > 0)
13802 {
13803 /* We can't use move_block_from_reg, because it will use
13804 the wrong mode, storing D regs only. */
13805 machine_mode mode = TImode;
13806 int off, i, vr_start;
13807
13808 /* Set OFF to the offset from virtual_incoming_args_rtx of
13809 the first vector register. The VR save area lies below
13810 the GR one, and is aligned to 16 bytes. */
13811 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13812 STACK_BOUNDARY / BITS_PER_UNIT);
13813 off -= vr_saved * UNITS_PER_VREG;
13814
13815 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13816 for (i = 0; i < vr_saved; ++i)
13817 {
13818 rtx ptr, mem;
13819
13820 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13821 mem = gen_frame_mem (mode, ptr);
13822 set_mem_alias_set (mem, get_varargs_alias_set ());
13823 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13824 off += UNITS_PER_VREG;
13825 }
13826 }
13827 }
13828
13829 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13830 any complication of having crtl->args.pretend_args_size changed. */
13831 cfun->machine->frame.saved_varargs_size
13832 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13833 STACK_BOUNDARY / BITS_PER_UNIT)
13834 + vr_saved * UNITS_PER_VREG);
13835 }
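/* Editorial note, not part of the original source: the layout produced
   above places the GR save area immediately below
   virtual_incoming_args_rtx and the VR save area below that, 16-byte
   aligned, which matches the __gr_top / __vr_top values set up by
   aarch64_expand_builtin_va_start.  */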
13836
13837 static void
13838 aarch64_conditional_register_usage (void)
13839 {
13840 int i;
13841 if (!TARGET_FLOAT)
13842 {
13843 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13844 {
13845 fixed_regs[i] = 1;
13846 call_used_regs[i] = 1;
13847 }
13848 }
13849 if (!TARGET_SVE)
13850 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13851 {
13852 fixed_regs[i] = 1;
13853 call_used_regs[i] = 1;
13854 }
13855
13856 /* When tracking speculation, we need a couple of call-clobbered registers
13857 to track the speculation state. It would be nice to just use
13858 IP0 and IP1, but currently there are numerous places that just
13859 assume these registers are free for other uses (eg pointer
13860 authentication). */
13861 if (aarch64_track_speculation)
13862 {
13863 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13864 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13865 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13866 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13867 }
13868 }
13869
13870 /* Walk down the type tree of TYPE counting consecutive base elements.
13871 If *MODEP is VOIDmode, then set it to the first valid floating point
13872 type. If a non-floating point type is found, or if a floating point
13873 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13874 otherwise return the count in the sub-tree. */
13875 static int
13876 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13877 {
13878 machine_mode mode;
13879 HOST_WIDE_INT size;
13880
13881 switch (TREE_CODE (type))
13882 {
13883 case REAL_TYPE:
13884 mode = TYPE_MODE (type);
13885 if (mode != DFmode && mode != SFmode
13886 && mode != TFmode && mode != HFmode)
13887 return -1;
13888
13889 if (*modep == VOIDmode)
13890 *modep = mode;
13891
13892 if (*modep == mode)
13893 return 1;
13894
13895 break;
13896
13897 case COMPLEX_TYPE:
13898 mode = TYPE_MODE (TREE_TYPE (type));
13899 if (mode != DFmode && mode != SFmode
13900 && mode != TFmode && mode != HFmode)
13901 return -1;
13902
13903 if (*modep == VOIDmode)
13904 *modep = mode;
13905
13906 if (*modep == mode)
13907 return 2;
13908
13909 break;
13910
13911 case VECTOR_TYPE:
13912 /* Use V2SImode and V4SImode as representatives of all 64-bit
13913 and 128-bit vector types. */
13914 size = int_size_in_bytes (type);
13915 switch (size)
13916 {
13917 case 8:
13918 mode = V2SImode;
13919 break;
13920 case 16:
13921 mode = V4SImode;
13922 break;
13923 default:
13924 return -1;
13925 }
13926
13927 if (*modep == VOIDmode)
13928 *modep = mode;
13929
13930 /* Vector modes are considered to be opaque: two vectors are
13931 equivalent for the purposes of being homogeneous aggregates
13932 if they are the same size. */
13933 if (*modep == mode)
13934 return 1;
13935
13936 break;
13937
13938 case ARRAY_TYPE:
13939 {
13940 int count;
13941 tree index = TYPE_DOMAIN (type);
13942
13943 /* Can't handle incomplete types nor sizes that are not
13944 fixed. */
13945 if (!COMPLETE_TYPE_P (type)
13946 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13947 return -1;
13948
13949 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13950 if (count == -1
13951 || !index
13952 || !TYPE_MAX_VALUE (index)
13953 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13954 || !TYPE_MIN_VALUE (index)
13955 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13956 || count < 0)
13957 return -1;
13958
13959 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13960 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13961
13962 /* There must be no padding. */
13963 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13964 count * GET_MODE_BITSIZE (*modep)))
13965 return -1;
13966
13967 return count;
13968 }
13969
13970 case RECORD_TYPE:
13971 {
13972 int count = 0;
13973 int sub_count;
13974 tree field;
13975
13976 /* Can't handle incomplete types nor sizes that are not
13977 fixed. */
13978 if (!COMPLETE_TYPE_P (type)
13979 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13980 return -1;
13981
13982 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13983 {
13984 if (TREE_CODE (field) != FIELD_DECL)
13985 continue;
13986
13987 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13988 if (sub_count < 0)
13989 return -1;
13990 count += sub_count;
13991 }
13992
13993 /* There must be no padding. */
13994 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13995 count * GET_MODE_BITSIZE (*modep)))
13996 return -1;
13997
13998 return count;
13999 }
14000
14001 case UNION_TYPE:
14002 case QUAL_UNION_TYPE:
14003 {
14004 /* These aren't very interesting except in a degenerate case. */
14005 int count = 0;
14006 int sub_count;
14007 tree field;
14008
14009 /* Can't handle incomplete types nor sizes that are not
14010 fixed. */
14011 if (!COMPLETE_TYPE_P (type)
14012 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14013 return -1;
14014
14015 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14016 {
14017 if (TREE_CODE (field) != FIELD_DECL)
14018 continue;
14019
14020 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14021 if (sub_count < 0)
14022 return -1;
14023 count = count > sub_count ? count : sub_count;
14024 }
14025
14026 /* There must be no padding. */
14027 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14028 count * GET_MODE_BITSIZE (*modep)))
14029 return -1;
14030
14031 return count;
14032 }
14033
14034 default:
14035 break;
14036 }
14037
14038 return -1;
14039 }
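/* Editorial sketch, not part of the original source: for example, for

     struct rgb { float r, g, b; };

   the walk above finds three consecutive SFmode elements with no padding
   and returns 3, so the struct is a homogeneous floating-point aggregate;
   struct { float f; double d; } returns -1 because the element modes
   differ.  The type names are illustrative.  */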
14040
14041 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14042 type as described in AAPCS64 \S 4.1.2.
14043
14044 See the comment above aarch64_composite_type_p for the notes on MODE. */
14045
14046 static bool
14047 aarch64_short_vector_p (const_tree type,
14048 machine_mode mode)
14049 {
14050 poly_int64 size = -1;
14051
14052 if (type && TREE_CODE (type) == VECTOR_TYPE)
14053 size = int_size_in_bytes (type);
14054 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14055 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14056 size = GET_MODE_SIZE (mode);
14057
14058 return known_eq (size, 8) || known_eq (size, 16);
14059 }
14060
14061 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14062 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14063 array types. The C99 floating-point complex types are also considered
14064 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14065 types, which are GCC extensions and out of the scope of AAPCS64, are
14066 treated as composite types here as well.
14067
14068 Note that MODE itself is not sufficient in determining whether a type
14069 is such a composite type or not. This is because
14070 stor-layout.c:compute_record_mode may have already changed the MODE
14071 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14072 structure with only one field may have its MODE set to the mode of the
14073 field. Also an integer mode whose size matches the size of the
14074 RECORD_TYPE type may be used to substitute the original mode
14075 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14076 solely relied on. */
14077
14078 static bool
14079 aarch64_composite_type_p (const_tree type,
14080 machine_mode mode)
14081 {
14082 if (aarch64_short_vector_p (type, mode))
14083 return false;
14084
14085 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14086 return true;
14087
14088 if (mode == BLKmode
14089 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14090 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14091 return true;
14092
14093 return false;
14094 }
14095
14096 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14097 shall be passed or returned in simd/fp register(s) (providing these
14098 parameter passing registers are available).
14099
14100 Upon successful return, *COUNT returns the number of needed registers,
14101    *BASE_MODE returns the mode of the individual register and when IS_HA
14102 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14103 floating-point aggregate or a homogeneous short-vector aggregate. */
14104
14105 static bool
14106 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14107 const_tree type,
14108 machine_mode *base_mode,
14109 int *count,
14110 bool *is_ha)
14111 {
14112 machine_mode new_mode = VOIDmode;
14113 bool composite_p = aarch64_composite_type_p (type, mode);
14114
14115 if (is_ha != NULL) *is_ha = false;
14116
14117 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14118 || aarch64_short_vector_p (type, mode))
14119 {
14120 *count = 1;
14121 new_mode = mode;
14122 }
14123 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14124 {
14125 if (is_ha != NULL) *is_ha = true;
14126 *count = 2;
14127 new_mode = GET_MODE_INNER (mode);
14128 }
14129 else if (type && composite_p)
14130 {
14131 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14132
14133 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14134 {
14135 if (is_ha != NULL) *is_ha = true;
14136 *count = ag_count;
14137 }
14138 else
14139 return false;
14140 }
14141 else
14142 return false;
14143
14144 *base_mode = new_mode;
14145 return true;
14146 }
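/* Editorial note, not part of the original source: for instance, a
   _Complex double argument comes back with *count == 2 and
   *base_mode == DFmode, and the struct rgb sketch above with *count == 3
   and *base_mode == SFmode, so both are candidates for being passed in
   consecutive FP/SIMD registers when enough of them are available.  */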
14147
14148 /* Implement TARGET_STRUCT_VALUE_RTX. */
14149
14150 static rtx
14151 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14152 int incoming ATTRIBUTE_UNUSED)
14153 {
14154 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14155 }
14156
14157 /* Implements target hook vector_mode_supported_p. */
14158 static bool
14159 aarch64_vector_mode_supported_p (machine_mode mode)
14160 {
14161 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14162 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14163 }
14164
14165 /* Return the appropriate SIMD container mode
14166 for MODE within a vector of WIDTH bits. */
14167 static machine_mode
14168 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14169 {
14170 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14171 switch (mode)
14172 {
14173 case E_DFmode:
14174 return VNx2DFmode;
14175 case E_SFmode:
14176 return VNx4SFmode;
14177 case E_HFmode:
14178 return VNx8HFmode;
14179 case E_DImode:
14180 return VNx2DImode;
14181 case E_SImode:
14182 return VNx4SImode;
14183 case E_HImode:
14184 return VNx8HImode;
14185 case E_QImode:
14186 return VNx16QImode;
14187 default:
14188 return word_mode;
14189 }
14190
14191 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14192 if (TARGET_SIMD)
14193 {
14194 if (known_eq (width, 128))
14195 switch (mode)
14196 {
14197 case E_DFmode:
14198 return V2DFmode;
14199 case E_SFmode:
14200 return V4SFmode;
14201 case E_HFmode:
14202 return V8HFmode;
14203 case E_SImode:
14204 return V4SImode;
14205 case E_HImode:
14206 return V8HImode;
14207 case E_QImode:
14208 return V16QImode;
14209 case E_DImode:
14210 return V2DImode;
14211 default:
14212 break;
14213 }
14214 else
14215 switch (mode)
14216 {
14217 case E_SFmode:
14218 return V2SFmode;
14219 case E_HFmode:
14220 return V4HFmode;
14221 case E_SImode:
14222 return V2SImode;
14223 case E_HImode:
14224 return V4HImode;
14225 case E_QImode:
14226 return V8QImode;
14227 default:
14228 break;
14229 }
14230 }
14231 return word_mode;
14232 }
14233
14234 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14235 static machine_mode
14236 aarch64_preferred_simd_mode (scalar_mode mode)
14237 {
14238 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14239 return aarch64_simd_container_mode (mode, bits);
14240 }
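/* Editorial note, not part of the original source: for example, with only
   Advanced SIMD enabled the preferred container for SFmode is V4SFmode
   (a 128-bit vector of four floats), while with SVE it becomes the
   variable-length VNx4SFmode.  */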
14241
14242 /* Return a list of possible vector sizes for the vectorizer
14243 to iterate over. */
14244 static void
14245 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14246 {
14247 if (TARGET_SVE)
14248 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14249 sizes->safe_push (16);
14250 sizes->safe_push (8);
14251 }
14252
14253 /* Implement TARGET_MANGLE_TYPE. */
14254
14255 static const char *
14256 aarch64_mangle_type (const_tree type)
14257 {
14258 /* The AArch64 ABI documents say that "__va_list" has to be
14259 mangled as if it is in the "std" namespace. */
14260 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14261 return "St9__va_list";
14262
14263 /* Half-precision float. */
14264 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14265 return "Dh";
14266
14267 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14268 builtin types. */
14269 if (TYPE_NAME (type) != NULL)
14270 return aarch64_mangle_builtin_type (type);
14271
14272 /* Use the default mangling. */
14273 return NULL;
14274 }
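/* Editorial sketch, not part of the original source: under the rules above,
   a C++ function void f (__va_list) mangles its parameter as St9__va_list,
   and void g (__fp16) mangles it as Dh, giving roughly _Z1gDh; other types
   fall back to the default mangling.  The function names are illustrative.  */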
14275
14276 /* Find the first rtx_insn before INSN that will generate an assembly
14277 instruction. */
14278
14279 static rtx_insn *
14280 aarch64_prev_real_insn (rtx_insn *insn)
14281 {
14282 if (!insn)
14283 return NULL;
14284
14285 do
14286 {
14287 insn = prev_real_insn (insn);
14288 }
14289 while (insn && recog_memoized (insn) < 0);
14290
14291 return insn;
14292 }
14293
14294 static bool
14295 is_madd_op (enum attr_type t1)
14296 {
14297 unsigned int i;
14298 /* A number of these may be AArch32 only. */
14299 enum attr_type mlatypes[] = {
14300 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14301 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14302 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14303 };
14304
14305 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14306 {
14307 if (t1 == mlatypes[i])
14308 return true;
14309 }
14310
14311 return false;
14312 }
14313
14314 /* Check if there is a register dependency between a load and the insn
14315 for which we hold recog_data. */
14316
14317 static bool
14318 dep_between_memop_and_curr (rtx memop)
14319 {
14320 rtx load_reg;
14321 int opno;
14322
14323 gcc_assert (GET_CODE (memop) == SET);
14324
14325 if (!REG_P (SET_DEST (memop)))
14326 return false;
14327
14328 load_reg = SET_DEST (memop);
14329 for (opno = 1; opno < recog_data.n_operands; opno++)
14330 {
14331 rtx operand = recog_data.operand[opno];
14332 if (REG_P (operand)
14333 && reg_overlap_mentioned_p (load_reg, operand))
14334 return true;
14335
14336 }
14337 return false;
14338 }
14339
14340
14341 /* When working around the Cortex-A53 erratum 835769,
14342 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14343 instruction and has a preceding memory instruction such that a NOP
14344 should be inserted between them. */
14345
14346 bool
14347 aarch64_madd_needs_nop (rtx_insn* insn)
14348 {
14349 enum attr_type attr_type;
14350 rtx_insn *prev;
14351 rtx body;
14352
14353 if (!TARGET_FIX_ERR_A53_835769)
14354 return false;
14355
14356 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14357 return false;
14358
14359 attr_type = get_attr_type (insn);
14360 if (!is_madd_op (attr_type))
14361 return false;
14362
14363 prev = aarch64_prev_real_insn (insn);
14364 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14365 Restore recog state to INSN to avoid state corruption. */
14366 extract_constrain_insn_cached (insn);
14367
14368 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14369 return false;
14370
14371 body = single_set (prev);
14372
14373 /* If the previous insn is a memory op and there is no dependency between
14374 it and the DImode madd, emit a NOP between them. If body is NULL then we
14375 have a complex memory operation, probably a load/store pair.
14376 Be conservative for now and emit a NOP. */
14377 if (GET_MODE (recog_data.operand[0]) == DImode
14378 && (!body || !dep_between_memop_and_curr (body)))
14379 return true;
14380
14381 return false;
14382
14383 }
14384
14385
14386 /* Implement FINAL_PRESCAN_INSN. */
14387
14388 void
14389 aarch64_final_prescan_insn (rtx_insn *insn)
14390 {
14391 if (aarch64_madd_needs_nop (insn))
14392 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14393 }
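/* Illustrative effect of the workaround above (assuming a scheduled sequence
   in which a 64-bit multiply-accumulate directly follows a memory access):

       ldr  x1, [x2]
       madd x0, x3, x4, x5

   is emitted as

       ldr  x1, [x2]
       nop // between mem op and mult-accumulate
       madd x0, x3, x4, x5  */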
14394
14395
14396 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14397 instruction. */
14398
14399 bool
14400 aarch64_sve_index_immediate_p (rtx base_or_step)
14401 {
14402 return (CONST_INT_P (base_or_step)
14403 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14404 }
14405
14406 /* Return true if X is a valid immediate for the SVE ADD and SUB
14407 instructions. Negate X first if NEGATE_P is true. */
14408
14409 bool
14410 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14411 {
14412 rtx elt;
14413
14414 if (!const_vec_duplicate_p (x, &elt)
14415 || !CONST_INT_P (elt))
14416 return false;
14417
14418 HOST_WIDE_INT val = INTVAL (elt);
14419 if (negate_p)
14420 val = -val;
14421 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14422
14423 if (val & 0xff)
14424 return IN_RANGE (val, 0, 0xff);
14425 return IN_RANGE (val, 0, 0xff00);
14426 }
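/* For example, a vector of repeated 16s or 4096s is accepted (an unsigned
   8-bit immediate, optionally shifted left by 8), whereas a vector of
   repeated 0x101s is rejected because its low byte is nonzero and the value
   exceeds 0xff.  */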
14427
14428 /* Return true if X is a valid immediate operand for an SVE logical
14429 instruction such as AND. */
14430
14431 bool
14432 aarch64_sve_bitmask_immediate_p (rtx x)
14433 {
14434 rtx elt;
14435
14436 return (const_vec_duplicate_p (x, &elt)
14437 && CONST_INT_P (elt)
14438 && aarch64_bitmask_imm (INTVAL (elt),
14439 GET_MODE_INNER (GET_MODE (x))));
14440 }
14441
14442 /* Return true if X is a valid immediate for the SVE DUP and CPY
14443 instructions. */
14444
14445 bool
14446 aarch64_sve_dup_immediate_p (rtx x)
14447 {
14448 rtx elt;
14449
14450 if (!const_vec_duplicate_p (x, &elt)
14451 || !CONST_INT_P (elt))
14452 return false;
14453
14454 HOST_WIDE_INT val = INTVAL (elt);
14455 if (val & 0xff)
14456 return IN_RANGE (val, -0x80, 0x7f);
14457 return IN_RANGE (val, -0x8000, 0x7f00);
14458 }
14459
14460 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14461 SIGNED_P says whether the operand is signed rather than unsigned. */
14462
14463 bool
14464 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14465 {
14466 rtx elt;
14467
14468 return (const_vec_duplicate_p (x, &elt)
14469 && CONST_INT_P (elt)
14470 && (signed_p
14471 ? IN_RANGE (INTVAL (elt), -16, 15)
14472 : IN_RANGE (INTVAL (elt), 0, 127)));
14473 }
14474
14475 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14476 instruction. Negate X first if NEGATE_P is true. */
14477
14478 bool
14479 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14480 {
14481 rtx elt;
14482 REAL_VALUE_TYPE r;
14483
14484 if (!const_vec_duplicate_p (x, &elt)
14485 || GET_CODE (elt) != CONST_DOUBLE)
14486 return false;
14487
14488 r = *CONST_DOUBLE_REAL_VALUE (elt);
14489
14490 if (negate_p)
14491 r = real_value_negate (&r);
14492
14493 if (real_equal (&r, &dconst1))
14494 return true;
14495 if (real_equal (&r, &dconsthalf))
14496 return true;
14497 return false;
14498 }
14499
14500 /* Return true if X is a valid immediate operand for an SVE FMUL
14501 instruction. */
14502
14503 bool
14504 aarch64_sve_float_mul_immediate_p (rtx x)
14505 {
14506 rtx elt;
14507
14508 /* GCC will never generate a multiply with an immediate of 2, so there is no
14509 point testing for it (even though it is a valid constant). */
14510 return (const_vec_duplicate_p (x, &elt)
14511 && GET_CODE (elt) == CONST_DOUBLE
14512 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14513 }
14514
14515 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14516 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14517 is nonnull, use it to describe valid immediates. */
14518 static bool
14519 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14520 simd_immediate_info *info,
14521 enum simd_immediate_check which,
14522 simd_immediate_info::insn_type insn)
14523 {
14524 /* Try a 4-byte immediate with LSL. */
14525 for (unsigned int shift = 0; shift < 32; shift += 8)
14526 if ((val32 & (0xff << shift)) == val32)
14527 {
14528 if (info)
14529 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14530 simd_immediate_info::LSL, shift);
14531 return true;
14532 }
14533
14534 /* Try a 2-byte immediate with LSL. */
14535 unsigned int imm16 = val32 & 0xffff;
14536 if (imm16 == (val32 >> 16))
14537 for (unsigned int shift = 0; shift < 16; shift += 8)
14538 if ((imm16 & (0xff << shift)) == imm16)
14539 {
14540 if (info)
14541 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14542 simd_immediate_info::LSL, shift);
14543 return true;
14544 }
14545
14546 /* Try a 4-byte immediate with MSL, except for cases that MVN
14547 can handle. */
14548 if (which == AARCH64_CHECK_MOV)
14549 for (unsigned int shift = 8; shift < 24; shift += 8)
14550 {
14551 unsigned int low = (1 << shift) - 1;
14552 if (((val32 & (0xff << shift)) | low) == val32)
14553 {
14554 if (info)
14555 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14556 simd_immediate_info::MSL, shift);
14557 return true;
14558 }
14559 }
14560
14561 return false;
14562 }
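/* Worked examples for the checks above: 0x00ab0000 is matched by the first
   loop as (0xab, LSL #16); 0x00120012, a repeating 16-bit pattern, is matched
   in HImode as (0x12, LSL #0); and 0x0012ffff is matched as (0x12, MSL #16),
   since MSL shifts ones in from the right.  */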
14563
14564 /* Return true if replicating VAL64 is a valid immediate for the
14565 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14566 use it to describe valid immediates. */
14567 static bool
14568 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14569 simd_immediate_info *info,
14570 enum simd_immediate_check which)
14571 {
14572 unsigned int val32 = val64 & 0xffffffff;
14573 unsigned int val16 = val64 & 0xffff;
14574 unsigned int val8 = val64 & 0xff;
14575
14576 if (val32 == (val64 >> 32))
14577 {
14578 if ((which & AARCH64_CHECK_ORR) != 0
14579 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14580 simd_immediate_info::MOV))
14581 return true;
14582
14583 if ((which & AARCH64_CHECK_BIC) != 0
14584 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14585 simd_immediate_info::MVN))
14586 return true;
14587
14588 /* Try using a replicated byte. */
14589 if (which == AARCH64_CHECK_MOV
14590 && val16 == (val32 >> 16)
14591 && val8 == (val16 >> 8))
14592 {
14593 if (info)
14594 *info = simd_immediate_info (QImode, val8);
14595 return true;
14596 }
14597 }
14598
14599 /* Try using a bit-to-bytemask. */
14600 if (which == AARCH64_CHECK_MOV)
14601 {
14602 unsigned int i;
14603 for (i = 0; i < 64; i += 8)
14604 {
14605 unsigned char byte = (val64 >> i) & 0xff;
14606 if (byte != 0 && byte != 0xff)
14607 break;
14608 }
14609 if (i == 64)
14610 {
14611 if (info)
14612 *info = simd_immediate_info (DImode, val64);
14613 return true;
14614 }
14615 }
14616 return false;
14617 }
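/* For example, 0x00ff00ffff0000ff passes the byte loop above (every byte is
   0x00 or 0xff) and so can be loaded with a single 64-bit MOVI.  */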
14618
14619 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14620 instruction. If INFO is nonnull, use it to describe valid immediates. */
14621
14622 static bool
14623 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14624 simd_immediate_info *info)
14625 {
14626 scalar_int_mode mode = DImode;
14627 unsigned int val32 = val64 & 0xffffffff;
14628 if (val32 == (val64 >> 32))
14629 {
14630 mode = SImode;
14631 unsigned int val16 = val32 & 0xffff;
14632 if (val16 == (val32 >> 16))
14633 {
14634 mode = HImode;
14635 unsigned int val8 = val16 & 0xff;
14636 if (val8 == (val16 >> 8))
14637 mode = QImode;
14638 }
14639 }
14640 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14641 if (IN_RANGE (val, -0x80, 0x7f))
14642 {
14643 /* DUP with no shift. */
14644 if (info)
14645 *info = simd_immediate_info (mode, val);
14646 return true;
14647 }
14648 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14649 {
14650 /* DUP with LSL #8. */
14651 if (info)
14652 *info = simd_immediate_info (mode, val);
14653 return true;
14654 }
14655 if (aarch64_bitmask_imm (val64, mode))
14656 {
14657 /* DUPM. */
14658 if (info)
14659 *info = simd_immediate_info (mode, val);
14660 return true;
14661 }
14662 return false;
14663 }
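/* For example, a vector of QImode 1s gives VAL64 == 0x0101010101010101, which
   collapses to (QImode, 1) and is matched by the unshifted DUP case, while a
   replicated HImode 0x0300 falls through to the DUP-with-LSL-#8 case.  */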
14664
14665 /* Return true if OP is a valid SIMD immediate for the operation
14666 described by WHICH. If INFO is nonnull, use it to describe valid
14667 immediates. */
14668 bool
14669 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14670 enum simd_immediate_check which)
14671 {
14672 machine_mode mode = GET_MODE (op);
14673 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14674 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14675 return false;
14676
14677 scalar_mode elt_mode = GET_MODE_INNER (mode);
14678 rtx base, step;
14679 unsigned int n_elts;
14680 if (GET_CODE (op) == CONST_VECTOR
14681 && CONST_VECTOR_DUPLICATE_P (op))
14682 n_elts = CONST_VECTOR_NPATTERNS (op);
14683 else if ((vec_flags & VEC_SVE_DATA)
14684 && const_vec_series_p (op, &base, &step))
14685 {
14686 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14687 if (!aarch64_sve_index_immediate_p (base)
14688 || !aarch64_sve_index_immediate_p (step))
14689 return false;
14690
14691 if (info)
14692 *info = simd_immediate_info (elt_mode, base, step);
14693 return true;
14694 }
14695 else if (GET_CODE (op) == CONST_VECTOR
14696 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14697 /* N_ELTS set above. */;
14698 else
14699 return false;
14700
14701 /* Handle PFALSE and PTRUE. */
14702 if (vec_flags & VEC_SVE_PRED)
14703 return (op == CONST0_RTX (mode)
14704 || op == CONSTM1_RTX (mode));
14705
14706 scalar_float_mode elt_float_mode;
14707 if (n_elts == 1
14708 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14709 {
14710 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14711 if (aarch64_float_const_zero_rtx_p (elt)
14712 || aarch64_float_const_representable_p (elt))
14713 {
14714 if (info)
14715 *info = simd_immediate_info (elt_float_mode, elt);
14716 return true;
14717 }
14718 }
14719
14720 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14721 if (elt_size > 8)
14722 return false;
14723
14724 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14725
14726 /* Expand the vector constant out into a byte vector, with the least
14727 significant byte of the register first. */
14728 auto_vec<unsigned char, 16> bytes;
14729 bytes.reserve (n_elts * elt_size);
14730 for (unsigned int i = 0; i < n_elts; i++)
14731 {
14732 /* The vector is provided in gcc endian-neutral fashion.
14733 For aarch64_be Advanced SIMD, it must be laid out in the vector
14734 register in reverse order. */
14735 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14736 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14737
14738 if (elt_mode != elt_int_mode)
14739 elt = gen_lowpart (elt_int_mode, elt);
14740
14741 if (!CONST_INT_P (elt))
14742 return false;
14743
14744 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14745 for (unsigned int byte = 0; byte < elt_size; byte++)
14746 {
14747 bytes.quick_push (elt_val & 0xff);
14748 elt_val >>= BITS_PER_UNIT;
14749 }
14750 }
14751
14752 /* The immediate must repeat every eight bytes. */
14753 unsigned int nbytes = bytes.length ();
14754 for (unsigned i = 8; i < nbytes; ++i)
14755 if (bytes[i] != bytes[i - 8])
14756 return false;
14757
14758 /* Get the repeating 8-byte value as an integer. No endian correction
14759 is needed here because bytes is already in lsb-first order. */
14760 unsigned HOST_WIDE_INT val64 = 0;
14761 for (unsigned int i = 0; i < 8; i++)
14762 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14763 << (i * BITS_PER_UNIT));
14764
14765 if (vec_flags & VEC_SVE_DATA)
14766 return aarch64_sve_valid_immediate (val64, info);
14767 else
14768 return aarch64_advsimd_valid_immediate (val64, info, which);
14769 }
14770
14771 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14772 has a step in the range of INDEX. Return the index expression if so,
14773 otherwise return null. */
14774 rtx
14775 aarch64_check_zero_based_sve_index_immediate (rtx x)
14776 {
14777 rtx base, step;
14778 if (const_vec_series_p (x, &base, &step)
14779 && base == const0_rtx
14780 && aarch64_sve_index_immediate_p (step))
14781 return step;
14782 return NULL_RTX;
14783 }
14784
14785 /* Check whether immediate shift constants are within range. */
14786 bool
14787 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14788 {
14789 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14790 if (left)
14791 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14792 else
14793 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14794 }
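/* For example, for V4SImode the accepted shift amounts are 0-31 for left
   shifts and 1-32 for right shifts.  */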
14795
14796 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14797 operation of width WIDTH at bit position POS. */
14798
14799 rtx
14800 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14801 {
14802 gcc_assert (CONST_INT_P (width));
14803 gcc_assert (CONST_INT_P (pos));
14804
14805 unsigned HOST_WIDE_INT mask
14806 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14807 return GEN_INT (mask << UINTVAL (pos));
14808 }
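/* For example, WIDTH == 8 and POS == 16 give the mask 0x00ff0000.  */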
14809
14810 bool
14811 aarch64_mov_operand_p (rtx x, machine_mode mode)
14812 {
14813 if (GET_CODE (x) == HIGH
14814 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14815 return true;
14816
14817 if (CONST_INT_P (x))
14818 return true;
14819
14820 if (VECTOR_MODE_P (GET_MODE (x)))
14821 return aarch64_simd_valid_immediate (x, NULL);
14822
14823 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14824 return true;
14825
14826 if (aarch64_sve_cnt_immediate_p (x))
14827 return true;
14828
14829 return aarch64_classify_symbolic_expression (x)
14830 == SYMBOL_TINY_ABSOLUTE;
14831 }
14832
14833 /* Return a const_int vector of VAL. */
14834 rtx
14835 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14836 {
14837 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14838 return gen_const_vec_duplicate (mode, c);
14839 }
14840
14841 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14842
14843 bool
14844 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14845 {
14846 machine_mode vmode;
14847
14848 vmode = aarch64_simd_container_mode (mode, 64);
14849 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14850 return aarch64_simd_valid_immediate (op_v, NULL);
14851 }
14852
14853 /* Construct and return a PARALLEL RTX vector with elements numbering the
14854 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14855 the vector - from the perspective of the architecture. This does not
14856 line up with GCC's perspective on lane numbers, so we end up with
14857 different masks depending on our target endian-ness. The diagram
14858 below may help. We must draw the distinction when building masks
14859 which select one half of the vector. An instruction selecting
14860 architectural low-lanes for a big-endian target must be described using
14861 a mask selecting GCC high-lanes.
14862
14863 Big-Endian Little-Endian
14864
14865 GCC 0 1 2 3 3 2 1 0
14866 | x | x | x | x | | x | x | x | x |
14867 Architecture 3 2 1 0 3 2 1 0
14868
14869 Low Mask: { 2, 3 } { 0, 1 }
14870 High Mask: { 0, 1 } { 2, 3 }
14871
14872 MODE is the mode of the vector and NUNITS is the number of units in it. */
14873
14874 rtx
14875 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14876 {
14877 rtvec v = rtvec_alloc (nunits / 2);
14878 int high_base = nunits / 2;
14879 int low_base = 0;
14880 int base;
14881 rtx t1;
14882 int i;
14883
14884 if (BYTES_BIG_ENDIAN)
14885 base = high ? low_base : high_base;
14886 else
14887 base = high ? high_base : low_base;
14888
14889 for (i = 0; i < nunits / 2; i++)
14890 RTVEC_ELT (v, i) = GEN_INT (base + i);
14891
14892 t1 = gen_rtx_PARALLEL (mode, v);
14893 return t1;
14894 }
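/* For example, for V4SImode with HIGH true this returns (parallel [2 3]) on
   little-endian and (parallel [0 1]) on big-endian, matching the diagram
   above.  */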
14895
14896 /* Check OP for validity as a PARALLEL RTX vector with elements
14897 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14898 from the perspective of the architecture. See the diagram above
14899 aarch64_simd_vect_par_cnst_half for more details. */
14900
14901 bool
14902 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14903 bool high)
14904 {
14905 int nelts;
14906 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14907 return false;
14908
14909 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14910 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14911 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14912 int i = 0;
14913
14914 if (count_op != count_ideal)
14915 return false;
14916
14917 for (i = 0; i < count_ideal; i++)
14918 {
14919 rtx elt_op = XVECEXP (op, 0, i);
14920 rtx elt_ideal = XVECEXP (ideal, 0, i);
14921
14922 if (!CONST_INT_P (elt_op)
14923 || INTVAL (elt_ideal) != INTVAL (elt_op))
14924 return false;
14925 }
14926 return true;
14927 }
14928
14929 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14930 HIGH (exclusive). */
14931 void
14932 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14933 const_tree exp)
14934 {
14935 HOST_WIDE_INT lane;
14936 gcc_assert (CONST_INT_P (operand));
14937 lane = INTVAL (operand);
14938
14939 if (lane < low || lane >= high)
14940 {
14941 if (exp)
14942 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14943 else
14944 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14945 }
14946 }
14947
14948 /* Perform endian correction on lane number N, which indexes a vector
14949 of mode MODE, and return the result as an SImode rtx. */
14950
14951 rtx
14952 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14953 {
14954 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14955 }
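/* For example, GCC lane 0 of a V4SImode vector corresponds to architectural
   lane 3 on big-endian and to lane 0 on little-endian (see the diagram above
   aarch64_simd_vect_par_cnst_half).  */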
14956
14957 /* Return TRUE if OP is a valid vector addressing mode. */
14958
14959 bool
14960 aarch64_simd_mem_operand_p (rtx op)
14961 {
14962 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14963 || REG_P (XEXP (op, 0)));
14964 }
14965
14966 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14967
14968 bool
14969 aarch64_sve_ld1r_operand_p (rtx op)
14970 {
14971 struct aarch64_address_info addr;
14972 scalar_mode mode;
14973
14974 return (MEM_P (op)
14975 && is_a <scalar_mode> (GET_MODE (op), &mode)
14976 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14977 && addr.type == ADDRESS_REG_IMM
14978 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14979 }
14980
14981 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14982 The conditions for STR are the same. */
14983 bool
14984 aarch64_sve_ldr_operand_p (rtx op)
14985 {
14986 struct aarch64_address_info addr;
14987
14988 return (MEM_P (op)
14989 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14990 false, ADDR_QUERY_ANY)
14991 && addr.type == ADDRESS_REG_IMM);
14992 }
14993
14994 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14995 We need to be able to access the individual pieces, so the range
14996 is different from LD[234] and ST[234]. */
14997 bool
14998 aarch64_sve_struct_memory_operand_p (rtx op)
14999 {
15000 if (!MEM_P (op))
15001 return false;
15002
15003 machine_mode mode = GET_MODE (op);
15004 struct aarch64_address_info addr;
15005 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15006 ADDR_QUERY_ANY)
15007 || addr.type != ADDRESS_REG_IMM)
15008 return false;
15009
15010 poly_int64 first = addr.const_offset;
15011 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15012 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15013 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15014 }
15015
15016 /* Emit a register copy from operand to operand, taking care not to
15017 early-clobber source registers in the process.
15018
15019 COUNT is the number of components into which the copy needs to be
15020 decomposed. */
15021 void
15022 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15023 unsigned int count)
15024 {
15025 unsigned int i;
15026 int rdest = REGNO (operands[0]);
15027 int rsrc = REGNO (operands[1]);
15028
15029 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15030 || rdest < rsrc)
15031 for (i = 0; i < count; i++)
15032 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15033 gen_rtx_REG (mode, rsrc + i));
15034 else
15035 for (i = 0; i < count; i++)
15036 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15037 gen_rtx_REG (mode, rsrc + count - i - 1));
15038 }
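/* For example, copying a two-register value from V1-V2 to V2-V3 overlaps, so
   the second loop above copies V2->V3 before V1->V2; copying downwards
   (e.g. V2-V3 to V1-V2) uses the first loop and copies V2->V1 first.  */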
15039
15040 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15041 one of VSTRUCT modes: OI, CI, or XI. */
15042 int
15043 aarch64_simd_attr_length_rglist (machine_mode mode)
15044 {
15045 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15046 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15047 }
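/* For example, an OImode register list (two 128-bit vectors) gives
   2 * 4 == 8 bytes, i.e. two instructions.  */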
15048
15049 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15050 alignment of a vector to 128 bits. SVE predicates have an alignment of
15051 16 bits. */
15052 static HOST_WIDE_INT
15053 aarch64_simd_vector_alignment (const_tree type)
15054 {
15055 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15056 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15057 be set for non-predicate vectors of booleans. Modes are the most
15058 direct way we have of identifying real SVE predicate types. */
15059 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15060 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15061 }
15062
15063 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15064 static poly_uint64
15065 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15066 {
15067 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15068 {
15069 /* If the length of the vector is fixed, try to align to that length,
15070 otherwise don't try to align at all. */
15071 HOST_WIDE_INT result;
15072 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15073 result = TYPE_ALIGN (TREE_TYPE (type));
15074 return result;
15075 }
15076 return TYPE_ALIGN (type);
15077 }
15078
15079 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15080 static bool
15081 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15082 {
15083 if (is_packed)
15084 return false;
15085
15086 /* For fixed-length vectors, check that the vectorizer will aim for
15087 full-vector alignment. This isn't true for generic GCC vectors
15088 that are wider than the ABI maximum of 128 bits. */
15089 poly_uint64 preferred_alignment =
15090 aarch64_vectorize_preferred_vector_alignment (type);
15091 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15092 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15093 preferred_alignment))
15094 return false;
15095
15096 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15097 return true;
15098 }
15099
15100 /* Return true if the vector misalignment factor is supported by the
15101 target. */
15102 static bool
15103 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15104 const_tree type, int misalignment,
15105 bool is_packed)
15106 {
15107 if (TARGET_SIMD && STRICT_ALIGNMENT)
15108 {
15109 /* Return false if the movmisalign pattern is not supported for this mode. */
15110 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15111 return false;
15112
15113 /* Misalignment factor is unknown at compile time. */
15114 if (misalignment == -1)
15115 return false;
15116 }
15117 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15118 is_packed);
15119 }
15120
15121 /* If VALS is a vector constant that can be loaded into a register
15122 using DUP, generate instructions to do so and return an RTX to
15123 assign to the register. Otherwise return NULL_RTX. */
15124 static rtx
15125 aarch64_simd_dup_constant (rtx vals)
15126 {
15127 machine_mode mode = GET_MODE (vals);
15128 machine_mode inner_mode = GET_MODE_INNER (mode);
15129 rtx x;
15130
15131 if (!const_vec_duplicate_p (vals, &x))
15132 return NULL_RTX;
15133
15134 /* We can load this constant by using DUP and a constant in a
15135 single ARM register. This will be cheaper than a vector
15136 load. */
15137 x = copy_to_mode_reg (inner_mode, x);
15138 return gen_vec_duplicate (mode, x);
15139 }
15140
15141
15142 /* Generate code to load VALS, which is a PARALLEL containing only
15143 constants (for vec_init) or CONST_VECTOR, efficiently into a
15144 register. Returns an RTX to copy into the register, or NULL_RTX
15145 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15146 static rtx
15147 aarch64_simd_make_constant (rtx vals)
15148 {
15149 machine_mode mode = GET_MODE (vals);
15150 rtx const_dup;
15151 rtx const_vec = NULL_RTX;
15152 int n_const = 0;
15153 int i;
15154
15155 if (GET_CODE (vals) == CONST_VECTOR)
15156 const_vec = vals;
15157 else if (GET_CODE (vals) == PARALLEL)
15158 {
15159 /* A CONST_VECTOR must contain only CONST_INTs and
15160 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15161 Only store valid constants in a CONST_VECTOR. */
15162 int n_elts = XVECLEN (vals, 0);
15163 for (i = 0; i < n_elts; ++i)
15164 {
15165 rtx x = XVECEXP (vals, 0, i);
15166 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15167 n_const++;
15168 }
15169 if (n_const == n_elts)
15170 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15171 }
15172 else
15173 gcc_unreachable ();
15174
15175 if (const_vec != NULL_RTX
15176 && aarch64_simd_valid_immediate (const_vec, NULL))
15177 /* Load using MOVI/MVNI. */
15178 return const_vec;
15179 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15180 /* Loaded using DUP. */
15181 return const_dup;
15182 else if (const_vec != NULL_RTX)
15183 /* Load from constant pool. We cannot take advantage of single-cycle
15184 LD1 because we need a PC-relative addressing mode. */
15185 return const_vec;
15186 else
15187 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15188 We cannot construct an initializer. */
15189 return NULL_RTX;
15190 }
15191
15192 /* Expand a vector initialisation sequence, such that TARGET is
15193 initialised to contain VALS. */
15194
15195 void
15196 aarch64_expand_vector_init (rtx target, rtx vals)
15197 {
15198 machine_mode mode = GET_MODE (target);
15199 scalar_mode inner_mode = GET_MODE_INNER (mode);
15200 /* The number of vector elements. */
15201 int n_elts = XVECLEN (vals, 0);
15202 /* The number of vector elements which are not constant. */
15203 int n_var = 0;
15204 rtx any_const = NULL_RTX;
15205 /* The first element of vals. */
15206 rtx v0 = XVECEXP (vals, 0, 0);
15207 bool all_same = true;
15208
15209 /* This is a special vec_init<M><N> where N is not an element mode but a
15210 vector mode with half the elements of M. We expect to find two entries
15211 of mode N in VALS and we must put their concatenation into TARGET. */
15212 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15213 {
15214 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15215 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15216 rtx lo = XVECEXP (vals, 0, 0);
15217 rtx hi = XVECEXP (vals, 0, 1);
15218 machine_mode narrow_mode = GET_MODE (lo);
15219 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15220 gcc_assert (narrow_mode == GET_MODE (hi));
15221
15222 /* When we want to concatenate a half-width vector with zeroes we can
15223 use the aarch64_combinez[_be] patterns. Just make sure that the
15224 zeroes are in the right half. */
15225 if (BYTES_BIG_ENDIAN
15226 && aarch64_simd_imm_zero (lo, narrow_mode)
15227 && general_operand (hi, narrow_mode))
15228 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15229 else if (!BYTES_BIG_ENDIAN
15230 && aarch64_simd_imm_zero (hi, narrow_mode)
15231 && general_operand (lo, narrow_mode))
15232 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15233 else
15234 {
15235 /* Else create the two half-width registers and combine them. */
15236 if (!REG_P (lo))
15237 lo = force_reg (GET_MODE (lo), lo);
15238 if (!REG_P (hi))
15239 hi = force_reg (GET_MODE (hi), hi);
15240
15241 if (BYTES_BIG_ENDIAN)
15242 std::swap (lo, hi);
15243 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15244 }
15245 return;
15246 }
15247
15248 /* Count the number of variable elements to initialise. */
15249 for (int i = 0; i < n_elts; ++i)
15250 {
15251 rtx x = XVECEXP (vals, 0, i);
15252 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15253 ++n_var;
15254 else
15255 any_const = x;
15256
15257 all_same &= rtx_equal_p (x, v0);
15258 }
15259
15260 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15261 how best to handle this. */
15262 if (n_var == 0)
15263 {
15264 rtx constant = aarch64_simd_make_constant (vals);
15265 if (constant != NULL_RTX)
15266 {
15267 emit_move_insn (target, constant);
15268 return;
15269 }
15270 }
15271
15272 /* Splat a single non-constant element if we can. */
15273 if (all_same)
15274 {
15275 rtx x = copy_to_mode_reg (inner_mode, v0);
15276 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15277 return;
15278 }
15279
15280 enum insn_code icode = optab_handler (vec_set_optab, mode);
15281 gcc_assert (icode != CODE_FOR_nothing);
15282
15283 /* If there are only variable elements, try to optimize
15284 the insertion using dup for the most common element
15285 followed by insertions. */
15286
15287 /* The algorithm will fill matches[*][0] with the earliest matching element,
15288 and matches[X][1] with the count of duplicate elements (if X is the
15289 earliest element which has duplicates). */
15290
15291 if (n_var == n_elts && n_elts <= 16)
15292 {
15293 int matches[16][2] = {0};
15294 for (int i = 0; i < n_elts; i++)
15295 {
15296 for (int j = 0; j <= i; j++)
15297 {
15298 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15299 {
15300 matches[i][0] = j;
15301 matches[j][1]++;
15302 break;
15303 }
15304 }
15305 }
15306 int maxelement = 0;
15307 int maxv = 0;
15308 for (int i = 0; i < n_elts; i++)
15309 if (matches[i][1] > maxv)
15310 {
15311 maxelement = i;
15312 maxv = matches[i][1];
15313 }
15314
15315 /* Create a duplicate of the most common element, unless all elements
15316 are equally useless to us, in which case just immediately set the
15317 vector register using the first element. */
15318
15319 if (maxv == 1)
15320 {
15321 /* For vectors of two 64-bit elements, we can do even better. */
15322 if (n_elts == 2
15323 && (inner_mode == E_DImode
15324 || inner_mode == E_DFmode))
15325
15326 {
15327 rtx x0 = XVECEXP (vals, 0, 0);
15328 rtx x1 = XVECEXP (vals, 0, 1);
15329 /* Combine can pick up this case, but handling it directly
15330 here leaves clearer RTL.
15331
15332 This is load_pair_lanes<mode>, and also gives us a clean-up
15333 for store_pair_lanes<mode>. */
15334 if (memory_operand (x0, inner_mode)
15335 && memory_operand (x1, inner_mode)
15336 && !STRICT_ALIGNMENT
15337 && rtx_equal_p (XEXP (x1, 0),
15338 plus_constant (Pmode,
15339 XEXP (x0, 0),
15340 GET_MODE_SIZE (inner_mode))))
15341 {
15342 rtx t;
15343 if (inner_mode == DFmode)
15344 t = gen_load_pair_lanesdf (target, x0, x1);
15345 else
15346 t = gen_load_pair_lanesdi (target, x0, x1);
15347 emit_insn (t);
15348 return;
15349 }
15350 }
15351 /* The subreg-move sequence below will move into lane zero of the
15352 vector register. For big-endian we want that position to hold
15353 the last element of VALS. */
15354 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15355 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15356 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15357 }
15358 else
15359 {
15360 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15361 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15362 }
15363
15364 /* Insert the rest. */
15365 for (int i = 0; i < n_elts; i++)
15366 {
15367 rtx x = XVECEXP (vals, 0, i);
15368 if (matches[i][0] == maxelement)
15369 continue;
15370 x = copy_to_mode_reg (inner_mode, x);
15371 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15372 }
15373 return;
15374 }
15375
15376 /* Initialise a vector which is part-variable. We want to first try
15377 to build those lanes which are constant in the most efficient way we
15378 can. */
15379 if (n_var != n_elts)
15380 {
15381 rtx copy = copy_rtx (vals);
15382
15383 /* Load constant part of vector. We really don't care what goes into the
15384 parts we will overwrite, but we're more likely to be able to load the
15385 constant efficiently if it has fewer, larger, repeating parts
15386 (see aarch64_simd_valid_immediate). */
15387 for (int i = 0; i < n_elts; i++)
15388 {
15389 rtx x = XVECEXP (vals, 0, i);
15390 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15391 continue;
15392 rtx subst = any_const;
15393 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15394 {
15395 /* Look in the copied vector, as more elements are const. */
15396 rtx test = XVECEXP (copy, 0, i ^ bit);
15397 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15398 {
15399 subst = test;
15400 break;
15401 }
15402 }
15403 XVECEXP (copy, 0, i) = subst;
15404 }
15405 aarch64_expand_vector_init (target, copy);
15406 }
15407
15408 /* Insert the variable lanes directly. */
15409 for (int i = 0; i < n_elts; i++)
15410 {
15411 rtx x = XVECEXP (vals, 0, i);
15412 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15413 continue;
15414 x = copy_to_mode_reg (inner_mode, x);
15415 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15416 }
15417 }
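/* For example, initialising a V4SImode vector from {x, y, x, x}, where all
   four elements are variable, duplicates x across the register and then
   inserts y into lane 1, rather than performing four separate lane
   inserts.  */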
15418
15419 /* Emit RTL corresponding to:
15420 insr TARGET, ELEM. */
15421
15422 static void
15423 emit_insr (rtx target, rtx elem)
15424 {
15425 machine_mode mode = GET_MODE (target);
15426 scalar_mode elem_mode = GET_MODE_INNER (mode);
15427 elem = force_reg (elem_mode, elem);
15428
15429 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15430 gcc_assert (icode != CODE_FOR_nothing);
15431 emit_insn (GEN_FCN (icode) (target, target, elem));
15432 }
15433
15434 /* Subroutine of aarch64_sve_expand_vector_init for handling
15435 trailing constants.
15436 This function works as follows:
15437 (a) Create a new vector consisting of trailing constants.
15438 (b) Initialize TARGET with the constant vector using emit_move_insn.
15439 (c) Insert remaining elements in TARGET using insr.
15440 NELTS is the total number of elements in the original vector, while
15441 NELTS_REQD is the number of elements that are actually
15442 significant.
15443
15444 ??? The heuristic used is to do the above only if the number of constants
15445 is at least half the total number of elements. May need fine tuning. */
15446
15447 static bool
15448 aarch64_sve_expand_vector_init_handle_trailing_constants
15449 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15450 {
15451 machine_mode mode = GET_MODE (target);
15452 scalar_mode elem_mode = GET_MODE_INNER (mode);
15453 int n_trailing_constants = 0;
15454
15455 for (int i = nelts_reqd - 1;
15456 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15457 i--)
15458 n_trailing_constants++;
15459
15460 if (n_trailing_constants >= nelts_reqd / 2)
15461 {
15462 rtx_vector_builder v (mode, 1, nelts);
15463 for (int i = 0; i < nelts; i++)
15464 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15465 rtx const_vec = v.build ();
15466 emit_move_insn (target, const_vec);
15467
15468 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15469 emit_insr (target, builder.elt (i));
15470
15471 return true;
15472 }
15473
15474 return false;
15475 }
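/* For example, with {x, y, 1, 2} and NELTS_REQD == 4 there are two trailing
   constants, which meets the NELTS_REQD / 2 threshold: TARGET is first set to
   a constant vector whose leading elements are {1, 2, ...} and then y and x
   are shifted in from the front with INSR, leaving {x, y, 1, 2, ...}.  */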
15476
15477 /* Subroutine of aarch64_sve_expand_vector_init.
15478 Works as follows:
15479 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15480 (b) Skip trailing elements from BUILDER, which are the same as
15481 element NELTS_REQD - 1.
15482 (c) Insert earlier elements in reverse order in TARGET using insr. */
15483
15484 static void
15485 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15486 const rtx_vector_builder &builder,
15487 int nelts_reqd)
15488 {
15489 machine_mode mode = GET_MODE (target);
15490 scalar_mode elem_mode = GET_MODE_INNER (mode);
15491
15492 struct expand_operand ops[2];
15493 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15494 gcc_assert (icode != CODE_FOR_nothing);
15495
15496 create_output_operand (&ops[0], target, mode);
15497 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15498 expand_insn (icode, 2, ops);
15499
15500 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15501 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15502 emit_insr (target, builder.elt (i));
15503 }
15504
15505 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15506 when all trailing elements of builder are same.
15507 This works as follows:
15508 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15509 (b) Insert remaining elements in TARGET using insr.
15510
15511 ??? The heuristic used is to do the above if the number of identical trailing
15512 elements is at least 3/4 of the total number of elements, loosely based on
15513 heuristic from mostly_zeros_p. May need fine-tuning. */
15514
15515 static bool
15516 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15517 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15518 {
15519 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15520 if (ndups >= (3 * nelts_reqd) / 4)
15521 {
15522 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15523 nelts_reqd - ndups + 1);
15524 return true;
15525 }
15526
15527 return false;
15528 }
15529
15530 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15531 of elements in BUILDER.
15532
15533 The function tries to initialize TARGET from BUILDER if it fits one
15534 of the special cases outlined below.
15535
15536 Failing that, the function divides BUILDER into two sub-vectors:
15537 v_even = even elements of BUILDER;
15538 v_odd = odd elements of BUILDER;
15539
15540 and recursively calls itself with v_even and v_odd.
15541
15542 if (recursive call succeeded for v_even or v_odd)
15543 TARGET = zip (v_even, v_odd)
15544
15545 The function returns true if it managed to build TARGET from BUILDER
15546 with one of the special cases, false otherwise.
15547
15548 Example: {a, 1, b, 2, c, 3, d, 4}
15549
15550 The vector gets divided into:
15551 v_even = {a, b, c, d}
15552 v_odd = {1, 2, 3, 4}
15553
15554 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15555 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15556
15557 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15558 4 elements, so we construct tmp1 from v_even using insr:
15559 tmp1 = dup(d)
15560 insr tmp1, c
15561 insr tmp1, b
15562 insr tmp1, a
15563
15564 And finally:
15565 TARGET = zip (tmp1, tmp2)
15566 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15567
15568 static bool
15569 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15570 int nelts, int nelts_reqd)
15571 {
15572 machine_mode mode = GET_MODE (target);
15573
15574 /* Case 1: Vector contains trailing constants. */
15575
15576 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15577 (target, builder, nelts, nelts_reqd))
15578 return true;
15579
15580 /* Case 2: Vector contains leading constants. */
15581
15582 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15583 for (int i = 0; i < nelts_reqd; i++)
15584 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15585 rev_builder.finalize ();
15586
15587 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15588 (target, rev_builder, nelts, nelts_reqd))
15589 {
15590 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15591 return true;
15592 }
15593
15594 /* Case 3: Vector contains trailing same element. */
15595
15596 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15597 (target, builder, nelts_reqd))
15598 return true;
15599
15600 /* Case 4: Vector contains leading same element. */
15601
15602 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15603 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
15604 {
15605 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15606 return true;
15607 }
15608
15609 /* Avoid recursing below 4 elements.
15610 ??? The threshold 4 may need fine-tuning. */
15611
15612 if (nelts_reqd <= 4)
15613 return false;
15614
15615 rtx_vector_builder v_even (mode, 1, nelts);
15616 rtx_vector_builder v_odd (mode, 1, nelts);
15617
15618 for (int i = 0; i < nelts * 2; i += 2)
15619 {
15620 v_even.quick_push (builder.elt (i));
15621 v_odd.quick_push (builder.elt (i + 1));
15622 }
15623
15624 v_even.finalize ();
15625 v_odd.finalize ();
15626
15627 rtx tmp1 = gen_reg_rtx (mode);
15628 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
15629 nelts, nelts_reqd / 2);
15630
15631 rtx tmp2 = gen_reg_rtx (mode);
15632 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
15633 nelts, nelts_reqd / 2);
15634
15635 if (!did_even_p && !did_odd_p)
15636 return false;
15637
15638 /* Initialize v_even and v_odd using INSR if they didn't match any of the
15639 special cases, and then zip v_even and v_odd. */
15640
15641 if (!did_even_p)
15642 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
15643
15644 if (!did_odd_p)
15645 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
15646
15647 rtvec v = gen_rtvec (2, tmp1, tmp2);
15648 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
15649 return true;
15650 }
15651
15652 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15653
15654 void
15655 aarch64_sve_expand_vector_init (rtx target, rtx vals)
15656 {
15657 machine_mode mode = GET_MODE (target);
15658 int nelts = XVECLEN (vals, 0);
15659
15660 rtx_vector_builder v (mode, 1, nelts);
15661 for (int i = 0; i < nelts; i++)
15662 v.quick_push (XVECEXP (vals, 0, i));
15663 v.finalize ();
15664
15665 /* If neither sub-vector of v could be initialized specially,
15666 then use INSR to insert all elements from v into TARGET.
15667 ??? This might not be optimal for vectors with large
15668 initializers like 16-element or above.
15669 For nelts < 4, it probably isn't useful to handle specially. */
15670
15671 if (nelts < 4
15672 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
15673 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
15674 }
15675
15676 static unsigned HOST_WIDE_INT
15677 aarch64_shift_truncation_mask (machine_mode mode)
15678 {
15679 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15680 return 0;
15681 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15682 }
15683
15684 /* Select a format to encode pointers in exception handling data. */
15685 int
15686 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15687 {
15688 int type;
15689 switch (aarch64_cmodel)
15690 {
15691 case AARCH64_CMODEL_TINY:
15692 case AARCH64_CMODEL_TINY_PIC:
15693 case AARCH64_CMODEL_SMALL:
15694 case AARCH64_CMODEL_SMALL_PIC:
15695 case AARCH64_CMODEL_SMALL_SPIC:
15696 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15697 for everything. */
15698 type = DW_EH_PE_sdata4;
15699 break;
15700 default:
15701 /* No assumptions here. 8-byte relocs required. */
15702 type = DW_EH_PE_sdata8;
15703 break;
15704 }
15705 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15706 }
15707
15708 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15709
15710 static void
15711 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15712 {
15713 if (aarch64_simd_decl_p (decl))
15714 {
15715 fprintf (stream, "\t.variant_pcs\t");
15716 assemble_name (stream, name);
15717 fprintf (stream, "\n");
15718 }
15719 }
15720
15721 /* The last .arch and .tune assembly strings that we printed. */
15722 static std::string aarch64_last_printed_arch_string;
15723 static std::string aarch64_last_printed_tune_string;
15724
15725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15726 by the function fndecl. */
15727
15728 void
15729 aarch64_declare_function_name (FILE *stream, const char* name,
15730 tree fndecl)
15731 {
15732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15733
15734 struct cl_target_option *targ_options;
15735 if (target_parts)
15736 targ_options = TREE_TARGET_OPTION (target_parts);
15737 else
15738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15739 gcc_assert (targ_options);
15740
15741 const struct processor *this_arch
15742 = aarch64_get_arch (targ_options->x_explicit_arch);
15743
15744 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
15745 std::string extension
15746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15747 this_arch->flags);
15748 /* Only update the assembler .arch string if it is distinct from the last
15749 such string we printed. */
15750 std::string to_print = this_arch->name + extension;
15751 if (to_print != aarch64_last_printed_arch_string)
15752 {
15753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15754 aarch64_last_printed_arch_string = to_print;
15755 }
15756
15757 /* Print the cpu name we're tuning for in the comments; it might be
15758 useful to readers of the generated asm. Do it only when it changes
15759 from function to function and verbose assembly is requested. */
15760 const struct processor *this_tune
15761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15762
15763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15764 {
15765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15766 this_tune->name);
15767 aarch64_last_printed_tune_string = this_tune->name;
15768 }
15769
15770 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15771
15772 /* Don't forget the type directive for ELF. */
15773 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15774 ASM_OUTPUT_LABEL (stream, name);
15775 }
15776
15777 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15778
15779 void
15780 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15781 {
15782 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15783 const char *value = IDENTIFIER_POINTER (target);
15784 aarch64_asm_output_variant_pcs (stream, decl, name);
15785 ASM_OUTPUT_DEF (stream, name, value);
15786 }
15787
15788 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15789 function symbol references. */
15790
15791 void
15792 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15793 {
15794 default_elf_asm_output_external (stream, decl, name);
15795 aarch64_asm_output_variant_pcs (stream, decl, name);
15796 }
15797
15798 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15799 Used to output the .cfi_b_key_frame directive when signing the current
15800 function with the B key. */
15801
15802 void
15803 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
15804 {
15805 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
15806 && aarch64_ra_sign_key == AARCH64_KEY_B)
15807 asm_fprintf (f, "\t.cfi_b_key_frame\n");
15808 }
15809
15810 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15811
15812 static void
15813 aarch64_start_file (void)
15814 {
15815 struct cl_target_option *default_options
15816 = TREE_TARGET_OPTION (target_option_default_node);
15817
15818 const struct processor *default_arch
15819 = aarch64_get_arch (default_options->x_explicit_arch);
15820 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
15821 std::string extension
15822 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15823 default_arch->flags);
15824
15825 aarch64_last_printed_arch_string = default_arch->name + extension;
15826 aarch64_last_printed_tune_string = "";
15827 asm_fprintf (asm_out_file, "\t.arch %s\n",
15828 aarch64_last_printed_arch_string.c_str ());
15829
15830 default_file_start ();
15831 }
15832
15833 /* Emit load exclusive. */
15834
15835 static void
15836 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15837 rtx mem, rtx model_rtx)
15838 {
15839 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15840 }
15841
15842 /* Emit store exclusive. */
15843
15844 static void
15845 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15846 rtx mem, rtx rval, rtx model_rtx)
15847 {
15848 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
15849 }
15850
15851 /* Emit the jump instruction INSN and mark it as unlikely to be taken. */
15852
15853 static void
15854 aarch64_emit_unlikely_jump (rtx insn)
15855 {
15856 rtx_insn *jump = emit_jump_insn (insn);
15857 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15858 }
15859
15860 /* Expand a compare and swap pattern. */
15861
15862 void
15863 aarch64_expand_compare_and_swap (rtx operands[])
15864 {
15865 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15866 machine_mode mode, r_mode;
15867
15868 bval = operands[0];
15869 rval = operands[1];
15870 mem = operands[2];
15871 oldval = operands[3];
15872 newval = operands[4];
15873 is_weak = operands[5];
15874 mod_s = operands[6];
15875 mod_f = operands[7];
15876 mode = GET_MODE (mem);
15877
15878 /* Normally the succ memory model must be stronger than fail, but in the
15879 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15880 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15881 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15882 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15883 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15884
15885 r_mode = mode;
15886 if (mode == QImode || mode == HImode)
15887 {
15888 r_mode = SImode;
15889 rval = gen_reg_rtx (r_mode);
15890 }
15891
15892 if (TARGET_LSE)
15893 {
15894 /* The CAS insn requires oldval and rval overlap, but we need to
15895 have a copy of oldval saved across the operation to tell if
15896 the operation is successful. */
15897 if (reg_overlap_mentioned_p (rval, oldval))
15898 rval = copy_to_mode_reg (r_mode, oldval);
15899 else
15900 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15901
15902 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15903 newval, mod_s));
15904 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15905 }
15906 else
15907 {
15908 /* The oldval predicate varies by mode. Test it and force to reg. */
15909 insn_code code = code_for_aarch64_compare_and_swap (mode);
15910 if (!insn_data[code].operand[2].predicate (oldval, mode))
15911 oldval = force_reg (mode, oldval);
15912
15913 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15914 is_weak, mod_s, mod_f));
15915 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15916 }
15917
15918 if (r_mode != mode)
15919 rval = gen_lowpart (mode, rval);
15920 emit_move_insn (operands[1], rval);
15921
15922 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15923 emit_insn (gen_rtx_SET (bval, x));
15924 }
15925
15926 /* Emit a barrier appropriate for memory model MODEL at the end of a
15927 sequence implementing an atomic operation. */
15928
15929 static void
15930 aarch64_emit_post_barrier (enum memmodel model)
15931 {
15932 const enum memmodel base_model = memmodel_base (model);
15933
15934 if (is_mm_sync (model)
15935 && (base_model == MEMMODEL_ACQUIRE
15936 || base_model == MEMMODEL_ACQ_REL
15937 || base_model == MEMMODEL_SEQ_CST))
15938 {
15939 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15940 }
15941 }
15942
15943 /* Split a compare and swap pattern. */
15944
15945 void
15946 aarch64_split_compare_and_swap (rtx operands[])
15947 {
15948 rtx rval, mem, oldval, newval, scratch;
15949 machine_mode mode;
15950 bool is_weak;
15951 rtx_code_label *label1, *label2;
15952 rtx x, cond;
15953 enum memmodel model;
15954 rtx model_rtx;
15955
15956 rval = operands[0];
15957 mem = operands[1];
15958 oldval = operands[2];
15959 newval = operands[3];
15960 is_weak = (operands[4] != const0_rtx);
15961 model_rtx = operands[5];
15962 scratch = operands[7];
15963 mode = GET_MODE (mem);
15964 model = memmodel_from_int (INTVAL (model_rtx));
15965
15966 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15967 loop:
15968 .label1:
15969 LD[A]XR rval, [mem]
15970 CBNZ rval, .label2
15971 ST[L]XR scratch, newval, [mem]
15972 CBNZ scratch, .label1
15973 .label2:
15974 CMP rval, 0. */
15975 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15976
15977 label1 = NULL;
15978 if (!is_weak)
15979 {
15980 label1 = gen_label_rtx ();
15981 emit_label (label1);
15982 }
15983 label2 = gen_label_rtx ();
15984
15985 /* The initial load can be relaxed for a __sync operation since a final
15986 barrier will be emitted to stop code hoisting. */
15987 if (is_mm_sync (model))
15988 aarch64_emit_load_exclusive (mode, rval, mem,
15989 GEN_INT (MEMMODEL_RELAXED));
15990 else
15991 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15992
15993 if (strong_zero_p)
15994 {
15995 if (aarch64_track_speculation)
15996 {
15997 /* Emit an explicit compare instruction, so that we can correctly
15998 track the condition codes. */
15999 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16000 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16001 }
16002 else
16003 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16004
16005 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16006 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16007 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16008 }
16009 else
16010 {
16011 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16012 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16013 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16014 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16015 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16016 }
16017
16018 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16019
16020 if (!is_weak)
16021 {
16022 if (aarch64_track_speculation)
16023 {
16024 /* Emit an explicit compare instruction, so that we can correctly
16025 track the condition codes. */
16026 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16027 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16028 }
16029 else
16030 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16031
16032 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16033 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16034 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16035 }
16036 else
16037 {
16038 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16039 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16040 emit_insn (gen_rtx_SET (cond, x));
16041 }
16042
16043 emit_label (label2);
16044 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16045 to set the condition flags. If this is not used it will be removed by
16046 later passes. */
16047 if (strong_zero_p)
16048 {
16049 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16050 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16051 emit_insn (gen_rtx_SET (cond, x));
16052 }
16053 /* Emit any final barrier needed for a __sync operation. */
16054 if (is_mm_sync (model))
16055 aarch64_emit_post_barrier (model);
16056 }
16057
16058 /* Split an atomic operation. */
16059
16060 void
16061 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16062 rtx value, rtx model_rtx, rtx cond)
16063 {
16064 machine_mode mode = GET_MODE (mem);
16065 machine_mode wmode = (mode == DImode ? DImode : SImode);
16066 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16067 const bool is_sync = is_mm_sync (model);
16068 rtx_code_label *label;
16069 rtx x;
16070
16071 /* Split the atomic operation into a sequence. */
16072 label = gen_label_rtx ();
16073 emit_label (label);
16074
16075 if (new_out)
16076 new_out = gen_lowpart (wmode, new_out);
16077 if (old_out)
16078 old_out = gen_lowpart (wmode, old_out);
16079 else
16080 old_out = new_out;
16081 value = simplify_gen_subreg (wmode, value, mode, 0);
16082
16083 /* The initial load can be relaxed for a __sync operation since a final
16084 barrier will be emitted to stop code hoisting. */
16085 if (is_sync)
16086 aarch64_emit_load_exclusive (mode, old_out, mem,
16087 GEN_INT (MEMMODEL_RELAXED));
16088 else
16089 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16090
16091 switch (code)
16092 {
16093 case SET:
16094 new_out = value;
16095 break;
16096
16097 case NOT:
16098 x = gen_rtx_AND (wmode, old_out, value);
16099 emit_insn (gen_rtx_SET (new_out, x));
16100 x = gen_rtx_NOT (wmode, new_out);
16101 emit_insn (gen_rtx_SET (new_out, x));
16102 break;
16103
16104 case MINUS:
16105 if (CONST_INT_P (value))
16106 {
16107 value = GEN_INT (-INTVAL (value));
16108 code = PLUS;
16109 }
16110 /* Fall through. */
16111
16112 default:
16113 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16114 emit_insn (gen_rtx_SET (new_out, x));
16115 break;
16116 }
16117
16118 aarch64_emit_store_exclusive (mode, cond, mem,
16119 gen_lowpart (mode, new_out), model_rtx);
16120
16121 if (aarch64_track_speculation)
16122 {
16123 /* Emit an explicit compare instruction, so that we can correctly
16124 track the condition codes. */
16125 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16126 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16127 }
16128 else
16129 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16130
16131 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16132 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16133 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16134
16135 /* Emit any final barrier needed for a __sync operation. */
16136 if (is_sync)
16137 aarch64_emit_post_barrier (model);
16138 }
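/* As a rough sketch, splitting an atomic fetch-and-add on an SImode location
   with a relaxed memory model yields an LL/SC loop of roughly this shape
   (register names are illustrative only):

	.label:
	ldxr	w0, [x2]	// load-exclusive the old value
	add	w1, w0, w3	// perform the operation in wmode
	stxr	w4, w1, [x2]	// store-exclusive the new value
	cbnz	w4, .label	// retry if the exclusive store failed  */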
16139
16140 static void
16141 aarch64_init_libfuncs (void)
16142 {
16143 /* Half-precision float operations. The compiler handles all operations
16144 with NULL libfuncs by converting to SFmode. */
16145
16146 /* Conversions. */
16147 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16148 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16149
16150 /* Arithmetic. */
16151 set_optab_libfunc (add_optab, HFmode, NULL);
16152 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16153 set_optab_libfunc (smul_optab, HFmode, NULL);
16154 set_optab_libfunc (neg_optab, HFmode, NULL);
16155 set_optab_libfunc (sub_optab, HFmode, NULL);
16156
16157 /* Comparisons. */
16158 set_optab_libfunc (eq_optab, HFmode, NULL);
16159 set_optab_libfunc (ne_optab, HFmode, NULL);
16160 set_optab_libfunc (lt_optab, HFmode, NULL);
16161 set_optab_libfunc (le_optab, HFmode, NULL);
16162 set_optab_libfunc (ge_optab, HFmode, NULL);
16163 set_optab_libfunc (gt_optab, HFmode, NULL);
16164 set_optab_libfunc (unord_optab, HFmode, NULL);
16165 }
16166
16167 /* Target hook for c_mode_for_suffix. */
16168 static machine_mode
16169 aarch64_c_mode_for_suffix (char suffix)
16170 {
16171 if (suffix == 'q')
16172 return TFmode;
16173
16174 return VOIDmode;
16175 }
16176
16177 /* We can only represent floating point constants which will fit in
16178 "quarter-precision" values. These values are characterised by
16179 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16180 by:
16181
16182 (-1)^s * (n/16) * 2^r
16183
16184 Where:
16185 's' is the sign bit.
16186 'n' is an integer in the range 16 <= n <= 31.
16187 'r' is an integer in the range -3 <= r <= 4. */
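/* For example, 0.5 = (-1)^0 * (16/16) * 2^-1 and 31.0 = (-1)^0 * (31/16) * 2^4
   are both representable, whereas 0.1 has no exact binary encoding and 0.0
   is explicitly rejected (it is handled elsewhere).  */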
16188
16189 /* Return true iff X can be represented as a quarter-precision
16190 floating point immediate operand. Note, we cannot represent 0.0. */
16191 bool
16192 aarch64_float_const_representable_p (rtx x)
16193 {
16194 /* This represents our current view of how many bits
16195 make up the mantissa. */
16196 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16197 int exponent;
16198 unsigned HOST_WIDE_INT mantissa, mask;
16199 REAL_VALUE_TYPE r, m;
16200 bool fail;
16201
16202 if (!CONST_DOUBLE_P (x))
16203 return false;
16204
16205 if (GET_MODE (x) == VOIDmode
16206 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16207 return false;
16208
16209 r = *CONST_DOUBLE_REAL_VALUE (x);
16210
16211 /* We cannot represent infinities, NaNs or +/-zero. We won't
16212 know if we have +zero until we analyse the mantissa, but we
16213 can reject the other invalid values. */
16214 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16215 || REAL_VALUE_MINUS_ZERO (r))
16216 return false;
16217
16218 /* Extract exponent. */
16219 r = real_value_abs (&r);
16220 exponent = REAL_EXP (&r);
16221
16222 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
16223 highest (sign) bit, with a fixed binary point at bit point_pos.
16224 The low element of W holds the low part of the mantissa, the high element the high part.
16225 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16226 bits for the mantissa, this can fail (low bits will be lost). */
16227 real_ldexp (&m, &r, point_pos - exponent);
16228 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16229
16230 /* If the low part of the mantissa has bits set we cannot represent
16231 the value. */
16232 if (w.ulow () != 0)
16233 return false;
16234 /* We have rejected the lower HOST_WIDE_INT, so update our
16235 understanding of how many bits lie in the mantissa and
16236 look only at the high HOST_WIDE_INT. */
16237 mantissa = w.elt (1);
16238 point_pos -= HOST_BITS_PER_WIDE_INT;
16239
16240 /* We can only represent values with a mantissa of the form 1.xxxx. */
16241 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16242 if ((mantissa & mask) != 0)
16243 return false;
16244
16245 /* Having filtered unrepresentable values, we may now remove all
16246 but the highest 5 bits. */
16247 mantissa >>= point_pos - 5;
16248
16249 /* We cannot represent the value 0.0, so reject it. This is handled
16250 elsewhere. */
16251 if (mantissa == 0)
16252 return false;
16253
16254 /* Then, as bit 4 is always set, we can mask it off, leaving
16255 the mantissa in the range [0, 15]. */
16256 mantissa &= ~(1 << 4);
16257 gcc_assert (mantissa <= 15);
16258
16259 /* GCC internally does not use IEEE754-like encoding (where normalized
16260 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
16261 Our mantissa values are shifted 4 places to the left relative to
16262 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16263 by 5 places to correct for GCC's representation. */
16264 exponent = 5 - exponent;
16265
16266 return (exponent >= 0 && exponent <= 7);
16267 }
16268
16269 /* Return the string with the instruction for the AdvSIMD MOVI, MVNI, ORR or BIC
16270 immediate for a CONST_VECTOR of width WIDTH bits. WHICH selects whether to
16271 output a MOVI/MVNI, ORR or BIC immediate. */
16272 char*
16273 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16274 enum simd_immediate_check which)
16275 {
16276 bool is_valid;
16277 static char templ[40];
16278 const char *mnemonic;
16279 const char *shift_op;
16280 unsigned int lane_count = 0;
16281 char element_char;
16282
16283 struct simd_immediate_info info;
16284
16285 /* This will return true to show const_vector is legal for use as either
16286 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16287 It will also update INFO to show how the immediate should be generated.
16288 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16289 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16290 gcc_assert (is_valid);
16291
16292 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16293 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16294
16295 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16296 {
16297 gcc_assert (info.insn == simd_immediate_info::MOV
16298 && info.u.mov.shift == 0);
16299 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16300 move immediate path. */
16301 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16302 info.u.mov.value = GEN_INT (0);
16303 else
16304 {
16305 const unsigned int buf_size = 20;
16306 char float_buf[buf_size] = {'\0'};
16307 real_to_decimal_for_mode (float_buf,
16308 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16309 buf_size, buf_size, 1, info.elt_mode);
16310
16311 if (lane_count == 1)
16312 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16313 else
16314 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16315 lane_count, element_char, float_buf);
16316 return templ;
16317 }
16318 }
16319
16320 gcc_assert (CONST_INT_P (info.u.mov.value));
16321
16322 if (which == AARCH64_CHECK_MOV)
16323 {
16324 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16325 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
16326 ? "msl" : "lsl");
16327 if (lane_count == 1)
16328 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16329 mnemonic, UINTVAL (info.u.mov.value));
16330 else if (info.u.mov.shift)
16331 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16332 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16333 element_char, UINTVAL (info.u.mov.value), shift_op,
16334 info.u.mov.shift);
16335 else
16336 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16337 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16338 element_char, UINTVAL (info.u.mov.value));
16339 }
16340 else
16341 {
16342 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16343 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16344 if (info.u.mov.shift)
16345 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16346 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16347 element_char, UINTVAL (info.u.mov.value), "lsl",
16348 info.u.mov.shift);
16349 else
16350 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16351 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16352 element_char, UINTVAL (info.u.mov.value));
16353 }
16354 return templ;
16355 }
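/* As a rough example of the templates built above (exact spelling comes from
   the snprintf formats): a V4HI vector with every element equal to 0x1200
   would be emitted as something like "movi v0.4h, 0x12, lsl 8", with the
   inverted pattern using "mvni" instead.  The ORR/BIC variants print the
   immediate in decimal with a '#' prefix.  */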
16356
16357 char*
16358 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16359 {
16360
16361 /* If a floating point number was passed and we desire to use it in an
16362 integer mode, do the conversion to integer. */
16363 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16364 {
16365 unsigned HOST_WIDE_INT ival;
16366 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16367 gcc_unreachable ();
16368 immediate = gen_int_mode (ival, mode);
16369 }
16370
16371 machine_mode vmode;
16372 /* Use a 64-bit container mode for everything except DImode/DFmode, where we use
16373 a 128-bit vector mode. */
16374 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16375
16376 vmode = aarch64_simd_container_mode (mode, width);
16377 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16378 return aarch64_output_simd_mov_immediate (v_op, width);
16379 }
16380
16381 /* Return the output string to use for moving immediate CONST_VECTOR
16382 into an SVE register. */
16383
16384 char *
16385 aarch64_output_sve_mov_immediate (rtx const_vector)
16386 {
16387 static char templ[40];
16388 struct simd_immediate_info info;
16389 char element_char;
16390
16391 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16392 gcc_assert (is_valid);
16393
16394 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16395
16396 if (info.insn == simd_immediate_info::INDEX)
16397 {
16398 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16399 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16400 element_char, INTVAL (info.u.index.base),
16401 INTVAL (info.u.index.step));
16402 return templ;
16403 }
16404
16405 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16406 {
16407 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16408 info.u.mov.value = GEN_INT (0);
16409 else
16410 {
16411 const int buf_size = 20;
16412 char float_buf[buf_size] = {};
16413 real_to_decimal_for_mode (float_buf,
16414 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16415 buf_size, buf_size, 1, info.elt_mode);
16416
16417 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16418 element_char, float_buf);
16419 return templ;
16420 }
16421 }
16422
16423 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16424 element_char, INTVAL (info.u.mov.value));
16425 return templ;
16426 }
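/* For illustration: a stepped integer constant such as { 0, 1, 2, ... } in a
   .s vector prints as something like "index z0.s, #0, #1", an integer splat
   of 5 as "mov z0.s, #5", and a non-zero FP splat goes through the "fmov"
   branch with the value rendered in decimal by real_to_decimal_for_mode.  */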
16427
16428 /* Return the asm format for a PTRUE instruction whose destination has
16429 mode MODE. SUFFIX is the element size suffix. */
16430
16431 char *
16432 aarch64_output_ptrue (machine_mode mode, char suffix)
16433 {
16434 unsigned int nunits;
16435 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16436 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16437 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16438 else
16439 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16440 return buf;
16441 }
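/* For example, with a fixed vector length of 128 bits (-msve-vector-bits=128)
   a VNx16BI destination gives "ptrue p0.b, vl16", whereas with a variable
   vector length the element count is unknown at compile time and we fall
   back to "ptrue p0.b, all".  */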
16442
16443 /* Split operands into moves from op[1] + op[2] into op[0]. */
16444
16445 void
16446 aarch64_split_combinev16qi (rtx operands[3])
16447 {
16448 unsigned int dest = REGNO (operands[0]);
16449 unsigned int src1 = REGNO (operands[1]);
16450 unsigned int src2 = REGNO (operands[2]);
16451 machine_mode halfmode = GET_MODE (operands[1]);
16452 unsigned int halfregs = REG_NREGS (operands[1]);
16453 rtx destlo, desthi;
16454
16455 gcc_assert (halfmode == V16QImode);
16456
16457 if (src1 == dest && src2 == dest + halfregs)
16458 {
16459 /* No-op move. Can't split to nothing; emit something. */
16460 emit_note (NOTE_INSN_DELETED);
16461 return;
16462 }
16463
16464 /* Preserve register attributes for variable tracking. */
16465 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16466 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16467 GET_MODE_SIZE (halfmode));
16468
16469 /* Special case of reversed high/low parts. */
16470 if (reg_overlap_mentioned_p (operands[2], destlo)
16471 && reg_overlap_mentioned_p (operands[1], desthi))
16472 {
16473 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16474 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16475 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16476 }
16477 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16478 {
16479 /* Try to avoid unnecessary moves if part of the result
16480 is in the right place already. */
16481 if (src1 != dest)
16482 emit_move_insn (destlo, operands[1]);
16483 if (src2 != dest + halfregs)
16484 emit_move_insn (desthi, operands[2]);
16485 }
16486 else
16487 {
16488 if (src2 != dest + halfregs)
16489 emit_move_insn (desthi, operands[2]);
16490 if (src1 != dest)
16491 emit_move_insn (destlo, operands[1]);
16492 }
16493 }
16494
16495 /* vec_perm support. */
16496
16497 struct expand_vec_perm_d
16498 {
16499 rtx target, op0, op1;
16500 vec_perm_indices perm;
16501 machine_mode vmode;
16502 unsigned int vec_flags;
16503 bool one_vector_p;
16504 bool testing_p;
16505 };
16506
16507 /* Generate a variable permutation. */
16508
16509 static void
16510 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16511 {
16512 machine_mode vmode = GET_MODE (target);
16513 bool one_vector_p = rtx_equal_p (op0, op1);
16514
16515 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16516 gcc_checking_assert (GET_MODE (op0) == vmode);
16517 gcc_checking_assert (GET_MODE (op1) == vmode);
16518 gcc_checking_assert (GET_MODE (sel) == vmode);
16519 gcc_checking_assert (TARGET_SIMD);
16520
16521 if (one_vector_p)
16522 {
16523 if (vmode == V8QImode)
16524 {
16525 /* Expand the argument to a V16QI mode by duplicating it. */
16526 rtx pair = gen_reg_rtx (V16QImode);
16527 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16528 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16529 }
16530 else
16531 {
16532 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16533 }
16534 }
16535 else
16536 {
16537 rtx pair;
16538
16539 if (vmode == V8QImode)
16540 {
16541 pair = gen_reg_rtx (V16QImode);
16542 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16543 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16544 }
16545 else
16546 {
16547 pair = gen_reg_rtx (OImode);
16548 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16549 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16550 }
16551 }
16552 }
16553
16554 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16555 NELT is the number of elements in the vector. */
16556
16557 void
16558 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16559 unsigned int nelt)
16560 {
16561 machine_mode vmode = GET_MODE (target);
16562 bool one_vector_p = rtx_equal_p (op0, op1);
16563 rtx mask;
16564
16565 /* The TBL instruction does not use a modulo index, so we must take care
16566 of that ourselves. */
16567 mask = aarch64_simd_gen_const_vector_dup (vmode,
16568 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16569 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16570
16571 /* For big-endian, we also need to reverse the index within the vector
16572 (but not which vector). */
16573 if (BYTES_BIG_ENDIAN)
16574 {
16575 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16576 if (!one_vector_p)
16577 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16578 sel = expand_simple_binop (vmode, XOR, sel, mask,
16579 NULL, 0, OPTAB_LIB_WIDEN);
16580 }
16581 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16582 }
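/* As a concrete example of the masking above: for a two-input V16QI permute
   the mask is 2 * 16 - 1 = 31, so a selector value of 33 is reduced to
   33 & 31 = 1 and selects byte 1 of the first input, matching the wrapping
   behaviour that vec_perm requires but that TBL does not provide.  */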
16583
16584 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16585
16586 static void
16587 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16588 {
16589 emit_insn (gen_rtx_SET (target,
16590 gen_rtx_UNSPEC (GET_MODE (target),
16591 gen_rtvec (2, op0, op1), code)));
16592 }
16593
16594 /* Expand an SVE vec_perm with the given operands. */
16595
16596 void
16597 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16598 {
16599 machine_mode data_mode = GET_MODE (target);
16600 machine_mode sel_mode = GET_MODE (sel);
16601 /* Enforced by the pattern condition. */
16602 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16603
16604 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16605 size of the two value vectors, i.e. the upper bits of the indices
16606 are effectively ignored. SVE TBL instead produces 0 for any
16607 out-of-range indices, so we need to modulo all the vec_perm indices
16608 to ensure they are all in range. */
16609 rtx sel_reg = force_reg (sel_mode, sel);
16610
16611 /* Check if the sel only references the first values vector. */
16612 if (GET_CODE (sel) == CONST_VECTOR
16613 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16614 {
16615 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16616 return;
16617 }
16618
16619 /* Check if the two values vectors are the same. */
16620 if (rtx_equal_p (op0, op1))
16621 {
16622 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16623 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16624 NULL, 0, OPTAB_DIRECT);
16625 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16626 return;
16627 }
16628
16629 /* Run TBL on each value vector and combine the results. */
16630
16631 rtx res0 = gen_reg_rtx (data_mode);
16632 rtx res1 = gen_reg_rtx (data_mode);
16633 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16634 if (GET_CODE (sel) != CONST_VECTOR
16635 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16636 {
16637 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16638 2 * nunits - 1);
16639 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16640 NULL, 0, OPTAB_DIRECT);
16641 }
16642 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16643 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16644 NULL, 0, OPTAB_DIRECT);
16645 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16646 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16647 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16648 else
16649 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16650 }
16651
16652 /* Recognize patterns suitable for the TRN instructions. */
16653 static bool
16654 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16655 {
16656 HOST_WIDE_INT odd;
16657 poly_uint64 nelt = d->perm.length ();
16658 rtx out, in0, in1, x;
16659 machine_mode vmode = d->vmode;
16660
16661 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16662 return false;
16663
16664 /* Note that these are little-endian tests.
16665 We correct for big-endian later. */
16666 if (!d->perm[0].is_constant (&odd)
16667 || (odd != 0 && odd != 1)
16668 || !d->perm.series_p (0, 2, odd, 2)
16669 || !d->perm.series_p (1, 2, nelt + odd, 2))
16670 return false;
16671
16672 /* Success! */
16673 if (d->testing_p)
16674 return true;
16675
16676 in0 = d->op0;
16677 in1 = d->op1;
16678 /* We don't need a big-endian lane correction for SVE; see the comment
16679 at the head of aarch64-sve.md for details. */
16680 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16681 {
16682 x = in0, in0 = in1, in1 = x;
16683 odd = !odd;
16684 }
16685 out = d->target;
16686
16687 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16688 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16689 return true;
16690 }
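/* For instance, on a little-endian V4SI target the permutation { 0, 4, 2, 6 }
   passes the tests above with ODD == 0 and maps to TRN1, while { 1, 5, 3, 7 }
   has ODD == 1 and maps to TRN2.  */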
16691
16692 /* Recognize patterns suitable for the UZP instructions. */
16693 static bool
16694 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16695 {
16696 HOST_WIDE_INT odd;
16697 rtx out, in0, in1, x;
16698 machine_mode vmode = d->vmode;
16699
16700 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16701 return false;
16702
16703 /* Note that these are little-endian tests.
16704 We correct for big-endian later. */
16705 if (!d->perm[0].is_constant (&odd)
16706 || (odd != 0 && odd != 1)
16707 || !d->perm.series_p (0, 1, odd, 2))
16708 return false;
16709
16710 /* Success! */
16711 if (d->testing_p)
16712 return true;
16713
16714 in0 = d->op0;
16715 in1 = d->op1;
16716 /* We don't need a big-endian lane correction for SVE; see the comment
16717 at the head of aarch64-sve.md for details. */
16718 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16719 {
16720 x = in0, in0 = in1, in1 = x;
16721 odd = !odd;
16722 }
16723 out = d->target;
16724
16725 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16726 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16727 return true;
16728 }
16729
16730 /* Recognize patterns suitable for the ZIP instructions. */
16731 static bool
16732 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16733 {
16734 unsigned int high;
16735 poly_uint64 nelt = d->perm.length ();
16736 rtx out, in0, in1, x;
16737 machine_mode vmode = d->vmode;
16738
16739 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16740 return false;
16741
16742 /* Note that these are little-endian tests.
16743 We correct for big-endian later. */
16744 poly_uint64 first = d->perm[0];
16745 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16746 || !d->perm.series_p (0, 2, first, 1)
16747 || !d->perm.series_p (1, 2, first + nelt, 1))
16748 return false;
16749 high = maybe_ne (first, 0U);
16750
16751 /* Success! */
16752 if (d->testing_p)
16753 return true;
16754
16755 in0 = d->op0;
16756 in1 = d->op1;
16757 /* We don't need a big-endian lane correction for SVE; see the comment
16758 at the head of aarch64-sve.md for details. */
16759 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16760 {
16761 x = in0, in0 = in1, in1 = x;
16762 high = !high;
16763 }
16764 out = d->target;
16765
16766 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16767 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16768 return true;
16769 }
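/* For instance, on a little-endian V4SI target the permutation { 0, 4, 1, 5 }
   interleaves the low halves of the inputs and maps to ZIP1, while
   { 2, 6, 3, 7 } interleaves the high halves and maps to ZIP2.  */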
16770
16771 /* Recognize patterns for the EXT insn. */
16772
16773 static bool
16774 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16775 {
16776 HOST_WIDE_INT location;
16777 rtx offset;
16778
16779 /* The first element always refers to the first vector.
16780 Check if the extracted indices are increasing by one. */
16781 if (d->vec_flags == VEC_SVE_PRED
16782 || !d->perm[0].is_constant (&location)
16783 || !d->perm.series_p (0, 1, location, 1))
16784 return false;
16785
16786 /* Success! */
16787 if (d->testing_p)
16788 return true;
16789
16790 /* The case where (location == 0) is a no-op for both big- and little-endian,
16791 and is removed by the mid-end at optimization levels -O1 and higher.
16792
16793 We don't need a big-endian lane correction for SVE; see the comment
16794 at the head of aarch64-sve.md for details. */
16795 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16796 {
16797 /* After setup, we want the high elements of the first vector (stored
16798 at the LSB end of the register), and the low elements of the second
16799 vector (stored at the MSB end of the register). So swap. */
16800 std::swap (d->op0, d->op1);
16801 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16802 to_constant () is safe since this is restricted to Advanced SIMD
16803 vectors. */
16804 location = d->perm.length ().to_constant () - location;
16805 }
16806
16807 offset = GEN_INT (location);
16808 emit_set_insn (d->target,
16809 gen_rtx_UNSPEC (d->vmode,
16810 gen_rtvec (3, d->op0, d->op1, offset),
16811 UNSPEC_EXT));
16812 return true;
16813 }
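/* For instance, a little-endian V16QI permutation of { 3, 4, ..., 18 }
   extracts 16 consecutive bytes starting at byte 3 of the concatenated
   inputs and is emitted as something like "ext v0.16b, v1.16b, v2.16b, #3".  */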
16814
16815 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16816 within each 64-bit, 32-bit or 16-bit granule. */
16817
16818 static bool
16819 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16820 {
16821 HOST_WIDE_INT diff;
16822 unsigned int i, size, unspec;
16823 machine_mode pred_mode;
16824
16825 if (d->vec_flags == VEC_SVE_PRED
16826 || !d->one_vector_p
16827 || !d->perm[0].is_constant (&diff))
16828 return false;
16829
16830 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16831 if (size == 8)
16832 {
16833 unspec = UNSPEC_REV64;
16834 pred_mode = VNx2BImode;
16835 }
16836 else if (size == 4)
16837 {
16838 unspec = UNSPEC_REV32;
16839 pred_mode = VNx4BImode;
16840 }
16841 else if (size == 2)
16842 {
16843 unspec = UNSPEC_REV16;
16844 pred_mode = VNx8BImode;
16845 }
16846 else
16847 return false;
16848
16849 unsigned int step = diff + 1;
16850 for (i = 0; i < step; ++i)
16851 if (!d->perm.series_p (i, step, diff - i, step))
16852 return false;
16853
16854 /* Success! */
16855 if (d->testing_p)
16856 return true;
16857
16858 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16859 if (d->vec_flags == VEC_SVE_DATA)
16860 {
16861 rtx pred = aarch64_ptrue_reg (pred_mode);
16862 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16863 UNSPEC_MERGE_PTRUE);
16864 }
16865 emit_set_insn (d->target, src);
16866 return true;
16867 }
16868
16869 /* Recognize patterns for the REV insn, which reverses elements within
16870 a full vector. */
16871
16872 static bool
16873 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16874 {
16875 poly_uint64 nelt = d->perm.length ();
16876
16877 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16878 return false;
16879
16880 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16881 return false;
16882
16883 /* Success! */
16884 if (d->testing_p)
16885 return true;
16886
16887 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16888 emit_set_insn (d->target, src);
16889 return true;
16890 }
16891
16892 static bool
16893 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16894 {
16895 rtx out = d->target;
16896 rtx in0;
16897 HOST_WIDE_INT elt;
16898 machine_mode vmode = d->vmode;
16899 rtx lane;
16900
16901 if (d->vec_flags == VEC_SVE_PRED
16902 || d->perm.encoding ().encoded_nelts () != 1
16903 || !d->perm[0].is_constant (&elt))
16904 return false;
16905
16906 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16907 return false;
16908
16909 /* Success! */
16910 if (d->testing_p)
16911 return true;
16912
16913 /* The generic preparation in aarch64_expand_vec_perm_const_1
16914 swaps the operand order and the permute indices if it finds
16915 d->perm[0] to be in the second operand. Thus, we can always
16916 use d->op0 and need not do any extra arithmetic to get the
16917 correct lane number. */
16918 in0 = d->op0;
16919 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16920
16921 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16922 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16923 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16924 return true;
16925 }
16926
16927 static bool
16928 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16929 {
16930 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16931 machine_mode vmode = d->vmode;
16932
16933 /* Make sure that the indices are constant. */
16934 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16935 for (unsigned int i = 0; i < encoded_nelts; ++i)
16936 if (!d->perm[i].is_constant ())
16937 return false;
16938
16939 if (d->testing_p)
16940 return true;
16941
16942 /* Generic code will try constant permutation twice: once with the
16943 original mode and again with the elements lowered to QImode.
16944 So wait and don't do the selector expansion ourselves. */
16945 if (vmode != V8QImode && vmode != V16QImode)
16946 return false;
16947
16948 /* to_constant is safe since this routine is specific to Advanced SIMD
16949 vectors. */
16950 unsigned int nelt = d->perm.length ().to_constant ();
16951 for (unsigned int i = 0; i < nelt; ++i)
16952 /* If big-endian and two vectors we end up with a weird mixed-endian
16953 mode on NEON. Reverse the index within each word but not the word
16954 itself. to_constant is safe because we checked is_constant above. */
16955 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16956 ? d->perm[i].to_constant () ^ (nelt - 1)
16957 : d->perm[i].to_constant ());
16958
16959 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16960 sel = force_reg (vmode, sel);
16961
16962 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16963 return true;
16964 }
16965
16966 /* Try to implement D using an SVE TBL instruction. */
16967
16968 static bool
16969 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16970 {
16971 unsigned HOST_WIDE_INT nelt;
16972
16973 /* Permuting two variable-length vectors could overflow the
16974 index range. */
16975 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16976 return false;
16977
16978 if (d->testing_p)
16979 return true;
16980
16981 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16982 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16983 if (d->one_vector_p)
16984 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16985 else
16986 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16987 return true;
16988 }
16989
16990 static bool
16991 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16992 {
16993 /* The pattern matching functions above are written to look for a small
16994 number to begin the sequence (0, 1, N/2). If we begin with an index
16995 from the second operand, we can swap the operands. */
16996 poly_int64 nelt = d->perm.length ();
16997 if (known_ge (d->perm[0], nelt))
16998 {
16999 d->perm.rotate_inputs (1);
17000 std::swap (d->op0, d->op1);
17001 }
17002
17003 if ((d->vec_flags == VEC_ADVSIMD
17004 || d->vec_flags == VEC_SVE_DATA
17005 || d->vec_flags == VEC_SVE_PRED)
17006 && known_gt (nelt, 1))
17007 {
17008 if (aarch64_evpc_rev_local (d))
17009 return true;
17010 else if (aarch64_evpc_rev_global (d))
17011 return true;
17012 else if (aarch64_evpc_ext (d))
17013 return true;
17014 else if (aarch64_evpc_dup (d))
17015 return true;
17016 else if (aarch64_evpc_zip (d))
17017 return true;
17018 else if (aarch64_evpc_uzp (d))
17019 return true;
17020 else if (aarch64_evpc_trn (d))
17021 return true;
17022 if (d->vec_flags == VEC_SVE_DATA)
17023 return aarch64_evpc_sve_tbl (d);
17024 else if (d->vec_flags == VEC_ADVSIMD)
17025 return aarch64_evpc_tbl (d);
17026 }
17027 return false;
17028 }
17029
17030 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17031
17032 static bool
17033 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17034 rtx op1, const vec_perm_indices &sel)
17035 {
17036 struct expand_vec_perm_d d;
17037
17038 /* Check whether the mask can be applied to a single vector. */
17039 if (sel.ninputs () == 1
17040 || (op0 && rtx_equal_p (op0, op1)))
17041 d.one_vector_p = true;
17042 else if (sel.all_from_input_p (0))
17043 {
17044 d.one_vector_p = true;
17045 op1 = op0;
17046 }
17047 else if (sel.all_from_input_p (1))
17048 {
17049 d.one_vector_p = true;
17050 op0 = op1;
17051 }
17052 else
17053 d.one_vector_p = false;
17054
17055 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17056 sel.nelts_per_input ());
17057 d.vmode = vmode;
17058 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17059 d.target = target;
17060 d.op0 = op0;
17061 d.op1 = op1;
17062 d.testing_p = !target;
17063
17064 if (!d.testing_p)
17065 return aarch64_expand_vec_perm_const_1 (&d);
17066
17067 rtx_insn *last = get_last_insn ();
17068 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17069 gcc_assert (last == get_last_insn ());
17070
17071 return ret;
17072 }
17073
17074 /* Generate a byte permute mask for a register of mode MODE,
17075 which has NUNITS units. */
17076
17077 rtx
17078 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17079 {
17080 /* We have to reverse each vector because we don't have
17081 a permuted load that can reverse-load according to ABI rules. */
17082 rtx mask;
17083 rtvec v = rtvec_alloc (16);
17084 unsigned int i, j;
17085 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17086
17087 gcc_assert (BYTES_BIG_ENDIAN);
17088 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17089
17090 for (i = 0; i < nunits; i++)
17091 for (j = 0; j < usize; j++)
17092 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17093 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17094 return force_reg (V16QImode, mask);
17095 }
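/* For example, for V8HImode (eight 2-byte units) the generated byte mask is
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the bytes
   within each element are swapped while the element order is preserved.  */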
17096
17097 /* Return true if X is a valid second operand for the SVE instruction
17098 that implements integer comparison OP_CODE. */
17099
17100 static bool
17101 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17102 {
17103 if (register_operand (x, VOIDmode))
17104 return true;
17105
17106 switch (op_code)
17107 {
17108 case LTU:
17109 case LEU:
17110 case GEU:
17111 case GTU:
17112 return aarch64_sve_cmp_immediate_p (x, false);
17113 case LT:
17114 case LE:
17115 case GE:
17116 case GT:
17117 case NE:
17118 case EQ:
17119 return aarch64_sve_cmp_immediate_p (x, true);
17120 default:
17121 gcc_unreachable ();
17122 }
17123 }
17124
17125 /* Use predicated SVE instructions to implement the equivalent of:
17126
17127 (set TARGET OP)
17128
17129 given that PTRUE is an all-true predicate of the appropriate mode. */
17130
17131 static void
17132 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17133 {
17134 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17135 gen_rtvec (2, ptrue, op),
17136 UNSPEC_MERGE_PTRUE);
17137 rtx_insn *insn = emit_set_insn (target, unspec);
17138 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17139 }
17140
17141 /* Likewise, but also clobber the condition codes. */
17142
17143 static void
17144 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17145 {
17146 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17147 gen_rtvec (2, ptrue, op),
17148 UNSPEC_MERGE_PTRUE);
17149 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17150 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17151 }
17152
17153 /* Return the UNSPEC_COND_* code for comparison CODE. */
17154
17155 static unsigned int
17156 aarch64_unspec_cond_code (rtx_code code)
17157 {
17158 switch (code)
17159 {
17160 case NE:
17161 return UNSPEC_COND_FCMNE;
17162 case EQ:
17163 return UNSPEC_COND_FCMEQ;
17164 case LT:
17165 return UNSPEC_COND_FCMLT;
17166 case GT:
17167 return UNSPEC_COND_FCMGT;
17168 case LE:
17169 return UNSPEC_COND_FCMLE;
17170 case GE:
17171 return UNSPEC_COND_FCMGE;
17172 default:
17173 gcc_unreachable ();
17174 }
17175 }
17176
17177 /* Emit:
17178
17179 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17180
17181 where <X> is the operation associated with comparison CODE. This form
17182 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17183 semantics, such as when PRED might not be all-true and when comparing
17184 inactive lanes could have side effects. */
17185
17186 static void
17187 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17188 rtx pred, rtx op0, rtx op1)
17189 {
17190 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17191 gen_rtvec (3, pred, op0, op1),
17192 aarch64_unspec_cond_code (code));
17193 emit_set_insn (target, unspec);
17194 }
17195
17196 /* Expand an SVE integer comparison using the SVE equivalent of:
17197
17198 (set TARGET (CODE OP0 OP1)). */
17199
17200 void
17201 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17202 {
17203 machine_mode pred_mode = GET_MODE (target);
17204 machine_mode data_mode = GET_MODE (op0);
17205
17206 if (!aarch64_sve_cmp_operand_p (code, op1))
17207 op1 = force_reg (data_mode, op1);
17208
17209 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17210 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17211 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17212 }
17213
17214 /* Emit the SVE equivalent of:
17215
17216 (set TMP1 (CODE1 OP0 OP1))
17217 (set TMP2 (CODE2 OP0 OP1))
17218 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17219
17220 PTRUE is an all-true predicate with the same mode as TARGET. */
17221
17222 static void
17223 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17224 rtx ptrue, rtx op0, rtx op1)
17225 {
17226 machine_mode pred_mode = GET_MODE (ptrue);
17227 rtx tmp1 = gen_reg_rtx (pred_mode);
17228 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17229 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17230 rtx tmp2 = gen_reg_rtx (pred_mode);
17231 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17232 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17233 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17234 }
17235
17236 /* Emit the SVE equivalent of:
17237
17238 (set TMP (CODE OP0 OP1))
17239 (set TARGET (not TMP))
17240
17241 PTRUE is an all-true predicate with the same mode as TARGET. */
17242
17243 static void
17244 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17245 rtx op0, rtx op1)
17246 {
17247 machine_mode pred_mode = GET_MODE (ptrue);
17248 rtx tmp = gen_reg_rtx (pred_mode);
17249 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17250 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17251 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17252 }
17253
17254 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17255
17256 (set TARGET (CODE OP0 OP1))
17257
17258 If CAN_INVERT_P is true, the caller can also handle inverted results;
17259 return true if the result is in fact inverted. */
17260
17261 bool
17262 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17263 rtx op0, rtx op1, bool can_invert_p)
17264 {
17265 machine_mode pred_mode = GET_MODE (target);
17266 machine_mode data_mode = GET_MODE (op0);
17267
17268 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17269 switch (code)
17270 {
17271 case UNORDERED:
17272 /* UNORDERED has no immediate form. */
17273 op1 = force_reg (data_mode, op1);
17274 /* fall through */
17275 case LT:
17276 case LE:
17277 case GT:
17278 case GE:
17279 case EQ:
17280 case NE:
17281 {
17282 /* There is native support for the comparison. */
17283 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17284 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17285 return false;
17286 }
17287
17288 case LTGT:
17289 /* This is a trapping operation (LT or GT). */
17290 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17291 return false;
17292
17293 case UNEQ:
17294 if (!flag_trapping_math)
17295 {
17296 /* This would trap for signaling NaNs. */
17297 op1 = force_reg (data_mode, op1);
17298 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17299 return false;
17300 }
17301 /* fall through */
17302 case UNLT:
17303 case UNLE:
17304 case UNGT:
17305 case UNGE:
17306 if (flag_trapping_math)
17307 {
17308 /* Work out which elements are ordered. */
17309 rtx ordered = gen_reg_rtx (pred_mode);
17310 op1 = force_reg (data_mode, op1);
17311 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17312
17313 /* Test the opposite condition for the ordered elements,
17314 then invert the result. */
17315 if (code == UNEQ)
17316 code = NE;
17317 else
17318 code = reverse_condition_maybe_unordered (code);
17319 if (can_invert_p)
17320 {
17321 aarch64_emit_sve_predicated_cond (target, code,
17322 ordered, op0, op1);
17323 return true;
17324 }
17325 rtx tmp = gen_reg_rtx (pred_mode);
17326 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17327 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17328 return false;
17329 }
17330 break;
17331
17332 case ORDERED:
17333 /* ORDERED has no immediate form. */
17334 op1 = force_reg (data_mode, op1);
17335 break;
17336
17337 default:
17338 gcc_unreachable ();
17339 }
17340
17341 /* There is native support for the inverse comparison. */
17342 code = reverse_condition_maybe_unordered (code);
17343 if (can_invert_p)
17344 {
17345 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17346 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17347 return true;
17348 }
17349 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17350 return false;
17351 }
17352
17353 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17354 of the data being selected and CMP_MODE is the mode of the values being
17355 compared. */
17356
17357 void
17358 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17359 rtx *ops)
17360 {
17361 machine_mode pred_mode
17362 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17363 GET_MODE_SIZE (cmp_mode)).require ();
17364 rtx pred = gen_reg_rtx (pred_mode);
17365 if (FLOAT_MODE_P (cmp_mode))
17366 {
17367 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17368 ops[4], ops[5], true))
17369 std::swap (ops[1], ops[2]);
17370 }
17371 else
17372 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17373
17374 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17375 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17376 }
17377
17378 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17379 true. However, due to issues with register allocation, it is preferable
17380 to avoid tying integer scalar and FP scalar modes. Executing integer
17381 operations in general registers is better than treating them as scalar
17382 vector operations. This reduces latency and avoids redundant int<->FP
17383 moves. So tie modes if they are either the same class, or vector modes
17384 with other vector modes, vector structs or any scalar mode. */
17385
17386 static bool
17387 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17388 {
17389 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17390 return true;
17391
17392 /* We specifically want to allow elements of "structure" modes to
17393 be tieable to the structure. This more general condition allows
17394 other rarer situations too. The reason we don't extend this to
17395 predicate modes is that there are no predicate structure modes
17396 nor any specific instructions for extracting part of a predicate
17397 register. */
17398 if (aarch64_vector_data_mode_p (mode1)
17399 && aarch64_vector_data_mode_p (mode2))
17400 return true;
17401
17402 /* Also allow any scalar modes with vectors. */
17403 if (aarch64_vector_mode_supported_p (mode1)
17404 || aarch64_vector_mode_supported_p (mode2))
17405 return true;
17406
17407 return false;
17408 }
17409
17410 /* Return a new RTX holding the result of moving POINTER forward by
17411 AMOUNT bytes. */
17412
17413 static rtx
17414 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17415 {
17416 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17417
17418 return adjust_automodify_address (pointer, GET_MODE (pointer),
17419 next, amount);
17420 }
17421
17422 /* Return a new RTX holding the result of moving POINTER forward by the
17423 size of the mode it points to. */
17424
17425 static rtx
17426 aarch64_progress_pointer (rtx pointer)
17427 {
17428 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17429 }
17430
17431 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17432 the size of MODE in bytes. */
17433
17434 static void
17435 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17436 machine_mode mode)
17437 {
17438 rtx reg = gen_reg_rtx (mode);
17439
17440 /* "Cast" the pointers to the correct mode. */
17441 *src = adjust_address (*src, mode, 0);
17442 *dst = adjust_address (*dst, mode, 0);
17443 /* Emit the memcpy. */
17444 emit_move_insn (reg, *src);
17445 emit_move_insn (*dst, reg);
17446 /* Move the pointers forward. */
17447 *src = aarch64_progress_pointer (*src);
17448 *dst = aarch64_progress_pointer (*dst);
17449 }
17450
17451 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17452 we succeed, otherwise return false. */
17453
17454 bool
17455 aarch64_expand_cpymem (rtx *operands)
17456 {
17457 int n, mode_bits;
17458 rtx dst = operands[0];
17459 rtx src = operands[1];
17460 rtx base;
17461 machine_mode cur_mode = BLKmode, next_mode;
17462 bool speed_p = !optimize_function_for_size_p (cfun);
17463
17464 /* When optimizing for size, give a better estimate of the length of a
17465 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17466 will always require an even number of instructions. Each operation
17467 requires both a load and a store, so divide the maximum count by 2.
17468 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17469
17470 /* We can't do anything smart if the amount to copy is not constant. */
17471 if (!CONST_INT_P (operands[2]))
17472 return false;
17473
17474 n = INTVAL (operands[2]);
17475
17476 /* Try to keep the number of instructions low. For all cases we will do at
17477 most two moves for the residual amount, since we'll always overlap the
17478 remainder. */
17479 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17480 return false;
17481
17482 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17483 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17484
17485 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17486 src = adjust_automodify_address (src, VOIDmode, base, 0);
17487
17488 /* Convert n to bits to make the rest of the code simpler. */
17489 n = n * BITS_PER_UNIT;
17490
17491 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17492 larger than TImode, but we should not use them for loads/stores here. */
17493 const int copy_limit = GET_MODE_BITSIZE (TImode);
17494
17495 while (n > 0)
17496 {
17497 /* Find the largest mode in which to do the copy without over-reading
17498 or over-writing. */
17499 opt_scalar_int_mode mode_iter;
17500 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17501 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17502 cur_mode = mode_iter.require ();
17503
17504 gcc_assert (cur_mode != BLKmode);
17505
17506 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17507 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17508
17509 n -= mode_bits;
17510
17511 /* Do certain trailing copies as overlapping if it's going to be
17512 cheaper, i.e. fewer instructions. For instance, for a 15-byte
17513 copy it's more efficient to do two overlapping 8-byte copies than
17514 8 + 6 + 1. */
17515 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17516 {
17517 next_mode = smallest_mode_for_size (n, MODE_INT);
17518 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17519 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17520 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17521 n = n_bits;
17522 }
17523 }
17524
17525 return true;
17526 }
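/* To illustrate the overlapping tail handling: for a 15-byte copy the loop
   first emits an 8-byte (DImode) copy of bytes 0-7, then moves both pointers
   back by one byte so that a second 8-byte copy covers bytes 7-14, rather
   than emitting separate 4-, 2- and 1-byte copies for the tail.  */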
17527
17528 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17529 SImode stores. Handle the case when the constant has identical
17530 bottom and top halves. This is beneficial when the two stores can be
17531 merged into an STP and we avoid synthesising potentially expensive
17532 immediates twice. Return true if such a split is possible. */
17533
17534 bool
17535 aarch64_split_dimode_const_store (rtx dst, rtx src)
17536 {
17537 rtx lo = gen_lowpart (SImode, src);
17538 rtx hi = gen_highpart_mode (SImode, DImode, src);
17539
17540 bool size_p = optimize_function_for_size_p (cfun);
17541
17542 if (!rtx_equal_p (lo, hi))
17543 return false;
17544
17545 unsigned int orig_cost
17546 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17547 unsigned int lo_cost
17548 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17549
17550 /* We want to transform:
17551 MOV x1, 49370
17552 MOVK x1, 0x140, lsl 16
17553 MOVK x1, 0xc0da, lsl 32
17554 MOVK x1, 0x140, lsl 48
17555 STR x1, [x0]
17556 into:
17557 MOV w1, 49370
17558 MOVK w1, 0x140, lsl 16
17559 STP w1, w1, [x0]
17560 So we want to perform this only when we save two instructions
17561 or more. When optimizing for size, however, accept any code size
17562 savings we can. */
17563 if (size_p && orig_cost <= lo_cost)
17564 return false;
17565
17566 if (!size_p
17567 && (orig_cost <= lo_cost + 1))
17568 return false;
17569
17570 rtx mem_lo = adjust_address (dst, SImode, 0);
17571 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17572 return false;
17573
17574 rtx tmp_reg = gen_reg_rtx (SImode);
17575 aarch64_expand_mov_immediate (tmp_reg, lo);
17576 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17577 /* Don't emit an explicit store pair as this may not always be profitable.
17578 Let the sched-fusion logic decide whether to merge them. */
17579 emit_move_insn (mem_lo, tmp_reg);
17580 emit_move_insn (mem_hi, tmp_reg);
17581
17582 return true;
17583 }
17584
17585 /* Generate RTL for a conditional branch with rtx comparison CODE in
17586 mode CC_MODE. The destination of the unlikely conditional branch
17587 is LABEL_REF. */
17588
17589 void
17590 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17591 rtx label_ref)
17592 {
17593 rtx x;
17594 x = gen_rtx_fmt_ee (code, VOIDmode,
17595 gen_rtx_REG (cc_mode, CC_REGNUM),
17596 const0_rtx);
17597
17598 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17599 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17600 pc_rtx);
17601 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17602 }
17603
17604 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17605
17606 OP1 represents the TImode destination operand 1
17607 OP2 represents the TImode destination operand 2
17608 LOW_DEST represents the low half (DImode) of TImode operand 0
17609 LOW_IN1 represents the low half (DImode) of TImode operand 1
17610 LOW_IN2 represents the low half (DImode) of TImode operand 2
17611 HIGH_DEST represents the high half (DImode) of TImode operand 0
17612 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17613 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17614
17615 void
17616 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17617 rtx *low_in1, rtx *low_in2,
17618 rtx *high_dest, rtx *high_in1,
17619 rtx *high_in2)
17620 {
17621 *low_dest = gen_reg_rtx (DImode);
17622 *low_in1 = gen_lowpart (DImode, op1);
17623 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17624 subreg_lowpart_offset (DImode, TImode));
17625 *high_dest = gen_reg_rtx (DImode);
17626 *high_in1 = gen_highpart (DImode, op1);
17627 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17628 subreg_highpart_offset (DImode, TImode));
17629 }
17630
17631 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17632
17633 This function differs from 'aarch64_addti_scratch_regs' in that
17634 OP1 can be an immediate constant (zero). We must call
17635 subreg_highpart_offset with DImode and TImode arguments, otherwise
17636 VOIDmode will be used for the const_int, which generates an internal
17637 error from subreg_size_highpart_offset, which does not expect a size of zero.
17638
17639 OP1 represents the TImode destination operand 1
17640 OP2 represents the TImode destination operand 2
17641 LOW_DEST represents the low half (DImode) of TImode operand 0
17642 LOW_IN1 represents the low half (DImode) of TImode operand 1
17643 LOW_IN2 represents the low half (DImode) of TImode operand 2
17644 HIGH_DEST represents the high half (DImode) of TImode operand 0
17645 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17646 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17647
17648
17649 void
17650 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17651 rtx *low_in1, rtx *low_in2,
17652 rtx *high_dest, rtx *high_in1,
17653 rtx *high_in2)
17654 {
17655 *low_dest = gen_reg_rtx (DImode);
17656 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17657 subreg_lowpart_offset (DImode, TImode));
17658
17659 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17660 subreg_lowpart_offset (DImode, TImode));
17661 *high_dest = gen_reg_rtx (DImode);
17662
17663 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17664 subreg_highpart_offset (DImode, TImode));
17665 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17666 subreg_highpart_offset (DImode, TImode));
17667 }
17668
17669 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17670
17671 OP0 represents the TImode destination operand 0
17672 LOW_DEST represents the low half (DImode) of TImode operand 0
17673 LOW_IN1 represents the low half (DImode) of TImode operand 1
17674 LOW_IN2 represents the low half (DImode) of TImode operand 2
17675 HIGH_DEST represents the high half (DImode) of TImode operand 0
17676 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17677 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17678 UNSIGNED_P is true if the operation is being performed on unsigned
17679 values. */
17680 void
17681 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17682 rtx low_in2, rtx high_dest, rtx high_in1,
17683 rtx high_in2, bool unsigned_p)
17684 {
17685 if (low_in2 == const0_rtx)
17686 {
17687 low_dest = low_in1;
17688 high_in2 = force_reg (DImode, high_in2);
17689 if (unsigned_p)
17690 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17691 else
17692 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17693 }
17694 else
17695 {
17696 if (CONST_INT_P (low_in2))
17697 {
17698 high_in2 = force_reg (DImode, high_in2);
17699 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17700 GEN_INT (-INTVAL (low_in2))));
17701 }
17702 else
17703 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17704
17705 if (unsigned_p)
17706 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17707 else
17708 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17709 }
17710
17711 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17712 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17713
17714 }
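/* As an illustrative sketch (not taken from the surrounding sources), the
   non-constant path above corresponds to an instruction sequence along the
   lines of:

     subs  x_lo, x_lo1, x_lo2   // low halves, setting the carry/borrow flag
     sbcs  x_hi, x_hi1, x_hi2   // high halves, consuming the borrow

   with the overflow condition then tested via the V flag (signed) or the
   C flag (unsigned).  */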
17715
17716 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17717
17718 static unsigned HOST_WIDE_INT
17719 aarch64_asan_shadow_offset (void)
17720 {
17721 if (TARGET_ILP32)
17722 return (HOST_WIDE_INT_1 << 29);
17723 else
17724 return (HOST_WIDE_INT_1 << 36);
17725 }
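/* For reference (a sketch using ASan's usual shadow mapping, not anything
   defined in this file): a memory address is translated to its shadow
   address as

     shadow = (addr >> 3) + aarch64_asan_shadow_offset ()

   so the offsets above place the shadow region at 1 << 29 for ILP32 and
   1 << 36 for LP64.  */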
17726
17727 static rtx
17728 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17729 int code, tree treeop0, tree treeop1)
17730 {
17731 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17732 rtx op0, op1;
17733 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17734 insn_code icode;
17735 struct expand_operand ops[4];
17736
17737 start_sequence ();
17738 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17739
17740 op_mode = GET_MODE (op0);
17741 if (op_mode == VOIDmode)
17742 op_mode = GET_MODE (op1);
17743
17744 switch (op_mode)
17745 {
17746 case E_QImode:
17747 case E_HImode:
17748 case E_SImode:
17749 cmp_mode = SImode;
17750 icode = CODE_FOR_cmpsi;
17751 break;
17752
17753 case E_DImode:
17754 cmp_mode = DImode;
17755 icode = CODE_FOR_cmpdi;
17756 break;
17757
17758 case E_SFmode:
17759 cmp_mode = SFmode;
17760 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17761 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17762 break;
17763
17764 case E_DFmode:
17765 cmp_mode = DFmode;
17766 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17767 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17768 break;
17769
17770 default:
17771 end_sequence ();
17772 return NULL_RTX;
17773 }
17774
17775 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17776 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17777 if (!op0 || !op1)
17778 {
17779 end_sequence ();
17780 return NULL_RTX;
17781 }
17782 *prep_seq = get_insns ();
17783 end_sequence ();
17784
17785 create_fixed_operand (&ops[0], op0);
17786 create_fixed_operand (&ops[1], op1);
17787
17788 start_sequence ();
17789 if (!maybe_expand_insn (icode, 2, ops))
17790 {
17791 end_sequence ();
17792 return NULL_RTX;
17793 }
17794 *gen_seq = get_insns ();
17795 end_sequence ();
17796
17797 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17798 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17799 }
17800
17801 static rtx
17802 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17803 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17804 {
17805 rtx op0, op1, target;
17806 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17807 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17808 insn_code icode;
17809 struct expand_operand ops[6];
17810 int aarch64_cond;
17811
17812 push_to_sequence (*prep_seq);
17813 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17814
17815 op_mode = GET_MODE (op0);
17816 if (op_mode == VOIDmode)
17817 op_mode = GET_MODE (op1);
17818
17819 switch (op_mode)
17820 {
17821 case E_QImode:
17822 case E_HImode:
17823 case E_SImode:
17824 cmp_mode = SImode;
17825 icode = CODE_FOR_ccmpsi;
17826 break;
17827
17828 case E_DImode:
17829 cmp_mode = DImode;
17830 icode = CODE_FOR_ccmpdi;
17831 break;
17832
17833 case E_SFmode:
17834 cmp_mode = SFmode;
17835 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17836 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17837 break;
17838
17839 case E_DFmode:
17840 cmp_mode = DFmode;
17841 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17842 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17843 break;
17844
17845 default:
17846 end_sequence ();
17847 return NULL_RTX;
17848 }
17849
17850 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17851 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17852 if (!op0 || !op1)
17853 {
17854 end_sequence ();
17855 return NULL_RTX;
17856 }
17857 *prep_seq = get_insns ();
17858 end_sequence ();
17859
17860 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17861 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17862
17863 if (bit_code != AND)
17864 {
17865 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17866 GET_MODE (XEXP (prev, 0))),
17867 VOIDmode, XEXP (prev, 0), const0_rtx);
17868 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17869 }
17870
17871 create_fixed_operand (&ops[0], XEXP (prev, 0));
17872 create_fixed_operand (&ops[1], target);
17873 create_fixed_operand (&ops[2], op0);
17874 create_fixed_operand (&ops[3], op1);
17875 create_fixed_operand (&ops[4], prev);
17876 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17877
17878 push_to_sequence (*gen_seq);
17879 if (!maybe_expand_insn (icode, 6, ops))
17880 {
17881 end_sequence ();
17882 return NULL_RTX;
17883 }
17884
17885 *gen_seq = get_insns ();
17886 end_sequence ();
17887
17888 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17889 }
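/* Illustrative only: for a source condition such as "a == 0 && b > 5" the
   two hooks above cooperate to emit a compare followed by a conditional
   compare, roughly:

     cmp   w0, #0              // first comparison sets the flags
     ccmp  w1, #5, #4, eq      // if EQ, compare b with 5; else NZCV := 0b0100
     b.gt  .Ltaken             // branch when both conditions hold

   where the #4 immediate (Z set) is one choice that makes the final GT test
   fail whenever the first comparison does.  */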
17890
17891 #undef TARGET_GEN_CCMP_FIRST
17892 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17893
17894 #undef TARGET_GEN_CCMP_NEXT
17895 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17896
17897 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17898 instruction fusion of some sort. */
17899
17900 static bool
17901 aarch64_macro_fusion_p (void)
17902 {
17903 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17904 }
17905
17906
17907 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17908 should be kept together during scheduling. */
17909
17910 static bool
17911 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17912 {
17913 rtx set_dest;
17914 rtx prev_set = single_set (prev);
17915 rtx curr_set = single_set (curr);
17916 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17917 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17918
17919 if (!aarch64_macro_fusion_p ())
17920 return false;
17921
17922 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17923 {
17924 /* We are trying to match:
17925 prev (mov) == (set (reg r0) (const_int imm16))
17926 curr (movk) == (set (zero_extract (reg r0)
17927 (const_int 16)
17928 (const_int 16))
17929 (const_int imm16_1)) */
17930
17931 set_dest = SET_DEST (curr_set);
17932
17933 if (GET_CODE (set_dest) == ZERO_EXTRACT
17934 && CONST_INT_P (SET_SRC (curr_set))
17935 && CONST_INT_P (SET_SRC (prev_set))
17936 && CONST_INT_P (XEXP (set_dest, 2))
17937 && INTVAL (XEXP (set_dest, 2)) == 16
17938 && REG_P (XEXP (set_dest, 0))
17939 && REG_P (SET_DEST (prev_set))
17940 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17941 {
17942 return true;
17943 }
17944 }
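  /* As a concrete (illustrative) example, the MOV/MOVK pair matched above
     corresponds to assembly such as:

       mov  x0, #0x1234
       movk x0, #0x5678, lsl #16  */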
17945
17946 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17947 {
17948
17949 /* We're trying to match:
17950 prev (adrp) == (set (reg r1)
17951 (high (symbol_ref ("SYM"))))
17952 curr (add) == (set (reg r0)
17953 (lo_sum (reg r1)
17954 (symbol_ref ("SYM"))))
17955 Note that r0 need not necessarily be the same as r1, especially
17956 during pre-regalloc scheduling. */
17957
17958 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17959 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17960 {
17961 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17962 && REG_P (XEXP (SET_SRC (curr_set), 0))
17963 && REGNO (XEXP (SET_SRC (curr_set), 0))
17964 == REGNO (SET_DEST (prev_set))
17965 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17966 XEXP (SET_SRC (curr_set), 1)))
17967 return true;
17968 }
17969 }
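  /* For illustration, the ADRP/ADD pair matched above is the usual
     small-code-model address materialisation:

       adrp x1, sym
       add  x0, x1, :lo12:sym  */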
17970
17971 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17972 {
17973
17974 /* We're trying to match:
17975 prev (movk) == (set (zero_extract (reg r0)
17976 (const_int 16)
17977 (const_int 32))
17978 (const_int imm16_1))
17979 curr (movk) == (set (zero_extract (reg r0)
17980 (const_int 16)
17981 (const_int 48))
17982 (const_int imm16_2)) */
17983
17984 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17985 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17986 && REG_P (XEXP (SET_DEST (prev_set), 0))
17987 && REG_P (XEXP (SET_DEST (curr_set), 0))
17988 && REGNO (XEXP (SET_DEST (prev_set), 0))
17989 == REGNO (XEXP (SET_DEST (curr_set), 0))
17990 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17991 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17992 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17993 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17994 && CONST_INT_P (SET_SRC (prev_set))
17995 && CONST_INT_P (SET_SRC (curr_set)))
17996 return true;
17997
17998 }
17999 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18000 {
18001 /* We're trying to match:
18002 prev (adrp) == (set (reg r0)
18003 (high (symbol_ref ("SYM"))))
18004 curr (ldr) == (set (reg r1)
18005 (mem (lo_sum (reg r0)
18006 (symbol_ref ("SYM")))))
18007 or
18008 curr (ldr) == (set (reg r1)
18009 (zero_extend (mem
18010 (lo_sum (reg r0)
18011 (symbol_ref ("SYM")))))) */
18012 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18013 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18014 {
18015 rtx curr_src = SET_SRC (curr_set);
18016
18017 if (GET_CODE (curr_src) == ZERO_EXTEND)
18018 curr_src = XEXP (curr_src, 0);
18019
18020 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18021 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18022 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18023 == REGNO (SET_DEST (prev_set))
18024 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18025 XEXP (SET_SRC (prev_set), 0)))
18026 return true;
18027 }
18028 }
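  /* Illustrative example of the ADRP/LDR pair matched above:

       adrp x0, sym
       ldr  x1, [x0, #:lo12:sym]  */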
18029
18030 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18031 && any_condjump_p (curr))
18032 {
18033 unsigned int condreg1, condreg2;
18034 rtx cc_reg_1;
18035 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18036 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18037
18038 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18039 && prev
18040 && modified_in_p (cc_reg_1, prev))
18041 {
18042 enum attr_type prev_type = get_attr_type (prev);
18043
18044 /* FIXME: this misses some instructions which are considered simple
18045 arithmetic for ThunderX. Simple shifts are missed here. */
18046 if (prev_type == TYPE_ALUS_SREG
18047 || prev_type == TYPE_ALUS_IMM
18048 || prev_type == TYPE_LOGICS_REG
18049 || prev_type == TYPE_LOGICS_IMM)
18050 return true;
18051 }
18052 }
18053
18054 if (prev_set
18055 && curr_set
18056 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18057 && any_condjump_p (curr))
18058 {
18059 /* We're trying to match:
18060 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18061 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18062 (const_int 0))
18063 (label_ref ("SYM"))
18064 (pc)) */
18065 if (SET_DEST (curr_set) == (pc_rtx)
18066 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18067 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18068 && REG_P (SET_DEST (prev_set))
18069 && REGNO (SET_DEST (prev_set))
18070 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18071 {
18072 /* Fuse ALU operations followed by conditional branch instruction. */
18073 switch (get_attr_type (prev))
18074 {
18075 case TYPE_ALU_IMM:
18076 case TYPE_ALU_SREG:
18077 case TYPE_ADC_REG:
18078 case TYPE_ADC_IMM:
18079 case TYPE_ADCS_REG:
18080 case TYPE_ADCS_IMM:
18081 case TYPE_LOGIC_REG:
18082 case TYPE_LOGIC_IMM:
18083 case TYPE_CSEL:
18084 case TYPE_ADR:
18085 case TYPE_MOV_IMM:
18086 case TYPE_SHIFT_REG:
18087 case TYPE_SHIFT_IMM:
18088 case TYPE_BFM:
18089 case TYPE_RBIT:
18090 case TYPE_REV:
18091 case TYPE_EXTEND:
18092 return true;
18093
18094 default:;
18095 }
18096 }
18097 }
18098
18099 return false;
18100 }
18101
18102 /* Return true iff the instruction fusion described by OP is enabled. */
18103
18104 bool
18105 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18106 {
18107 return (aarch64_tune_params.fusible_ops & op) != 0;
18108 }
18109
18110 /* If the address of MEM is in the form of [base+offset], extract the two
18111 parts into BASE and OFFSET and return true; otherwise clear BASE and
18112 OFFSET and return false. */
18113
18114 bool
18115 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18116 {
18117 rtx addr;
18118
18119 gcc_assert (MEM_P (mem));
18120
18121 addr = XEXP (mem, 0);
18122
18123 if (REG_P (addr))
18124 {
18125 *base = addr;
18126 *offset = const0_rtx;
18127 return true;
18128 }
18129
18130 if (GET_CODE (addr) == PLUS
18131 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18132 {
18133 *base = XEXP (addr, 0);
18134 *offset = XEXP (addr, 1);
18135 return true;
18136 }
18137
18138 *base = NULL_RTX;
18139 *offset = NULL_RTX;
18140
18141 return false;
18142 }
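/* For illustration (not from the original comments): given
   (mem:DI (plus:DI (reg:DI x1) (const_int 16))) the function above sets
   BASE to (reg:DI x1) and OFFSET to (const_int 16), while a bare
   (mem:DI (reg:DI x1)) yields the same base with a zero offset.  */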
18143
18144 /* Types for scheduling fusion. */
18145 enum sched_fusion_type
18146 {
18147 SCHED_FUSION_NONE = 0,
18148 SCHED_FUSION_LD_SIGN_EXTEND,
18149 SCHED_FUSION_LD_ZERO_EXTEND,
18150 SCHED_FUSION_LD,
18151 SCHED_FUSION_ST,
18152 SCHED_FUSION_NUM
18153 };
18154
18155 /* If INSN is a load or store whose address is in the form of [base+offset],
18156 extract the two parts into BASE and OFFSET. Return the scheduling
18157 fusion type of this INSN. */
18158
18159 static enum sched_fusion_type
18160 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18161 {
18162 rtx x, dest, src;
18163 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18164
18165 gcc_assert (INSN_P (insn));
18166 x = PATTERN (insn);
18167 if (GET_CODE (x) != SET)
18168 return SCHED_FUSION_NONE;
18169
18170 src = SET_SRC (x);
18171 dest = SET_DEST (x);
18172
18173 machine_mode dest_mode = GET_MODE (dest);
18174
18175 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18176 return SCHED_FUSION_NONE;
18177
18178 if (GET_CODE (src) == SIGN_EXTEND)
18179 {
18180 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18181 src = XEXP (src, 0);
18182 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18183 return SCHED_FUSION_NONE;
18184 }
18185 else if (GET_CODE (src) == ZERO_EXTEND)
18186 {
18187 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18188 src = XEXP (src, 0);
18189 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18190 return SCHED_FUSION_NONE;
18191 }
18192
18193 if (GET_CODE (src) == MEM && REG_P (dest))
18194 extract_base_offset_in_addr (src, base, offset);
18195 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18196 {
18197 fusion = SCHED_FUSION_ST;
18198 extract_base_offset_in_addr (dest, base, offset);
18199 }
18200 else
18201 return SCHED_FUSION_NONE;
18202
18203 if (*base == NULL_RTX || *offset == NULL_RTX)
18204 fusion = SCHED_FUSION_NONE;
18205
18206 return fusion;
18207 }
18208
18209 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18210
18211 Currently we only support fusing ldr or str instructions, so FUSION_PRI
18212 and PRI are only calculated for these instructions. For other instructions,
18213 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18214 types of instruction fusion can be added by returning different priorities.
18215
18216 It's important that irrelevant instructions get the largest FUSION_PRI. */
18217
18218 static void
18219 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18220 int *fusion_pri, int *pri)
18221 {
18222 int tmp, off_val;
18223 rtx base, offset;
18224 enum sched_fusion_type fusion;
18225
18226 gcc_assert (INSN_P (insn));
18227
18228 tmp = max_pri - 1;
18229 fusion = fusion_load_store (insn, &base, &offset);
18230 if (fusion == SCHED_FUSION_NONE)
18231 {
18232 *pri = tmp;
18233 *fusion_pri = tmp;
18234 return;
18235 }
18236
18237 /* Set FUSION_PRI according to fusion type and base register. */
18238 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18239
18240 /* Calculate PRI. */
18241 tmp /= 2;
18242
18243 /* INSN with smaller offset goes first. */
18244 off_val = (int)(INTVAL (offset));
18245 if (off_val >= 0)
18246 tmp -= (off_val & 0xfffff);
18247 else
18248 tmp += ((- off_val) & 0xfffff);
18249
18250 *pri = tmp;
18251 return;
18252 }
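/* A worked example (with a hypothetical MAX_PRI of 10000): two SImode loads
   from [x1, #4] and [x1, #8] both map to SCHED_FUSION_LD with the same base
   register, so they receive identical FUSION_PRI values; their PRI values
   are 4999 - 4 and 4999 - 8 respectively, so the load with the smaller
   offset is preferred first, as intended.  */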
18253
18254 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18255 Adjust priority of sha1h instructions so they are scheduled before
18256 other SHA1 instructions. */
18257
18258 static int
18259 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18260 {
18261 rtx x = PATTERN (insn);
18262
18263 if (GET_CODE (x) == SET)
18264 {
18265 x = SET_SRC (x);
18266
18267 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18268 return priority + 10;
18269 }
18270
18271 return priority;
18272 }
18273
18274 /* Given OPERANDS of consecutive load/store, check if we can merge
18275 them into ldp/stp. LOAD is true if they are load instructions.
18276 MODE is the mode of memory operands. */
18277
18278 bool
18279 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18280 machine_mode mode)
18281 {
18282 HOST_WIDE_INT offval_1, offval_2, msize;
18283 enum reg_class rclass_1, rclass_2;
18284 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18285
18286 if (load)
18287 {
18288 mem_1 = operands[1];
18289 mem_2 = operands[3];
18290 reg_1 = operands[0];
18291 reg_2 = operands[2];
18292 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18293 if (REGNO (reg_1) == REGNO (reg_2))
18294 return false;
18295 }
18296 else
18297 {
18298 mem_1 = operands[0];
18299 mem_2 = operands[2];
18300 reg_1 = operands[1];
18301 reg_2 = operands[3];
18302 }
18303
18304 /* The mems cannot be volatile. */
18305 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18306 return false;
18307
18308 /* If we have SImode and slow unaligned ldp,
18309 check that the alignment is at least 8 bytes. */
18310 if (mode == SImode
18311 && (aarch64_tune_params.extra_tuning_flags
18312 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18313 && !optimize_size
18314 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18315 return false;
18316
18317 /* Check if the addresses are in the form of [base+offset]. */
18318 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18319 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18320 return false;
18321 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18322 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18323 return false;
18324
18325 /* Check if the bases are the same. */
18326 if (!rtx_equal_p (base_1, base_2))
18327 return false;
18328
18329 /* The operands must be of the same size. */
18330 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18331 GET_MODE_SIZE (GET_MODE (mem_2))));
18332
18333 offval_1 = INTVAL (offset_1);
18334 offval_2 = INTVAL (offset_2);
18335 /* We should only be trying this for fixed-sized modes. There is no
18336 SVE LDP/STP instruction. */
18337 msize = GET_MODE_SIZE (mode).to_constant ();
18338 /* Check if the offsets are consecutive. */
18339 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18340 return false;
18341
18342 /* Check if the addresses are clobbered by load. */
18343 if (load)
18344 {
18345 if (reg_mentioned_p (reg_1, mem_1))
18346 return false;
18347
18348 /* In increasing order, the last load can clobber the address. */
18349 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18350 return false;
18351 }
18352
18353 /* One of the memory accesses must be a mempair operand.
18354 If it is not the first one, they need to be swapped by the
18355 peephole. */
18356 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18357 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18358 return false;
18359
18360 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18361 rclass_1 = FP_REGS;
18362 else
18363 rclass_1 = GENERAL_REGS;
18364
18365 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18366 rclass_2 = FP_REGS;
18367 else
18368 rclass_2 = GENERAL_REGS;
18369
18370 /* Check if the registers are of the same class. */
18371 if (rclass_1 != rclass_2)
18372 return false;
18373
18374 return true;
18375 }
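/* For illustration, a pair that passes the checks above, such as

     ldr  x0, [x2]
     ldr  x1, [x2, #8]

   can be rewritten by the ldp/stp peepholes as

     ldp  x0, x1, [x2]  */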
18376
18377 /* Given OPERANDS of consecutive load/store that can be merged,
18378 swap them if they are not in ascending order. */
18379 void
18380 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18381 {
18382 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18383 HOST_WIDE_INT offval_1, offval_2;
18384
18385 if (load)
18386 {
18387 mem_1 = operands[1];
18388 mem_2 = operands[3];
18389 }
18390 else
18391 {
18392 mem_1 = operands[0];
18393 mem_2 = operands[2];
18394 }
18395
18396 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18397 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18398
18399 offval_1 = INTVAL (offset_1);
18400 offval_2 = INTVAL (offset_2);
18401
18402 if (offval_1 > offval_2)
18403 {
18404 /* Irrespective of whether this is a load or a store,
18405 we do the same swap. */
18406 std::swap (operands[0], operands[2]);
18407 std::swap (operands[1], operands[3]);
18408 }
18409 }
18410
18411 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18412 comparison between the two. */
18413 int
18414 aarch64_host_wide_int_compare (const void *x, const void *y)
18415 {
18416 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18417 * ((const HOST_WIDE_INT *) y));
18418 }
18419
18420 /* Taking X and Y to be pairs of RTX, each with one element pointing to a
18421 MEM rtx and the other to a REG rtx, compare the offsets of the two
18422 MEM addresses.
18423
18424 Return:
18425
18426 1 iff offset (X) > offset (Y)
18427 0 iff offset (X) == offset (Y)
18428 -1 iff offset (X) < offset (Y) */
18429 int
18430 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18431 {
18432 const rtx * operands_1 = (const rtx *) x;
18433 const rtx * operands_2 = (const rtx *) y;
18434 rtx mem_1, mem_2, base, offset_1, offset_2;
18435
18436 if (MEM_P (operands_1[0]))
18437 mem_1 = operands_1[0];
18438 else
18439 mem_1 = operands_1[1];
18440
18441 if (MEM_P (operands_2[0]))
18442 mem_2 = operands_2[0];
18443 else
18444 mem_2 = operands_2[1];
18445
18446 /* Extract the offsets. */
18447 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18448 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18449
18450 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18451
18452 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18453 }
18454
18455 /* Given OPERANDS of consecutive load/store, check if we can merge
18456 them into ldp/stp by adjusting the offset. LOAD is true if they
18457 are load instructions. MODE is the mode of memory operands.
18458
18459 Given below consecutive stores:
18460
18461 str w1, [xb, 0x100]
18462 str w1, [xb, 0x104]
18463 str w1, [xb, 0x108]
18464 str w1, [xb, 0x10c]
18465
18466 Though the offsets are out of the range supported by stp, we can
18467 still pair them after adjusting the offset, like:
18468
18469 add scratch, xb, 0x100
18470 stp w1, w1, [scratch]
18471 stp w1, w1, [scratch, 0x8]
18472
18473 The peephole patterns detecting this opportunity should guarantee
18474 the scratch register is available. */
18475
18476 bool
18477 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18478 scalar_mode mode)
18479 {
18480 const int num_insns = 4;
18481 enum reg_class rclass;
18482 HOST_WIDE_INT offvals[num_insns], msize;
18483 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18484
18485 if (load)
18486 {
18487 for (int i = 0; i < num_insns; i++)
18488 {
18489 reg[i] = operands[2 * i];
18490 mem[i] = operands[2 * i + 1];
18491
18492 gcc_assert (REG_P (reg[i]));
18493 }
18494
18495 /* Do not attempt to merge the loads if the loads clobber each other. */
18496 for (int i = 0; i < 8; i += 2)
18497 for (int j = i + 2; j < 8; j += 2)
18498 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18499 return false;
18500 }
18501 else
18502 for (int i = 0; i < num_insns; i++)
18503 {
18504 mem[i] = operands[2 * i];
18505 reg[i] = operands[2 * i + 1];
18506 }
18507
18508 /* Skip if memory operand is by itself valid for ldp/stp. */
18509 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18510 return false;
18511
18512 for (int i = 0; i < num_insns; i++)
18513 {
18514 /* The mems cannot be volatile. */
18515 if (MEM_VOLATILE_P (mem[i]))
18516 return false;
18517
18518 /* Check if the addresses are in the form of [base+offset]. */
18519 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18520 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18521 return false;
18522 }
18523
18524 /* Check if the registers are of the same class. */
18525 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18526 ? FP_REGS : GENERAL_REGS;
18527
18528 for (int i = 1; i < num_insns; i++)
18529 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18530 {
18531 if (rclass != FP_REGS)
18532 return false;
18533 }
18534 else
18535 {
18536 if (rclass != GENERAL_REGS)
18537 return false;
18538 }
18539
18540 /* Only the last register in the order in which they occur
18541 may be clobbered by the load. */
18542 if (rclass == GENERAL_REGS && load)
18543 for (int i = 0; i < num_insns - 1; i++)
18544 if (reg_mentioned_p (reg[i], mem[i]))
18545 return false;
18546
18547 /* Check if the bases are the same. */
18548 for (int i = 0; i < num_insns - 1; i++)
18549 if (!rtx_equal_p (base[i], base[i + 1]))
18550 return false;
18551
18552 for (int i = 0; i < num_insns; i++)
18553 offvals[i] = INTVAL (offset[i]);
18554
18555 msize = GET_MODE_SIZE (mode);
18556
18557 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18558 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18559 aarch64_host_wide_int_compare);
18560
18561 if (!(offvals[1] == offvals[0] + msize
18562 && offvals[3] == offvals[2] + msize))
18563 return false;
18564
18565 /* Check that offsets are within range of each other. The ldp/stp
18566 instructions have 7 bit immediate offsets, so use 0x80. */
18567 if (offvals[2] - offvals[0] >= msize * 0x80)
18568 return false;
18569
18570 /* The offsets must be aligned with respect to each other. */
18571 if (offvals[0] % msize != offvals[2] % msize)
18572 return false;
18573
18574 /* If we have SImode and slow unaligned ldp,
18575 check that the alignment is at least 8 bytes. */
18576 if (mode == SImode
18577 && (aarch64_tune_params.extra_tuning_flags
18578 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18579 && !optimize_size
18580 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18581 return false;
18582
18583 return true;
18584 }
18585
18586 /* Given OPERANDS of consecutive load/store, this function pairs them
18587 into LDP/STP after adjusting the offset. It depends on the fact
18588 that the operands can be sorted so the offsets are correct for STP.
18589 MODE is the mode of memory operands. CODE is the rtl operator
18590 which should be applied to all memory operands, it's SIGN_EXTEND,
18591 ZERO_EXTEND or UNKNOWN. */
18592
18593 bool
18594 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18595 scalar_mode mode, RTX_CODE code)
18596 {
18597 rtx base, offset_1, offset_3, t1, t2;
18598 rtx mem_1, mem_2, mem_3, mem_4;
18599 rtx temp_operands[8];
18600 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18601 stp_off_upper_limit, stp_off_lower_limit, msize;
18602
18603 /* We make changes on a copy as we may still bail out. */
18604 for (int i = 0; i < 8; i ++)
18605 temp_operands[i] = operands[i];
18606
18607 /* Sort the operands. */
18608 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18609
18610 /* Copy the memory operands so that if we have to bail for some
18611 reason the original addresses are unchanged. */
18612 if (load)
18613 {
18614 mem_1 = copy_rtx (temp_operands[1]);
18615 mem_2 = copy_rtx (temp_operands[3]);
18616 mem_3 = copy_rtx (temp_operands[5]);
18617 mem_4 = copy_rtx (temp_operands[7]);
18618 }
18619 else
18620 {
18621 mem_1 = copy_rtx (temp_operands[0]);
18622 mem_2 = copy_rtx (temp_operands[2]);
18623 mem_3 = copy_rtx (temp_operands[4]);
18624 mem_4 = copy_rtx (temp_operands[6]);
18625 gcc_assert (code == UNKNOWN);
18626 }
18627
18628 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18629 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18630 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18631 && offset_3 != NULL_RTX);
18632
18633 /* Adjust offset so it can fit in LDP/STP instruction. */
18634 msize = GET_MODE_SIZE (mode);
18635 stp_off_upper_limit = msize * (0x40 - 1);
18636 stp_off_lower_limit = - msize * 0x40;
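  /* For example, with msize == 4 (SImode) this gives an addressable LDP/STP
     range of [-256, 252], i.e. a scaled 7-bit signed immediate.  */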
18637
18638 off_val_1 = INTVAL (offset_1);
18639 off_val_3 = INTVAL (offset_3);
18640
18641 /* The base offset is optimally halfway between the two STP/LDP offsets. */
18642 if (msize <= 4)
18643 base_off = (off_val_1 + off_val_3) / 2;
18644 else
18645 /* However, due to issues with negative LDP/STP offset generation for
18646 larger modes (DF, DI and vector modes), we must not use negative
18647 addresses smaller than 9 signed unadjusted bits can store. This
18648 provides the most range in this case. */
18649 base_off = off_val_1;
18650
18651 /* Adjust the base so that it is aligned with the addresses but still
18652 optimal. */
18653 if (base_off % msize != off_val_1 % msize)
18654 /* Fix the offset, bearing in mind we want to make it bigger not
18655 smaller. */
18656 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18657 else if (msize <= 4)
18658 /* The negative range of LDP/STP is one larger than the positive range. */
18659 base_off += msize;
18660
18661 /* Check if base offset is too big or too small. We can attempt to resolve
18662 this issue by setting it to the maximum value and seeing if the offsets
18663 still fit. */
18664 if (base_off >= 0x1000)
18665 {
18666 base_off = 0x1000 - 1;
18667 /* We must still make sure that the base offset is aligned with respect
18668 to the address. But it may not be made any bigger. */
18669 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18670 }
18671
18672 /* Likewise for the case where the base is too small. */
18673 if (base_off <= -0x1000)
18674 {
18675 base_off = -0x1000 + 1;
18676 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18677 }
18678
18679 /* Offset of the first STP/LDP. */
18680 new_off_1 = off_val_1 - base_off;
18681
18682 /* Offset of the second STP/LDP. */
18683 new_off_3 = off_val_3 - base_off;
18684
18685 /* The offsets must be within the range of the LDP/STP instructions. */
18686 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18687 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18688 return false;
18689
18690 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18691 new_off_1), true);
18692 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18693 new_off_1 + msize), true);
18694 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18695 new_off_3), true);
18696 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18697 new_off_3 + msize), true);
18698
18699 if (!aarch64_mem_pair_operand (mem_1, mode)
18700 || !aarch64_mem_pair_operand (mem_3, mode))
18701 return false;
18702
18703 if (code == ZERO_EXTEND)
18704 {
18705 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18706 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18707 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18708 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18709 }
18710 else if (code == SIGN_EXTEND)
18711 {
18712 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18713 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18714 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18715 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18716 }
18717
18718 if (load)
18719 {
18720 operands[0] = temp_operands[0];
18721 operands[1] = mem_1;
18722 operands[2] = temp_operands[2];
18723 operands[3] = mem_2;
18724 operands[4] = temp_operands[4];
18725 operands[5] = mem_3;
18726 operands[6] = temp_operands[6];
18727 operands[7] = mem_4;
18728 }
18729 else
18730 {
18731 operands[0] = mem_1;
18732 operands[1] = temp_operands[1];
18733 operands[2] = mem_2;
18734 operands[3] = temp_operands[3];
18735 operands[4] = mem_3;
18736 operands[5] = temp_operands[5];
18737 operands[6] = mem_4;
18738 operands[7] = temp_operands[7];
18739 }
18740
18741 /* Emit adjusting instruction. */
18742 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18743 /* Emit ldp/stp instructions. */
18744 t1 = gen_rtx_SET (operands[0], operands[1]);
18745 t2 = gen_rtx_SET (operands[2], operands[3]);
18746 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18747 t1 = gen_rtx_SET (operands[4], operands[5]);
18748 t2 = gen_rtx_SET (operands[6], operands[7]);
18749 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18750 return true;
18751 }
18752
18753 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18754 it isn't worth branching around empty masked ops (including masked
18755 stores). */
18756
18757 static bool
18758 aarch64_empty_mask_is_expensive (unsigned)
18759 {
18760 return false;
18761 }
18762
18763 /* Return true if a pseudo register should be created and used to hold
18764 the GOT address for PIC code. */
18765
18766 bool
18767 aarch64_use_pseudo_pic_reg (void)
18768 {
18769 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18770 }
18771
18772 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18773
18774 static int
18775 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18776 {
18777 switch (XINT (x, 1))
18778 {
18779 case UNSPEC_GOTSMALLPIC:
18780 case UNSPEC_GOTSMALLPIC28K:
18781 case UNSPEC_GOTTINYPIC:
18782 return 0;
18783 default:
18784 break;
18785 }
18786
18787 return default_unspec_may_trap_p (x, flags);
18788 }
18789
18790
18791 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18792 return the log2 of that value. Otherwise return -1. */
18793
18794 int
18795 aarch64_fpconst_pow_of_2 (rtx x)
18796 {
18797 const REAL_VALUE_TYPE *r;
18798
18799 if (!CONST_DOUBLE_P (x))
18800 return -1;
18801
18802 r = CONST_DOUBLE_REAL_VALUE (x);
18803
18804 if (REAL_VALUE_NEGATIVE (*r)
18805 || REAL_VALUE_ISNAN (*r)
18806 || REAL_VALUE_ISINF (*r)
18807 || !real_isinteger (r, DFmode))
18808 return -1;
18809
18810 return exact_log2 (real_to_integer (r));
18811 }
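/* For illustration: 4.0 yields 2 and 1.0 yields 0, whereas 0.5 (not an
   integer), 3.0 (not a power of two) and -8.0 (negative) all yield -1.  */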
18812
18813 /* If X is a vector of equal CONST_DOUBLE values and that value is
18814 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18815
18816 int
18817 aarch64_vec_fpconst_pow_of_2 (rtx x)
18818 {
18819 int nelts;
18820 if (GET_CODE (x) != CONST_VECTOR
18821 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18822 return -1;
18823
18824 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18825 return -1;
18826
18827 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18828 if (firstval <= 0)
18829 return -1;
18830
18831 for (int i = 1; i < nelts; i++)
18832 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18833 return -1;
18834
18835 return firstval;
18836 }
18837
18838 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18839 to float.
18840
18841 __fp16 always promotes through this hook.
18842 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18843 through the generic excess precision logic rather than here. */
18844
18845 static tree
18846 aarch64_promoted_type (const_tree t)
18847 {
18848 if (SCALAR_FLOAT_TYPE_P (t)
18849 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18850 return float_type_node;
18851
18852 return NULL_TREE;
18853 }
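/* A minimal usage sketch (hypothetical user code, not part of GCC):

     __fp16 a, b;
     float  sum (void) { return a + b; }

   Because of the promotion above, the addition is performed in float and
   only the operands are converted from half precision.  */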
18854
18855 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18856
18857 static bool
18858 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18859 optimization_type opt_type)
18860 {
18861 switch (op)
18862 {
18863 case rsqrt_optab:
18864 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18865
18866 default:
18867 return true;
18868 }
18869 }
18870
18871 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18872
18873 static unsigned int
18874 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18875 int *offset)
18876 {
18877 /* Polynomial invariant 1 == (VG / 2) - 1. */
18878 gcc_assert (i == 1);
18879 *factor = 2;
18880 *offset = 1;
18881 return AARCH64_DWARF_VG;
18882 }
18883
18884 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18885 if MODE is HFmode, and punt to the generic implementation otherwise. */
18886
18887 static bool
18888 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18889 {
18890 return (mode == HFmode
18891 ? true
18892 : default_libgcc_floating_mode_supported_p (mode));
18893 }
18894
18895 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18896 if MODE is HFmode, and punt to the generic implementation otherwise. */
18897
18898 static bool
18899 aarch64_scalar_mode_supported_p (scalar_mode mode)
18900 {
18901 return (mode == HFmode
18902 ? true
18903 : default_scalar_mode_supported_p (mode));
18904 }
18905
18906 /* Set the value of FLT_EVAL_METHOD.
18907 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18908
18909 0: evaluate all operations and constants, whose semantic type has at
18910 most the range and precision of type float, to the range and
18911 precision of float; evaluate all other operations and constants to
18912 the range and precision of the semantic type;
18913
18914 N, where _FloatN is a supported interchange floating type:
18915 evaluate all operations and constants, whose semantic type has at
18916 most the range and precision of _FloatN type, to the range and
18917 precision of the _FloatN type; evaluate all other operations and
18918 constants to the range and precision of the semantic type;
18919
18920 If we have the ARMv8.2-A extensions then we support _Float16 in native
18921 precision, so we should set this to 16. Otherwise, we support the type,
18922 but want to evaluate expressions in float precision, so set this to
18923 0. */
18924
18925 static enum flt_eval_method
18926 aarch64_excess_precision (enum excess_precision_type type)
18927 {
18928 switch (type)
18929 {
18930 case EXCESS_PRECISION_TYPE_FAST:
18931 case EXCESS_PRECISION_TYPE_STANDARD:
18932 /* We can calculate either in 16-bit range and precision or
18933 32-bit range and precision. Make that decision based on whether
18934 we have native support for the ARMv8.2-A 16-bit floating-point
18935 instructions or not. */
18936 return (TARGET_FP_F16INST
18937 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18938 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18939 case EXCESS_PRECISION_TYPE_IMPLICIT:
18940 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18941 default:
18942 gcc_unreachable ();
18943 }
18944 return FLT_EVAL_METHOD_UNPREDICTABLE;
18945 }
18946
18947 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18948 scheduled for speculative execution. Reject the long-running division
18949 and square-root instructions. */
18950
18951 static bool
18952 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18953 {
18954 switch (get_attr_type (insn))
18955 {
18956 case TYPE_SDIV:
18957 case TYPE_UDIV:
18958 case TYPE_FDIVS:
18959 case TYPE_FDIVD:
18960 case TYPE_FSQRTS:
18961 case TYPE_FSQRTD:
18962 case TYPE_NEON_FP_SQRT_S:
18963 case TYPE_NEON_FP_SQRT_D:
18964 case TYPE_NEON_FP_SQRT_S_Q:
18965 case TYPE_NEON_FP_SQRT_D_Q:
18966 case TYPE_NEON_FP_DIV_S:
18967 case TYPE_NEON_FP_DIV_D:
18968 case TYPE_NEON_FP_DIV_S_Q:
18969 case TYPE_NEON_FP_DIV_D_Q:
18970 return false;
18971 default:
18972 return true;
18973 }
18974 }
18975
18976 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18977
18978 static int
18979 aarch64_compute_pressure_classes (reg_class *classes)
18980 {
18981 int i = 0;
18982 classes[i++] = GENERAL_REGS;
18983 classes[i++] = FP_REGS;
18984 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18985 registers need to go in PR_LO_REGS at some point during their
18986 lifetime. Splitting it into two halves has the effect of making
18987 all predicates count against PR_LO_REGS, so that we try whenever
18988 possible to restrict the number of live predicates to 8. This
18989 greatly reduces the amount of spilling in certain loops. */
18990 classes[i++] = PR_LO_REGS;
18991 classes[i++] = PR_HI_REGS;
18992 return i;
18993 }
18994
18995 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18996
18997 static bool
18998 aarch64_can_change_mode_class (machine_mode from,
18999 machine_mode to, reg_class_t)
19000 {
19001 if (BYTES_BIG_ENDIAN)
19002 {
19003 bool from_sve_p = aarch64_sve_data_mode_p (from);
19004 bool to_sve_p = aarch64_sve_data_mode_p (to);
19005
19006 /* Don't allow changes between SVE data modes and non-SVE modes.
19007 See the comment at the head of aarch64-sve.md for details. */
19008 if (from_sve_p != to_sve_p)
19009 return false;
19010
19011 /* Don't allow changes in element size: lane 0 of the new vector
19012 would not then be lane 0 of the old vector. See the comment
19013 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19014 description.
19015
19016 In the worst case, this forces a register to be spilled in
19017 one mode and reloaded in the other, which handles the
19018 endianness correctly. */
19019 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19020 return false;
19021 }
19022 return true;
19023 }
19024
19025 /* Implement TARGET_EARLY_REMAT_MODES. */
19026
19027 static void
19028 aarch64_select_early_remat_modes (sbitmap modes)
19029 {
19030 /* SVE values are not normally live across a call, so it should be
19031 worth doing early rematerialization even in VL-specific mode. */
19032 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19033 {
19034 machine_mode mode = (machine_mode) i;
19035 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19036 if (vec_flags & VEC_ANY_SVE)
19037 bitmap_set_bit (modes, i);
19038 }
19039 }
19040
19041 /* Override the default target speculation_safe_value. */
19042 static rtx
19043 aarch64_speculation_safe_value (machine_mode mode,
19044 rtx result, rtx val, rtx failval)
19045 {
19046 /* Maybe we should warn if falling back to hard barriers. They are
19047 likely to be noticeably more expensive than the alternative below. */
19048 if (!aarch64_track_speculation)
19049 return default_speculation_safe_value (mode, result, val, failval);
19050
19051 if (!REG_P (val))
19052 val = copy_to_mode_reg (mode, val);
19053
19054 if (!aarch64_reg_or_zero (failval, mode))
19055 failval = copy_to_mode_reg (mode, failval);
19056
19057 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19058 return result;
19059 }
19060
19061 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19062 Look into the tuning structure for an estimate.
19063 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19064 Advanced SIMD 128 bits. */
19065
19066 static HOST_WIDE_INT
19067 aarch64_estimated_poly_value (poly_int64 val)
19068 {
19069 enum aarch64_sve_vector_bits_enum width_source
19070 = aarch64_tune_params.sve_width;
19071
19072 /* If we still don't have an estimate, use the default. */
19073 if (width_source == SVE_SCALABLE)
19074 return default_estimated_poly_value (val);
19075
19076 HOST_WIDE_INT over_128 = width_source - 128;
19077 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19078 }
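/* Worked example: if the tuning structure reports sve_width == 256 then
   over_128 is 128, so a poly_int64 of (16 + 16x) -- the number of bytes in
   one SVE vector -- is estimated as 16 + 16 * 128 / 128 = 32.  */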
19079
19080
19081 /* Return true for types that could be supported as SIMD return or
19082 argument types. */
19083
19084 static bool
19085 supported_simd_type (tree t)
19086 {
19087 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19088 {
19089 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19090 return s == 1 || s == 2 || s == 4 || s == 8;
19091 }
19092 return false;
19093 }
19094
19095 /* Return true for types that currently are supported as SIMD return
19096 or argument types. */
19097
19098 static bool
19099 currently_supported_simd_type (tree t, tree b)
19100 {
19101 if (COMPLEX_FLOAT_TYPE_P (t))
19102 return false;
19103
19104 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19105 return false;
19106
19107 return supported_simd_type (t);
19108 }
19109
19110 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19111
19112 static int
19113 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19114 struct cgraph_simd_clone *clonei,
19115 tree base_type, int num)
19116 {
19117 tree t, ret_type, arg_type;
19118 unsigned int elt_bits, vec_bits, count;
19119
19120 if (!TARGET_SIMD)
19121 return 0;
19122
19123 if (clonei->simdlen
19124 && (clonei->simdlen < 2
19125 || clonei->simdlen > 1024
19126 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19127 {
19128 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19129 "unsupported simdlen %d", clonei->simdlen);
19130 return 0;
19131 }
19132
19133 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19134 if (TREE_CODE (ret_type) != VOID_TYPE
19135 && !currently_supported_simd_type (ret_type, base_type))
19136 {
19137 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19138 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19139 "GCC does not currently support mixed size types "
19140 "for %<simd%> functions");
19141 else if (supported_simd_type (ret_type))
19142 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19143 "GCC does not currently support return type %qT "
19144 "for %<simd%> functions", ret_type);
19145 else
19146 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19147 "unsupported return type %qT for %<simd%> functions",
19148 ret_type);
19149 return 0;
19150 }
19151
19152 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19153 {
19154 arg_type = TREE_TYPE (t);
19155
19156 if (!currently_supported_simd_type (arg_type, base_type))
19157 {
19158 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19159 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19160 "GCC does not currently support mixed size types "
19161 "for %<simd%> functions");
19162 else
19163 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19164 "GCC does not currently support argument type %qT "
19165 "for %<simd%> functions", arg_type);
19166 return 0;
19167 }
19168 }
19169
19170 clonei->vecsize_mangle = 'n';
19171 clonei->mask_mode = VOIDmode;
19172 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19173 if (clonei->simdlen == 0)
19174 {
19175 count = 2;
19176 vec_bits = (num == 0 ? 64 : 128);
19177 clonei->simdlen = vec_bits / elt_bits;
19178 }
19179 else
19180 {
19181 count = 1;
19182 vec_bits = clonei->simdlen * elt_bits;
19183 if (vec_bits != 64 && vec_bits != 128)
19184 {
19185 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19186 "GCC does not currently support simdlen %d for type %qT",
19187 clonei->simdlen, base_type);
19188 return 0;
19189 }
19190 }
19191 clonei->vecsize_int = vec_bits;
19192 clonei->vecsize_float = vec_bits;
19193 return count;
19194 }
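/* Illustrative (hypothetical) use: for

     #pragma omp declare simd
     float my_fma (float x, float y, float z) { return x * y + z; }

   no simdlen clause is given and the base type is 32 bits wide, so the hook
   above returns a count of 2, producing one clone with simdlen 2 (64-bit
   vectors) and one with simdlen 4 (128-bit vectors).  */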
19195
19196 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19197
19198 static void
19199 aarch64_simd_clone_adjust (struct cgraph_node *node)
19200 {
19201 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19202 use the correct ABI. */
19203
19204 tree t = TREE_TYPE (node->decl);
19205 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19206 TYPE_ATTRIBUTES (t));
19207 }
19208
19209 /* Implement TARGET_SIMD_CLONE_USABLE. */
19210
19211 static int
19212 aarch64_simd_clone_usable (struct cgraph_node *node)
19213 {
19214 switch (node->simdclone->vecsize_mangle)
19215 {
19216 case 'n':
19217 if (!TARGET_SIMD)
19218 return -1;
19219 return 0;
19220 default:
19221 gcc_unreachable ();
19222 }
19223 }
19224
19225 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19226
19227 static int
19228 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19229 {
19230 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19231 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19232 return 0;
19233 return 1;
19234 }
19235
19236 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19237
19238 static const char *
19239 aarch64_get_multilib_abi_name (void)
19240 {
19241 if (TARGET_BIG_END)
19242 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19243 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19244 }
19245
19246 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
19247 global variable based guard use the default else
19248 return a null tree. */
19249 static tree
19250 aarch64_stack_protect_guard (void)
19251 {
19252 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19253 return default_stack_protect_guard ();
19254
19255 return NULL_TREE;
19256 }
19257
19258 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19259 section at the end if needed. */
19260 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19261 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19262 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19263 void
19264 aarch64_file_end_indicate_exec_stack ()
19265 {
19266 file_end_indicate_exec_stack ();
19267
19268 unsigned feature_1_and = 0;
19269 if (aarch64_bti_enabled ())
19270 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19271
19272 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19273 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19274
19275 if (feature_1_and)
19276 {
19277 /* Generate .note.gnu.property section. */
19278 switch_to_section (get_section (".note.gnu.property",
19279 SECTION_NOTYPE, NULL));
19280
19281 /* PT_NOTE header: namesz, descsz, type.
19282 namesz = 4 ("GNU\0")
19283 descsz = 16 (Size of the program property array)
19284 [(12 + padding) * Number of array elements]
19285 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19286 assemble_align (POINTER_SIZE);
19287 assemble_integer (GEN_INT (4), 4, 32, 1);
19288 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19289 assemble_integer (GEN_INT (5), 4, 32, 1);
19290
19291 /* PT_NOTE name. */
19292 assemble_string ("GNU", 4);
19293
19294 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19295 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19296 datasz = 4
19297 data = feature_1_and. */
19298 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19299 assemble_integer (GEN_INT (4), 4, 32, 1);
19300 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19301
19302 /* Pad the size of the note to the required alignment. */
19303 assemble_align (POINTER_SIZE);
19304 }
19305 }
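/* When both BTI and PAC are enabled, the note emitted above corresponds
   roughly to the following assembly (a sketch; the exact directives depend
   on the assembler output machinery):

     .section .note.gnu.property,"a"
     .p2align 3
     .word 4            // namesz ("GNU\0")
     .word 16           // descsz, ROUND_UP (12, POINTER_BYTES)
     .word 5            // NT_GNU_PROPERTY_TYPE_0
     .asciz "GNU"
     .word 0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word 4            // datasz
     .word 3            // BTI | PAC
     .p2align 3  */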
19306 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19307 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19308 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
19309
19310 /* Target-specific selftests. */
19311
19312 #if CHECKING_P
19313
19314 namespace selftest {
19315
19316 /* Selftest for the RTL loader.
19317 Verify that the RTL loader copes with a dump from
19318 print_rtx_function. This is essentially just a test that class
19319 function_reader can handle a real dump, but it also verifies
19320 that lookup_reg_by_dump_name correctly handles hard regs.
19321 The presence of hard reg names in the dump means that the test is
19322 target-specific, hence it is in this file. */
19323
19324 static void
19325 aarch64_test_loading_full_dump ()
19326 {
19327 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19328
19329 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19330
19331 rtx_insn *insn_1 = get_insn_by_uid (1);
19332 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19333
19334 rtx_insn *insn_15 = get_insn_by_uid (15);
19335 ASSERT_EQ (INSN, GET_CODE (insn_15));
19336 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19337
19338 /* Verify crtl->return_rtx. */
19339 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19340 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19341 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19342 }
19343
19344 /* Run all target-specific selftests. */
19345
19346 static void
19347 aarch64_run_selftests (void)
19348 {
19349 aarch64_test_loading_full_dump ();
19350 }
19351
19352 } // namespace selftest
19353
19354 #endif /* #if CHECKING_P */
19355
19356 #undef TARGET_STACK_PROTECT_GUARD
19357 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19358
19359 #undef TARGET_ADDRESS_COST
19360 #define TARGET_ADDRESS_COST aarch64_address_cost
19361
19362 /* This hook determines whether unnamed bitfields affect the alignment
19363 of the containing structure. The hook returns true if the structure
19364 should inherit the alignment requirements of an unnamed bitfield's
19365 type. */
19366 #undef TARGET_ALIGN_ANON_BITFIELD
19367 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19368
19369 #undef TARGET_ASM_ALIGNED_DI_OP
19370 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19371
19372 #undef TARGET_ASM_ALIGNED_HI_OP
19373 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19374
19375 #undef TARGET_ASM_ALIGNED_SI_OP
19376 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19377
19378 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19379 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19380 hook_bool_const_tree_hwi_hwi_const_tree_true
19381
19382 #undef TARGET_ASM_FILE_START
19383 #define TARGET_ASM_FILE_START aarch64_start_file
19384
19385 #undef TARGET_ASM_OUTPUT_MI_THUNK
19386 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19387
19388 #undef TARGET_ASM_SELECT_RTX_SECTION
19389 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19390
19391 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19392 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19393
19394 #undef TARGET_BUILD_BUILTIN_VA_LIST
19395 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19396
19397 #undef TARGET_CALLEE_COPIES
19398 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19399
19400 #undef TARGET_CAN_ELIMINATE
19401 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19402
19403 #undef TARGET_CAN_INLINE_P
19404 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19405
19406 #undef TARGET_CANNOT_FORCE_CONST_MEM
19407 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19408
19409 #undef TARGET_CASE_VALUES_THRESHOLD
19410 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19411
19412 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19413 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19414
19415 /* Only the least significant bit is used for initialization guard
19416 variables. */
19417 #undef TARGET_CXX_GUARD_MASK_BIT
19418 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19419
19420 #undef TARGET_C_MODE_FOR_SUFFIX
19421 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19422
19423 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19424 #undef TARGET_DEFAULT_TARGET_FLAGS
19425 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19426 #endif
19427
19428 #undef TARGET_CLASS_MAX_NREGS
19429 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19430
19431 #undef TARGET_BUILTIN_DECL
19432 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19433
19434 #undef TARGET_BUILTIN_RECIPROCAL
19435 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19436
19437 #undef TARGET_C_EXCESS_PRECISION
19438 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19439
19440 #undef TARGET_EXPAND_BUILTIN
19441 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19442
19443 #undef TARGET_EXPAND_BUILTIN_VA_START
19444 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19445
19446 #undef TARGET_FOLD_BUILTIN
19447 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19448
19449 #undef TARGET_FUNCTION_ARG
19450 #define TARGET_FUNCTION_ARG aarch64_function_arg
19451
19452 #undef TARGET_FUNCTION_ARG_ADVANCE
19453 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19454
19455 #undef TARGET_FUNCTION_ARG_BOUNDARY
19456 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19457
19458 #undef TARGET_FUNCTION_ARG_PADDING
19459 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19460
19461 #undef TARGET_GET_RAW_RESULT_MODE
19462 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19463 #undef TARGET_GET_RAW_ARG_MODE
19464 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19465
19466 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19467 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19468
19469 #undef TARGET_FUNCTION_VALUE
19470 #define TARGET_FUNCTION_VALUE aarch64_function_value
19471
19472 #undef TARGET_FUNCTION_VALUE_REGNO_P
19473 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19474
19475 #undef TARGET_GIMPLE_FOLD_BUILTIN
19476 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19477
19478 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19479 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19480
19481 #undef TARGET_INIT_BUILTINS
19482 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19483
19484 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19485 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19486 aarch64_ira_change_pseudo_allocno_class
19487
19488 #undef TARGET_LEGITIMATE_ADDRESS_P
19489 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19490
19491 #undef TARGET_LEGITIMATE_CONSTANT_P
19492 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19493
19494 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19495 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19496 aarch64_legitimize_address_displacement
19497
19498 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19499 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19500
19501 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19502 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19503 aarch64_libgcc_floating_mode_supported_p
19504
19505 #undef TARGET_MANGLE_TYPE
19506 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19507
19508 #undef TARGET_MEMORY_MOVE_COST
19509 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19510
19511 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19512 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19513
19514 #undef TARGET_MUST_PASS_IN_STACK
19515 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19516
19517 /* This target hook should return true if accesses to volatile bitfields
19518 should use the narrowest mode possible. It should return false if these
19519 accesses should use the bitfield container type. */
19520 #undef TARGET_NARROW_VOLATILE_BITFIELD
19521 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
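/* For example, given
     struct s { volatile unsigned int f : 8; } x;
   returning false means an access to x.f is performed with the width of
   the declared container type (a 32-bit "unsigned int" access) rather
   than as a single-byte access.  */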
19522
19523 #undef TARGET_OPTION_OVERRIDE
19524 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19525
19526 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19527 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19528 aarch64_override_options_after_change
19529
19530 #undef TARGET_OPTION_SAVE
19531 #define TARGET_OPTION_SAVE aarch64_option_save
19532
19533 #undef TARGET_OPTION_RESTORE
19534 #define TARGET_OPTION_RESTORE aarch64_option_restore
19535
19536 #undef TARGET_OPTION_PRINT
19537 #define TARGET_OPTION_PRINT aarch64_option_print
19538
19539 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19540 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19541
19542 #undef TARGET_SET_CURRENT_FUNCTION
19543 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19544
19545 #undef TARGET_PASS_BY_REFERENCE
19546 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19547
19548 #undef TARGET_PREFERRED_RELOAD_CLASS
19549 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19550
19551 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19552 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19553
19554 #undef TARGET_PROMOTED_TYPE
19555 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19556
19557 #undef TARGET_SECONDARY_RELOAD
19558 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19559
19560 #undef TARGET_SHIFT_TRUNCATION_MASK
19561 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19562
19563 #undef TARGET_SETUP_INCOMING_VARARGS
19564 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19565
19566 #undef TARGET_STRUCT_VALUE_RTX
19567 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19568
19569 #undef TARGET_REGISTER_MOVE_COST
19570 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19571
19572 #undef TARGET_RETURN_IN_MEMORY
19573 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19574
19575 #undef TARGET_RETURN_IN_MSB
19576 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19577
19578 #undef TARGET_RTX_COSTS
19579 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19580
19581 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19582 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19583
19584 #undef TARGET_SCHED_ISSUE_RATE
19585 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19586
19587 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19588 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19589 aarch64_sched_first_cycle_multipass_dfa_lookahead
19590
19591 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19592 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19593 aarch64_first_cycle_multipass_dfa_lookahead_guard
19594
19595 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19596 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19597 aarch64_get_separate_components
19598
19599 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19600 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19601 aarch64_components_for_bb
19602
19603 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19604 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19605 aarch64_disqualify_components
19606
19607 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19608 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19609 aarch64_emit_prologue_components
19610
19611 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19612 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19613 aarch64_emit_epilogue_components
19614
19615 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19616 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19617 aarch64_set_handled_components
19618
19619 #undef TARGET_TRAMPOLINE_INIT
19620 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19621
19622 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19623 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19624
19625 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19626 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19627
19628 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19629 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19630 aarch64_builtin_support_vector_misalignment
19631
19632 #undef TARGET_ARRAY_MODE
19633 #define TARGET_ARRAY_MODE aarch64_array_mode
19634
19635 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19636 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19637
19638 #undef TARGET_VECTORIZE_ADD_STMT_COST
19639 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19640
19641 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19642 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19643 aarch64_builtin_vectorization_cost
19644
19645 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19646 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19647
19648 #undef TARGET_VECTORIZE_BUILTINS
19649 #define TARGET_VECTORIZE_BUILTINS
19650
19651 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19652 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19653 aarch64_builtin_vectorized_function
19654
19655 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19656 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19657 aarch64_autovectorize_vector_sizes
19658
19659 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19660 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19661 aarch64_atomic_assign_expand_fenv
19662
19663 /* Section anchor support. */
19664
19665 #undef TARGET_MIN_ANCHOR_OFFSET
19666 #define TARGET_MIN_ANCHOR_OFFSET -256
19667
19668 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19669 byte offset; we can do much more for larger data types, but have no way
19670 to determine the size of the access. We assume accesses are aligned. */
19671 #undef TARGET_MAX_ANCHOR_OFFSET
19672 #define TARGET_MAX_ANCHOR_OFFSET 4095
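/* Together the two offsets give the anchor range [-256, 4095]: -256 is
   the lower bound of the signed 9-bit unscaled (LDUR/STUR) offset, and
   4095 is the largest unsigned 12-bit scaled offset for a byte access.  */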
19673
19674 #undef TARGET_VECTOR_ALIGNMENT
19675 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19676
19677 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19678 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19679 aarch64_vectorize_preferred_vector_alignment
19680 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19681 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19682 aarch64_simd_vector_alignment_reachable
19683
19684 /* vec_perm support. */
19685
19686 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19687 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19688 aarch64_vectorize_vec_perm_const
19689
19690 #undef TARGET_VECTORIZE_GET_MASK_MODE
19691 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19692 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19693 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19694 aarch64_empty_mask_is_expensive
19695 #undef TARGET_PREFERRED_ELSE_VALUE
19696 #define TARGET_PREFERRED_ELSE_VALUE \
19697 aarch64_preferred_else_value
19698
19699 #undef TARGET_INIT_LIBFUNCS
19700 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19701
19702 #undef TARGET_FIXED_CONDITION_CODE_REGS
19703 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19704
19705 #undef TARGET_FLAGS_REGNUM
19706 #define TARGET_FLAGS_REGNUM CC_REGNUM
19707
19708 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19709 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19710
19711 #undef TARGET_ASAN_SHADOW_OFFSET
19712 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19713
19714 #undef TARGET_LEGITIMIZE_ADDRESS
19715 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19716
19717 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19718 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19719
19720 #undef TARGET_CAN_USE_DOLOOP_P
19721 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19722
19723 #undef TARGET_SCHED_ADJUST_PRIORITY
19724 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19725
19726 #undef TARGET_SCHED_MACRO_FUSION_P
19727 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19728
19729 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19730 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19731
19732 #undef TARGET_SCHED_FUSION_PRIORITY
19733 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19734
19735 #undef TARGET_UNSPEC_MAY_TRAP_P
19736 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19737
19738 #undef TARGET_USE_PSEUDO_PIC_REG
19739 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19740
19741 #undef TARGET_PRINT_OPERAND
19742 #define TARGET_PRINT_OPERAND aarch64_print_operand
19743
19744 #undef TARGET_PRINT_OPERAND_ADDRESS
19745 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19746
19747 #undef TARGET_OPTAB_SUPPORTED_P
19748 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19749
19750 #undef TARGET_OMIT_STRUCT_RETURN_REG
19751 #define TARGET_OMIT_STRUCT_RETURN_REG true
19752
19753 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19754 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19755 aarch64_dwarf_poly_indeterminate_value
19756
19757 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
19758 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19759 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
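/* The value 4 is (1 << 2): bit 2 is the tag used at run time to tell a
   pointer to a function descriptor apart from a pointer to ordinary
   code.  */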
19760
19761 #undef TARGET_HARD_REGNO_NREGS
19762 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19763 #undef TARGET_HARD_REGNO_MODE_OK
19764 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19765
19766 #undef TARGET_MODES_TIEABLE_P
19767 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19768
19769 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19770 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19771 aarch64_hard_regno_call_part_clobbered
19772
19773 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19774 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19775 aarch64_remove_extra_call_preserved_regs
19776
19777 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19778 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19779 aarch64_return_call_with_max_clobbers
19780
19781 #undef TARGET_CONSTANT_ALIGNMENT
19782 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19783
19784 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19785 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19786 aarch64_stack_clash_protection_alloca_probe_range
19787
19788 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19789 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19790
19791 #undef TARGET_CAN_CHANGE_MODE_CLASS
19792 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19793
19794 #undef TARGET_SELECT_EARLY_REMAT_MODES
19795 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19796
19797 #undef TARGET_SPECULATION_SAFE_VALUE
19798 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19799
19800 #undef TARGET_ESTIMATED_POLY_VALUE
19801 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19802
19803 #undef TARGET_ATTRIBUTE_TABLE
19804 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19805
19806 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19807 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19808 aarch64_simd_clone_compute_vecsize_and_simdlen
19809
19810 #undef TARGET_SIMD_CLONE_ADJUST
19811 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19812
19813 #undef TARGET_SIMD_CLONE_USABLE
19814 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19815
19816 #undef TARGET_COMP_TYPE_ATTRIBUTES
19817 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19818
19819 #undef TARGET_GET_MULTILIB_ABI_NAME
19820 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19821
19822 #if CHECKING_P
19823 #undef TARGET_RUN_TARGET_SELFTESTS
19824 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19825 #endif /* #if CHECKING_P */
19826
19827 #undef TARGET_ASM_POST_CFI_STARTPROC
19828 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
19829
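/* Collect the hook overrides above into the target vector; any hook not
   overridden here receives its default from TARGET_INITIALIZER, which
   target-def.h provides.  */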
19830 struct gcc_target targetm = TARGET_INITIALIZER;
19831
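/* Garbage-collection root tables for this file, generated by gengtype.  */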
19832 #include "gt-aarch64.h"