1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
83 {
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
86
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
93
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
96
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
100
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
103
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
106
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
111 };
112
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
119 {}
120
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
131 {}
132
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
139 {}
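/* Illustrative examples of the constructors above (for exposition only,
   not exercised here):
   - simd_immediate_info (QImode, 0x2a) could describe a constant whose
     byte elements all equal 0x2a and that is loadable with a single MOVI.
   - The insn/modifier/shift parameters distinguish variants such as an
     MVNI with an "LSL #8" modifier from a plain unshifted MOVI.
   - simd_immediate_info (SImode, base, step) describes a series constant
     whose element I equals BASE + I * STEP, as produced for SVE INDEX.  */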
140
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
143
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
146
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
151
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
169 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
170
171 /* Major revision number of the ARM Architecture implemented by the target. */
172 unsigned aarch64_architecture_version;
173
174 /* The processor for which instructions should be scheduled. */
175 enum aarch64_processor aarch64_tune = cortexa53;
176
177 /* Mask to specify which instruction scheduling options should be used. */
178 unsigned long aarch64_tune_flags = 0;
179
180 /* Global flag for PC relative loads. */
181 bool aarch64_pcrelative_literal_loads;
182
183 /* Global flag for whether frame pointer is enabled. */
184 bool aarch64_use_frame_pointer;
185
186 #define BRANCH_PROTECT_STR_MAX 255
187 char *accepted_branch_protection_string = NULL;
188
189 static enum aarch64_parse_opt_result
190 aarch64_parse_branch_protection (const char*, char**);
191
192 /* Support for command line parsing of boolean flags in the tuning
193 structures. */
194 struct aarch64_flag_desc
195 {
196 const char* name;
197 unsigned int flag;
198 };
199
200 #define AARCH64_FUSION_PAIR(name, internal_name) \
201 { name, AARCH64_FUSE_##internal_name },
202 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
203 {
204 { "none", AARCH64_FUSE_NOTHING },
205 #include "aarch64-fusion-pairs.def"
206 { "all", AARCH64_FUSE_ALL },
207 { NULL, AARCH64_FUSE_NOTHING }
208 };
209
210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
211 { name, AARCH64_EXTRA_TUNE_##internal_name },
212 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
213 {
214 { "none", AARCH64_EXTRA_TUNE_NONE },
215 #include "aarch64-tuning-flags.def"
216 { "all", AARCH64_EXTRA_TUNE_ALL },
217 { NULL, AARCH64_EXTRA_TUNE_NONE }
218 };
219
220 /* Tuning parameters. */
221
222 static const struct cpu_addrcost_table generic_addrcost_table =
223 {
224 {
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
229 },
230 0, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 0, /* register_sextend */
234 0, /* register_zextend */
235 0 /* imm_offset */
236 };
237
238 static const struct cpu_addrcost_table exynosm1_addrcost_table =
239 {
240 {
241 0, /* hi */
242 0, /* si */
243 0, /* di */
244 2, /* ti */
245 },
246 0, /* pre_modify */
247 0, /* post_modify */
248 1, /* register_offset */
249 1, /* register_sextend */
250 2, /* register_zextend */
251 0, /* imm_offset */
252 };
253
254 static const struct cpu_addrcost_table xgene1_addrcost_table =
255 {
256 {
257 1, /* hi */
258 0, /* si */
259 0, /* di */
260 1, /* ti */
261 },
262 1, /* pre_modify */
263 1, /* post_modify */
264 0, /* register_offset */
265 1, /* register_sextend */
266 1, /* register_zextend */
267 0, /* imm_offset */
268 };
269
270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
271 {
272 {
273 1, /* hi */
274 1, /* si */
275 1, /* di */
276 2, /* ti */
277 },
278 0, /* pre_modify */
279 0, /* post_modify */
280 2, /* register_offset */
281 3, /* register_sextend */
282 3, /* register_zextend */
283 0, /* imm_offset */
284 };
285
286 static const struct cpu_addrcost_table tsv110_addrcost_table =
287 {
288 {
289 1, /* hi */
290 0, /* si */
291 0, /* di */
292 1, /* ti */
293 },
294 0, /* pre_modify */
295 0, /* post_modify */
296 0, /* register_offset */
297 1, /* register_sextend */
298 1, /* register_zextend */
299 0, /* imm_offset */
300 };
301
302 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
303 {
304 {
305 1, /* hi */
306 1, /* si */
307 1, /* di */
308 2, /* ti */
309 },
310 1, /* pre_modify */
311 1, /* post_modify */
312 3, /* register_offset */
313 3, /* register_sextend */
314 3, /* register_zextend */
315 2, /* imm_offset */
316 };
317
318 static const struct cpu_regmove_cost generic_regmove_cost =
319 {
320 1, /* GP2GP */
321 /* Avoid the use of slow int<->fp moves for spilling by setting
322 their cost higher than memmov_cost. */
323 5, /* GP2FP */
324 5, /* FP2GP */
325 2 /* FP2FP */
326 };
327
328 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 {
330 1, /* GP2GP */
331 /* Avoid the use of slow int<->fp moves for spilling by setting
332 their cost higher than memmov_cost. */
333 5, /* GP2FP */
334 5, /* FP2GP */
335 2 /* FP2FP */
336 };
337
338 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 {
340 1, /* GP2GP */
341 /* Avoid the use of slow int<->fp moves for spilling by setting
342 their cost higher than memmov_cost. */
343 5, /* GP2FP */
344 5, /* FP2GP */
345 2 /* FP2FP */
346 };
347
348 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 {
350 1, /* GP2GP */
351 /* Avoid the use of slow int<->fp moves for spilling by setting
352        their cost higher than memmov_cost (actual costs are 4 and 9). */
353 9, /* GP2FP */
354 9, /* FP2GP */
355 1 /* FP2FP */
356 };
357
358 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 {
360 2, /* GP2GP */
361 2, /* GP2FP */
362 6, /* FP2GP */
363 4 /* FP2FP */
364 };
365
366 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 {
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost. */
371 8, /* GP2FP */
372 8, /* FP2GP */
373 2 /* FP2FP */
374 };
375
376 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 {
378 2, /* GP2GP */
379 /* Avoid the use of int<->fp moves for spilling. */
380 6, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of int<->fp moves for spilling. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 4 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost tsv110_regmove_cost =
395 {
396 1, /* GP2GP */
397 /* Avoid the use of slow int<->fp moves for spilling by setting
398 their cost higher than memmov_cost. */
399 2, /* GP2FP */
400 3, /* FP2GP */
401 2 /* FP2FP */
402 };
403
404 /* Generic costs for vector insn classes. */
405 static const struct cpu_vector_cost generic_vector_cost =
406 {
407 1, /* scalar_int_stmt_cost */
408 1, /* scalar_fp_stmt_cost */
409 1, /* scalar_load_cost */
410 1, /* scalar_store_cost */
411 1, /* vec_int_stmt_cost */
412 1, /* vec_fp_stmt_cost */
413 2, /* vec_permute_cost */
414 1, /* vec_to_scalar_cost */
415 1, /* scalar_to_vec_cost */
416 1, /* vec_align_load_cost */
417 1, /* vec_unalign_load_cost */
418 1, /* vec_unalign_store_cost */
419 1, /* vec_store_cost */
420 3, /* cond_taken_branch_cost */
421 1 /* cond_not_taken_branch_cost */
422 };
423
424 /* QDF24XX costs for vector insn classes. */
425 static const struct cpu_vector_cost qdf24xx_vector_cost =
426 {
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 1, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 1, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 2, /* vec_permute_cost */
434 1, /* vec_to_scalar_cost */
435 1, /* scalar_to_vec_cost */
436 1, /* vec_align_load_cost */
437 1, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 3, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
442 };
443
444 /* ThunderX costs for vector insn classes. */
445 static const struct cpu_vector_cost thunderx_vector_cost =
446 {
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 3, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 4, /* vec_int_stmt_cost */
452 1, /* vec_fp_stmt_cost */
453 4, /* vec_permute_cost */
454 2, /* vec_to_scalar_cost */
455 2, /* scalar_to_vec_cost */
456 3, /* vec_align_load_cost */
457 5, /* vec_unalign_load_cost */
458 5, /* vec_unalign_store_cost */
459 1, /* vec_store_cost */
460 3, /* cond_taken_branch_cost */
461 3 /* cond_not_taken_branch_cost */
462 };
463
464 static const struct cpu_vector_cost tsv110_vector_cost =
465 {
466 1, /* scalar_int_stmt_cost */
467 1, /* scalar_fp_stmt_cost */
468 5, /* scalar_load_cost */
469 1, /* scalar_store_cost */
470 2, /* vec_int_stmt_cost */
471 2, /* vec_fp_stmt_cost */
472 2, /* vec_permute_cost */
473 3, /* vec_to_scalar_cost */
474 2, /* scalar_to_vec_cost */
475 5, /* vec_align_load_cost */
476 5, /* vec_unalign_load_cost */
477 1, /* vec_unalign_store_cost */
478 1, /* vec_store_cost */
479 1, /* cond_taken_branch_cost */
480 1 /* cond_not_taken_branch_cost */
481 };
482
483 /* Generic costs for vector insn classes. */
484 static const struct cpu_vector_cost cortexa57_vector_cost =
485 {
486 1, /* scalar_int_stmt_cost */
487 1, /* scalar_fp_stmt_cost */
488 4, /* scalar_load_cost */
489 1, /* scalar_store_cost */
490 2, /* vec_int_stmt_cost */
491 2, /* vec_fp_stmt_cost */
492 3, /* vec_permute_cost */
493 8, /* vec_to_scalar_cost */
494 8, /* scalar_to_vec_cost */
495 4, /* vec_align_load_cost */
496 4, /* vec_unalign_load_cost */
497 1, /* vec_unalign_store_cost */
498 1, /* vec_store_cost */
499 1, /* cond_taken_branch_cost */
500 1 /* cond_not_taken_branch_cost */
501 };
502
503 static const struct cpu_vector_cost exynosm1_vector_cost =
504 {
505 1, /* scalar_int_stmt_cost */
506 1, /* scalar_fp_stmt_cost */
507 5, /* scalar_load_cost */
508 1, /* scalar_store_cost */
509 3, /* vec_int_stmt_cost */
510 3, /* vec_fp_stmt_cost */
511 3, /* vec_permute_cost */
512 3, /* vec_to_scalar_cost */
513 3, /* scalar_to_vec_cost */
514 5, /* vec_align_load_cost */
515 5, /* vec_unalign_load_cost */
516 1, /* vec_unalign_store_cost */
517 1, /* vec_store_cost */
518 1, /* cond_taken_branch_cost */
519 1 /* cond_not_taken_branch_cost */
520 };
521
522 /* Generic costs for vector insn classes. */
523 static const struct cpu_vector_cost xgene1_vector_cost =
524 {
525 1, /* scalar_int_stmt_cost */
526 1, /* scalar_fp_stmt_cost */
527 5, /* scalar_load_cost */
528 1, /* scalar_store_cost */
529 2, /* vec_int_stmt_cost */
530 2, /* vec_fp_stmt_cost */
531 2, /* vec_permute_cost */
532 4, /* vec_to_scalar_cost */
533 4, /* scalar_to_vec_cost */
534 10, /* vec_align_load_cost */
535 10, /* vec_unalign_load_cost */
536 2, /* vec_unalign_store_cost */
537 2, /* vec_store_cost */
538 2, /* cond_taken_branch_cost */
539 1 /* cond_not_taken_branch_cost */
540 };
541
542 /* Costs for vector insn classes for Vulcan. */
543 static const struct cpu_vector_cost thunderx2t99_vector_cost =
544 {
545 1, /* scalar_int_stmt_cost */
546 6, /* scalar_fp_stmt_cost */
547 4, /* scalar_load_cost */
548 1, /* scalar_store_cost */
549 5, /* vec_int_stmt_cost */
550 6, /* vec_fp_stmt_cost */
551 3, /* vec_permute_cost */
552 6, /* vec_to_scalar_cost */
553 5, /* scalar_to_vec_cost */
554 8, /* vec_align_load_cost */
555 8, /* vec_unalign_load_cost */
556 4, /* vec_unalign_store_cost */
557 4, /* vec_store_cost */
558 2, /* cond_taken_branch_cost */
559 1 /* cond_not_taken_branch_cost */
560 };
561
562 /* Generic costs for branch instructions. */
563 static const struct cpu_branch_cost generic_branch_cost =
564 {
565 1, /* Predictable. */
566 3 /* Unpredictable. */
567 };
568
569 /* Generic approximation modes. */
570 static const cpu_approx_modes generic_approx_modes =
571 {
572 AARCH64_APPROX_NONE, /* division */
573 AARCH64_APPROX_NONE, /* sqrt */
574 AARCH64_APPROX_NONE /* recip_sqrt */
575 };
576
577 /* Approximation modes for Exynos M1. */
578 static const cpu_approx_modes exynosm1_approx_modes =
579 {
580 AARCH64_APPROX_NONE, /* division */
581 AARCH64_APPROX_ALL, /* sqrt */
582 AARCH64_APPROX_ALL /* recip_sqrt */
583 };
584
585 /* Approximation modes for X-Gene 1. */
586 static const cpu_approx_modes xgene1_approx_modes =
587 {
588 AARCH64_APPROX_NONE, /* division */
589 AARCH64_APPROX_NONE, /* sqrt */
590 AARCH64_APPROX_ALL /* recip_sqrt */
591 };
592
593 /* Generic prefetch settings (which disable prefetch). */
594 static const cpu_prefetch_tune generic_prefetch_tune =
595 {
596 0, /* num_slots */
597 -1, /* l1_cache_size */
598 -1, /* l1_cache_line_size */
599 -1, /* l2_cache_size */
600 true, /* prefetch_dynamic_strides */
601 -1, /* minimum_stride */
602 -1 /* default_opt_level */
603 };
604
605 static const cpu_prefetch_tune exynosm1_prefetch_tune =
606 {
607 0, /* num_slots */
608 -1, /* l1_cache_size */
609 64, /* l1_cache_line_size */
610 -1, /* l2_cache_size */
611 true, /* prefetch_dynamic_strides */
612 -1, /* minimum_stride */
613 -1 /* default_opt_level */
614 };
615
616 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
617 {
618 4, /* num_slots */
619 32, /* l1_cache_size */
620 64, /* l1_cache_line_size */
621 512, /* l2_cache_size */
622 false, /* prefetch_dynamic_strides */
623 2048, /* minimum_stride */
624 3 /* default_opt_level */
625 };
626
627 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
628 {
629 8, /* num_slots */
630 32, /* l1_cache_size */
631 128, /* l1_cache_line_size */
632 16*1024, /* l2_cache_size */
633 true, /* prefetch_dynamic_strides */
634 -1, /* minimum_stride */
635 3 /* default_opt_level */
636 };
637
638 static const cpu_prefetch_tune thunderx_prefetch_tune =
639 {
640 8, /* num_slots */
641 32, /* l1_cache_size */
642 128, /* l1_cache_line_size */
643 -1, /* l2_cache_size */
644 true, /* prefetch_dynamic_strides */
645 -1, /* minimum_stride */
646 -1 /* default_opt_level */
647 };
648
649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
650 {
651 8, /* num_slots */
652 32, /* l1_cache_size */
653 64, /* l1_cache_line_size */
654 256, /* l2_cache_size */
655 true, /* prefetch_dynamic_strides */
656 -1, /* minimum_stride */
657 -1 /* default_opt_level */
658 };
659
660 static const cpu_prefetch_tune tsv110_prefetch_tune =
661 {
662 0, /* num_slots */
663 64, /* l1_cache_size */
664 64, /* l1_cache_line_size */
665 512, /* l2_cache_size */
666 true, /* prefetch_dynamic_strides */
667 -1, /* minimum_stride */
668 -1 /* default_opt_level */
669 };
670
671 static const cpu_prefetch_tune xgene1_prefetch_tune =
672 {
673 8, /* num_slots */
674 32, /* l1_cache_size */
675 64, /* l1_cache_line_size */
676 256, /* l2_cache_size */
677 true, /* prefetch_dynamic_strides */
678 -1, /* minimum_stride */
679 -1 /* default_opt_level */
680 };
681
682 static const struct tune_params generic_tunings =
683 {
684 &cortexa57_extra_costs,
685 &generic_addrcost_table,
686 &generic_regmove_cost,
687 &generic_vector_cost,
688 &generic_branch_cost,
689 &generic_approx_modes,
690 SVE_NOT_IMPLEMENTED, /* sve_width */
691 4, /* memmov_cost */
692 2, /* issue_rate */
693 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
694 "8", /* function_align. */
695 "4", /* jump_align. */
696 "8", /* loop_align. */
697 2, /* int_reassoc_width. */
698 4, /* fp_reassoc_width. */
699 1, /* vec_reassoc_width. */
700 2, /* min_div_recip_mul_sf. */
701 2, /* min_div_recip_mul_df. */
702 0, /* max_case_values. */
703 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
704 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
705 &generic_prefetch_tune
706 };
707
708 static const struct tune_params cortexa35_tunings =
709 {
710 &cortexa53_extra_costs,
711 &generic_addrcost_table,
712 &cortexa53_regmove_cost,
713 &generic_vector_cost,
714 &generic_branch_cost,
715 &generic_approx_modes,
716 SVE_NOT_IMPLEMENTED, /* sve_width */
717 4, /* memmov_cost */
718 1, /* issue_rate */
719 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
720 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
721 "16", /* function_align. */
722 "4", /* jump_align. */
723 "8", /* loop_align. */
724 2, /* int_reassoc_width. */
725 4, /* fp_reassoc_width. */
726 1, /* vec_reassoc_width. */
727 2, /* min_div_recip_mul_sf. */
728 2, /* min_div_recip_mul_df. */
729 0, /* max_case_values. */
730 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
732 &generic_prefetch_tune
733 };
734
735 static const struct tune_params cortexa53_tunings =
736 {
737 &cortexa53_extra_costs,
738 &generic_addrcost_table,
739 &cortexa53_regmove_cost,
740 &generic_vector_cost,
741 &generic_branch_cost,
742 &generic_approx_modes,
743 SVE_NOT_IMPLEMENTED, /* sve_width */
744 4, /* memmov_cost */
745 2, /* issue_rate */
746 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
747 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
748 "16", /* function_align. */
749 "4", /* jump_align. */
750 "8", /* loop_align. */
751 2, /* int_reassoc_width. */
752 4, /* fp_reassoc_width. */
753 1, /* vec_reassoc_width. */
754 2, /* min_div_recip_mul_sf. */
755 2, /* min_div_recip_mul_df. */
756 0, /* max_case_values. */
757 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
758 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
759 &generic_prefetch_tune
760 };
761
762 static const struct tune_params cortexa57_tunings =
763 {
764 &cortexa57_extra_costs,
765 &generic_addrcost_table,
766 &cortexa57_regmove_cost,
767 &cortexa57_vector_cost,
768 &generic_branch_cost,
769 &generic_approx_modes,
770 SVE_NOT_IMPLEMENTED, /* sve_width */
771 4, /* memmov_cost */
772 3, /* issue_rate */
773 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
774 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
775 "16", /* function_align. */
776 "4", /* jump_align. */
777 "8", /* loop_align. */
778 2, /* int_reassoc_width. */
779 4, /* fp_reassoc_width. */
780 1, /* vec_reassoc_width. */
781 2, /* min_div_recip_mul_sf. */
782 2, /* min_div_recip_mul_df. */
783 0, /* max_case_values. */
784 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
785 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
786 &generic_prefetch_tune
787 };
788
789 static const struct tune_params cortexa72_tunings =
790 {
791 &cortexa57_extra_costs,
792 &generic_addrcost_table,
793 &cortexa57_regmove_cost,
794 &cortexa57_vector_cost,
795 &generic_branch_cost,
796 &generic_approx_modes,
797 SVE_NOT_IMPLEMENTED, /* sve_width */
798 4, /* memmov_cost */
799 3, /* issue_rate */
800 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
801 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
802 "16", /* function_align. */
803 "4", /* jump_align. */
804 "8", /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
813 &generic_prefetch_tune
814 };
815
816 static const struct tune_params cortexa73_tunings =
817 {
818 &cortexa57_extra_costs,
819 &generic_addrcost_table,
820 &cortexa57_regmove_cost,
821 &cortexa57_vector_cost,
822 &generic_branch_cost,
823 &generic_approx_modes,
824 SVE_NOT_IMPLEMENTED, /* sve_width */
825 4, /* memmov_cost. */
826 2, /* issue_rate. */
827 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
828 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
829 "16", /* function_align. */
830 "4", /* jump_align. */
831 "8", /* loop_align. */
832 2, /* int_reassoc_width. */
833 4, /* fp_reassoc_width. */
834 1, /* vec_reassoc_width. */
835 2, /* min_div_recip_mul_sf. */
836 2, /* min_div_recip_mul_df. */
837 0, /* max_case_values. */
838 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
839 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
840 &generic_prefetch_tune
841 };
842
843
844
845 static const struct tune_params exynosm1_tunings =
846 {
847 &exynosm1_extra_costs,
848 &exynosm1_addrcost_table,
849 &exynosm1_regmove_cost,
850 &exynosm1_vector_cost,
851 &generic_branch_cost,
852 &exynosm1_approx_modes,
853 SVE_NOT_IMPLEMENTED, /* sve_width */
854 4, /* memmov_cost */
855 3, /* issue_rate */
856 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
857 "4", /* function_align. */
858 "4", /* jump_align. */
859 "4", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 48, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
868 &exynosm1_prefetch_tune
869 };
870
871 static const struct tune_params thunderxt88_tunings =
872 {
873 &thunderx_extra_costs,
874 &generic_addrcost_table,
875 &thunderx_regmove_cost,
876 &thunderx_vector_cost,
877 &generic_branch_cost,
878 &generic_approx_modes,
879 SVE_NOT_IMPLEMENTED, /* sve_width */
880 6, /* memmov_cost */
881 2, /* issue_rate */
882 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
883 "8", /* function_align. */
884 "8", /* jump_align. */
885 "8", /* loop_align. */
886 2, /* int_reassoc_width. */
887 4, /* fp_reassoc_width. */
888 1, /* vec_reassoc_width. */
889 2, /* min_div_recip_mul_sf. */
890 2, /* min_div_recip_mul_df. */
891 0, /* max_case_values. */
892 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
893 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
894 &thunderxt88_prefetch_tune
895 };
896
897 static const struct tune_params thunderx_tunings =
898 {
899 &thunderx_extra_costs,
900 &generic_addrcost_table,
901 &thunderx_regmove_cost,
902 &thunderx_vector_cost,
903 &generic_branch_cost,
904 &generic_approx_modes,
905 SVE_NOT_IMPLEMENTED, /* sve_width */
906 6, /* memmov_cost */
907 2, /* issue_rate */
908 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
909 "8", /* function_align. */
910 "8", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
920 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
921 &thunderx_prefetch_tune
922 };
923
924 static const struct tune_params tsv110_tunings =
925 {
926 &tsv110_extra_costs,
927 &tsv110_addrcost_table,
928 &tsv110_regmove_cost,
929 &tsv110_vector_cost,
930 &generic_branch_cost,
931 &generic_approx_modes,
932 SVE_NOT_IMPLEMENTED, /* sve_width */
933 4, /* memmov_cost */
934 4, /* issue_rate */
935 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
936 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
937 "16", /* function_align. */
938 "4", /* jump_align. */
939 "8", /* loop_align. */
940 2, /* int_reassoc_width. */
941 4, /* fp_reassoc_width. */
942 1, /* vec_reassoc_width. */
943 2, /* min_div_recip_mul_sf. */
944 2, /* min_div_recip_mul_df. */
945 0, /* max_case_values. */
946 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
947 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
948 &tsv110_prefetch_tune
949 };
950
951 static const struct tune_params xgene1_tunings =
952 {
953 &xgene1_extra_costs,
954 &xgene1_addrcost_table,
955 &xgene1_regmove_cost,
956 &xgene1_vector_cost,
957 &generic_branch_cost,
958 &xgene1_approx_modes,
959 SVE_NOT_IMPLEMENTED, /* sve_width */
960 6, /* memmov_cost */
961 4, /* issue_rate */
962 AARCH64_FUSE_NOTHING, /* fusible_ops */
963 "16", /* function_align. */
964 "16", /* jump_align. */
965 "16", /* loop_align. */
966 2, /* int_reassoc_width. */
967 4, /* fp_reassoc_width. */
968 1, /* vec_reassoc_width. */
969 2, /* min_div_recip_mul_sf. */
970 2, /* min_div_recip_mul_df. */
971 17, /* max_case_values. */
972 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
973 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
974 &xgene1_prefetch_tune
975 };
976
977 static const struct tune_params emag_tunings =
978 {
979 &xgene1_extra_costs,
980 &xgene1_addrcost_table,
981 &xgene1_regmove_cost,
982 &xgene1_vector_cost,
983 &generic_branch_cost,
984 &xgene1_approx_modes,
985 SVE_NOT_IMPLEMENTED,
986 6, /* memmov_cost */
987 4, /* issue_rate */
988 AARCH64_FUSE_NOTHING, /* fusible_ops */
989 "16", /* function_align. */
990 "16", /* jump_align. */
991 "16", /* loop_align. */
992 2, /* int_reassoc_width. */
993 4, /* fp_reassoc_width. */
994 1, /* vec_reassoc_width. */
995 2, /* min_div_recip_mul_sf. */
996 2, /* min_div_recip_mul_df. */
997 17, /* max_case_values. */
998 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
999 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1000 &xgene1_prefetch_tune
1001 };
1002
1003 static const struct tune_params qdf24xx_tunings =
1004 {
1005 &qdf24xx_extra_costs,
1006 &qdf24xx_addrcost_table,
1007 &qdf24xx_regmove_cost,
1008 &qdf24xx_vector_cost,
1009 &generic_branch_cost,
1010 &generic_approx_modes,
1011 SVE_NOT_IMPLEMENTED, /* sve_width */
1012 4, /* memmov_cost */
1013 4, /* issue_rate */
1014 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1015    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1016 "16", /* function_align. */
1017 "8", /* jump_align. */
1018 "16", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1026 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1027 &qdf24xx_prefetch_tune
1028 };
1029
1030 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1031 for now. */
1032 static const struct tune_params saphira_tunings =
1033 {
1034 &generic_extra_costs,
1035 &generic_addrcost_table,
1036 &generic_regmove_cost,
1037 &generic_vector_cost,
1038 &generic_branch_cost,
1039 &generic_approx_modes,
1040 SVE_NOT_IMPLEMENTED, /* sve_width */
1041 4, /* memmov_cost */
1042 4, /* issue_rate */
1043 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1045 "16", /* function_align. */
1046 "8", /* jump_align. */
1047 "16", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1056 &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params thunderx2t99_tunings =
1060 {
1061 &thunderx2t99_extra_costs,
1062 &thunderx2t99_addrcost_table,
1063 &thunderx2t99_regmove_cost,
1064 &thunderx2t99_vector_cost,
1065 &generic_branch_cost,
1066 &generic_approx_modes,
1067 SVE_NOT_IMPLEMENTED, /* sve_width */
1068 4, /* memmov_cost. */
1069 4, /* issue_rate. */
1070 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1071 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1072 "16", /* function_align. */
1073 "8", /* jump_align. */
1074 "16", /* loop_align. */
1075 3, /* int_reassoc_width. */
1076 2, /* fp_reassoc_width. */
1077 2, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1083 &thunderx2t99_prefetch_tune
1084 };
1085
1086 /* Support for fine-grained override of the tuning structures. */
1087 struct aarch64_tuning_override_function
1088 {
1089 const char* name;
1090 void (*parse_override)(const char*, struct tune_params*);
1091 };
1092
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1096
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions[] =
1099 {
1100 { "fuse", aarch64_parse_fuse_string },
1101 { "tune", aarch64_parse_tune_string },
1102 { "sve_width", aarch64_parse_sve_width_string },
1103 { NULL, NULL }
1104 };
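/* Illustrative use (assumed command line, shown for exposition): the table
   above backs the -moverride option, so a string such as
   -moverride=sve_width=256 matches the "sve_width" entry and hands "256" to
   aarch64_parse_sve_width_string, which adjusts the active tune_params;
   "fuse" and "tune" components are dispatched in the same way.  */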
1105
1106 /* A processor implementing AArch64. */
1107 struct processor
1108 {
1109 const char *const name;
1110 enum aarch64_processor ident;
1111 enum aarch64_processor sched_core;
1112 enum aarch64_arch arch;
1113 unsigned architecture_version;
1114 const unsigned long flags;
1115 const struct tune_params *const tune;
1116 };
1117
1118 /* Architectures implementing AArch64. */
1119 static const struct processor all_architectures[] =
1120 {
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1125 };
1126
1127 /* Processor cores implementing AArch64. */
1128 static const struct processor all_cores[] =
1129 {
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1132 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1133 FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1136 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1137 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1138 };
1139
1140
1141 /* Target specification. These are populated by the -march, -mtune, -mcpu
1142 handling code or by target attributes. */
1143 static const struct processor *selected_arch;
1144 static const struct processor *selected_cpu;
1145 static const struct processor *selected_tune;
1146
1147 /* The current tuning set. */
1148 struct tune_params aarch64_tune_params = generic_tunings;
1149
1150 /* Table of machine attributes. */
1151 static const struct attribute_spec aarch64_attribute_table[] =
1152 {
1153 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154 affects_type_identity, handler, exclude } */
1155 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1156 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1157 };
1158
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1160
1161 /* An ISA extension in the co-processor and main instruction set space. */
1162 struct aarch64_option_extension
1163 {
1164 const char *const name;
1165 const unsigned long flags_on;
1166 const unsigned long flags_off;
1167 };
1168
1169 typedef enum aarch64_cond_code
1170 {
1171 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1172 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1173 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1174 }
1175 aarch64_cc;
1176
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
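/* The enumeration above follows the A64 condition-code encoding, in which a
   condition and its inverse differ only in the low bit, so flipping that bit
   is enough to invert a condition: for example,
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */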
1178
1179 struct aarch64_branch_protect_type
1180 {
1181 /* The type's name that the user passes to the branch-protection option
1182 string. */
1183 const char* name;
1184 /* Function to handle the protection type and set global variables.
1185 First argument is the string token corresponding with this type and the
1186 second argument is the next token in the option string.
1187 Return values:
1188      * AARCH64_PARSE_OK: Handling was successful.
1189      * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1190        caller should print an error.
1191      * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1192        prints its own error.  */
1193 enum aarch64_parse_opt_result (*handler)(char*, char*);
1194 /* A list of types that can follow this type in the option string. */
1195 const aarch64_branch_protect_type* subtypes;
1196 unsigned int num_subtypes;
1197 };
1198
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str, char* rest)
1201 {
1202 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1203 aarch64_enable_bti = 0;
1204 if (rest)
1205 {
1206 error ("unexpected %<%s%> after %<%s%>", rest, str);
1207 return AARCH64_PARSE_INVALID_FEATURE;
1208 }
1209 return AARCH64_PARSE_OK;
1210 }
1211
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str, char* rest)
1214 {
1215 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1216 aarch64_enable_bti = 1;
1217 if (rest)
1218 {
1219 error ("unexpected %<%s%> after %<%s%>", rest, str);
1220 return AARCH64_PARSE_INVALID_FEATURE;
1221 }
1222 return AARCH64_PARSE_OK;
1223 }
1224
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1227 char* rest ATTRIBUTE_UNUSED)
1228 {
1229 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1230 return AARCH64_PARSE_OK;
1231 }
1232
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1235 char* rest ATTRIBUTE_UNUSED)
1236 {
1237 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1238 return AARCH64_PARSE_OK;
1239 }
1240
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1243 char* rest ATTRIBUTE_UNUSED)
1244 {
1245 aarch64_enable_bti = 1;
1246 return AARCH64_PARSE_OK;
1247 }
1248
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1250 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1251 { NULL, NULL, NULL, 0 }
1252 };
1253
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1255 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1256 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1257 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1258 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1259 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1260 { NULL, NULL, NULL, 0 }
1261 };
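/* Illustrative sketch of how the tables above are used (the parser itself,
   aarch64_parse_branch_protection, is defined later in this file): for an
   option such as -mbranch-protection=pac-ret+leaf, "pac-ret" matches its
   entry and aarch64_handle_pac_ret_protection enables return-address signing
   for non-leaf functions; "leaf" is then looked up in
   aarch64_pac_ret_subtypes and aarch64_handle_pac_ret_leaf widens the scope
   to all functions.  "none" and "standard" accept no further tokens and
   report an error if one follows.  */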
1262
1263 /* The condition codes of the processor, and the inverse function. */
1264 static const char * const aarch64_condition_codes[] =
1265 {
1266 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1267 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1268 };
1269
1270 /* Generate code to enable conditional branches in functions over 1 MiB. */
1271 const char *
1272 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1273 const char * branch_format)
1274 {
1275 rtx_code_label * tmp_label = gen_label_rtx ();
1276 char label_buf[256];
1277 char buffer[128];
1278 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1279 CODE_LABEL_NUMBER (tmp_label));
1280 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1281 rtx dest_label = operands[pos_label];
1282 operands[pos_label] = tmp_label;
1283
1284 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1285 output_asm_insn (buffer, operands);
1286
1287 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1288 operands[pos_label] = dest_label;
1289 output_asm_insn (buffer, operands);
1290 return "";
1291 }
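/* For illustration (the exact operands come from the callers in the machine
   description): a conditional branch whose target lies beyond its limited
   range (e.g. +/-1 MiB for B.cond) is emitted by the function above as a
   short branch of the opposite sense around an unconditional branch:

	<branch_format> .Ltmp	// the caller supplies the inverted condition
	b	<original target>
   .Ltmp:

   Only the unconditional B, with its much larger range, then has to reach
   the real target.  */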
1292
1293 void
1294 aarch64_err_no_fpadvsimd (machine_mode mode)
1295 {
1296 if (TARGET_GENERAL_REGS_ONLY)
1297 if (FLOAT_MODE_P (mode))
1298 error ("%qs is incompatible with the use of floating-point types",
1299 "-mgeneral-regs-only");
1300 else
1301 error ("%qs is incompatible with the use of vector types",
1302 "-mgeneral-regs-only");
1303 else
1304 if (FLOAT_MODE_P (mode))
1305 error ("%qs feature modifier is incompatible with the use of"
1306 " floating-point types", "+nofp");
1307 else
1308 error ("%qs feature modifier is incompatible with the use of"
1309 " vector types", "+nofp");
1310 }
1311
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316 and GENERAL_REGS is lower than the memory cost (in this case the best class
1317    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1318 cost results in bad allocations with many redundant int<->FP moves which
1319 are expensive on various cores.
1320 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1322 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1323 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1324 The result of this is that it is no longer inefficient to have a higher
1325 memory move cost than the register move cost.
1326 */
1327
1328 static reg_class_t
1329 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1330 reg_class_t best_class)
1331 {
1332 machine_mode mode;
1333
1334 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1335 || !reg_class_subset_p (FP_REGS, allocno_class))
1336 return allocno_class;
1337
1338 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1339 || !reg_class_subset_p (FP_REGS, best_class))
1340 return best_class;
1341
1342 mode = PSEUDO_REGNO_MODE (regno);
1343 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1344 }
1345
1346 static unsigned int
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1348 {
1349 if (GET_MODE_UNIT_SIZE (mode) == 4)
1350 return aarch64_tune_params.min_div_recip_mul_sf;
1351 return aarch64_tune_params.min_div_recip_mul_df;
1352 }
1353
1354 /* Return the reassociation width of treeop OPC with mode MODE. */
1355 static int
1356 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1357 {
1358 if (VECTOR_MODE_P (mode))
1359 return aarch64_tune_params.vec_reassoc_width;
1360 if (INTEGRAL_MODE_P (mode))
1361 return aarch64_tune_params.int_reassoc_width;
1362 /* Avoid reassociating floating point addition so we emit more FMAs. */
1363 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1364 return aarch64_tune_params.fp_reassoc_width;
1365 return 1;
1366 }
1367
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1369 unsigned
1370 aarch64_dbx_register_number (unsigned regno)
1371 {
1372 if (GP_REGNUM_P (regno))
1373 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1374 else if (regno == SP_REGNUM)
1375 return AARCH64_DWARF_SP;
1376 else if (FP_REGNUM_P (regno))
1377 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1378 else if (PR_REGNUM_P (regno))
1379 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1380 else if (regno == VG_REGNUM)
1381 return AARCH64_DWARF_VG;
1382
1383 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384 equivalent DWARF register. */
1385 return DWARF_FRAME_REGISTERS;
1386 }
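/* Worked example: with the AArch64 DWARF numbering (AARCH64_DWARF_R0 == 0,
   AARCH64_DWARF_SP == 31, AARCH64_DWARF_V0 == 64), x19 maps to 19, sp to 31
   and v0 to 64, while a register with no DWARF equivalent (such as the
   condition flags) reports DWARF_FRAME_REGISTERS.  */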
1387
1388 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1389 static bool
1390 aarch64_advsimd_struct_mode_p (machine_mode mode)
1391 {
1392 return (TARGET_SIMD
1393 && (mode == OImode || mode == CImode || mode == XImode));
1394 }
1395
1396 /* Return true if MODE is an SVE predicate mode. */
1397 static bool
1398 aarch64_sve_pred_mode_p (machine_mode mode)
1399 {
1400 return (TARGET_SVE
1401 && (mode == VNx16BImode
1402 || mode == VNx8BImode
1403 || mode == VNx4BImode
1404 || mode == VNx2BImode));
1405 }
1406
1407 /* Three mutually-exclusive flags describing a vector or predicate type. */
1408 const unsigned int VEC_ADVSIMD = 1;
1409 const unsigned int VEC_SVE_DATA = 2;
1410 const unsigned int VEC_SVE_PRED = 4;
1411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1412 a structure of 2, 3 or 4 vectors. */
1413 const unsigned int VEC_STRUCT = 8;
1414 /* Useful combinations of the above. */
1415 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1416 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1417
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419 Ignore modes that are not supported by the current target. */
1420 static unsigned int
1421 aarch64_classify_vector_mode (machine_mode mode)
1422 {
1423 if (aarch64_advsimd_struct_mode_p (mode))
1424 return VEC_ADVSIMD | VEC_STRUCT;
1425
1426 if (aarch64_sve_pred_mode_p (mode))
1427 return VEC_SVE_PRED;
1428
1429 scalar_mode inner = GET_MODE_INNER (mode);
1430 if (VECTOR_MODE_P (mode)
1431 && (inner == QImode
1432 || inner == HImode
1433 || inner == HFmode
1434 || inner == SImode
1435 || inner == SFmode
1436 || inner == DImode
1437 || inner == DFmode))
1438 {
1439 if (TARGET_SVE)
1440 {
1441 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1442 return VEC_SVE_DATA;
1443 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1444 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1445 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1446 return VEC_SVE_DATA | VEC_STRUCT;
1447 }
1448
1449 /* This includes V1DF but not V1DI (which doesn't exist). */
1450 if (TARGET_SIMD
1451 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1452 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1453 return VEC_ADVSIMD;
1454 }
1455
1456 return 0;
1457 }
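/* For example (assuming the corresponding target features are enabled):
   V16QImode classifies as VEC_ADVSIMD, OImode (an Advanced SIMD structure
   of two vectors) as VEC_ADVSIMD | VEC_STRUCT, VNx4SImode as VEC_SVE_DATA,
   VNx4BImode as VEC_SVE_PRED, and a scalar mode such as DImode as 0.  */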
1458
1459 /* Return true if MODE is any of the data vector modes, including
1460 structure modes. */
1461 static bool
1462 aarch64_vector_data_mode_p (machine_mode mode)
1463 {
1464 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1465 }
1466
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468 or a structure of vectors. */
1469 static bool
1470 aarch64_sve_data_mode_p (machine_mode mode)
1471 {
1472 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1473 }
1474
1475 /* Implement target hook TARGET_ARRAY_MODE. */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1478 {
1479 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1480 && IN_RANGE (nelems, 2, 4))
1481 return mode_for_vector (GET_MODE_INNER (mode),
1482 GET_MODE_NUNITS (mode) * nelems);
1483
1484 return opt_machine_mode ();
1485 }
1486
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1488 static bool
1489 aarch64_array_mode_supported_p (machine_mode mode,
1490 unsigned HOST_WIDE_INT nelems)
1491 {
1492 if (TARGET_SIMD
1493 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1494 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1495 && (nelems >= 2 && nelems <= 4))
1496 return true;
1497
1498 return false;
1499 }
1500
1501 /* Return the SVE predicate mode to use for elements that have
1502 ELEM_NBYTES bytes, if such a mode exists. */
1503
1504 opt_machine_mode
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1506 {
1507 if (TARGET_SVE)
1508 {
1509 if (elem_nbytes == 1)
1510 return VNx16BImode;
1511 if (elem_nbytes == 2)
1512 return VNx8BImode;
1513 if (elem_nbytes == 4)
1514 return VNx4BImode;
1515 if (elem_nbytes == 8)
1516 return VNx2BImode;
1517 }
1518 return opt_machine_mode ();
1519 }
1520
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1522
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1525 {
1526 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1527 {
1528 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1529 machine_mode pred_mode;
1530 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1531 return pred_mode;
1532 }
1533
1534 return default_get_mask_mode (nunits, nbytes);
1535 }
1536
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1538 prefer to use the first arithmetic operand as the else value if
1539 the else value doesn't matter, since that exactly matches the SVE
1540 destructive merging form. For ternary operations we could either
1541 pick the first operand and use FMAD-like instructions or the last
1542 operand and use FMLA-like instructions; the latter seems more
1543 natural. */
1544
1545 static tree
1546 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1547 {
1548 return nops == 3 ? ops[2] : ops[0];
1549 }
1550
1551 /* Implement TARGET_HARD_REGNO_NREGS. */
1552
1553 static unsigned int
1554 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1555 {
1556 /* ??? Logically we should only need to provide a value when
1557 HARD_REGNO_MODE_OK says that the combination is valid,
1558 but at the moment we need to handle all modes. Just ignore
1559 any runtime parts for registers that can't store them. */
1560 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1561 switch (aarch64_regno_regclass (regno))
1562 {
1563 case FP_REGS:
1564 case FP_LO_REGS:
1565 if (aarch64_sve_data_mode_p (mode))
1566 return exact_div (GET_MODE_SIZE (mode),
1567 BYTES_PER_SVE_VECTOR).to_constant ();
1568 return CEIL (lowest_size, UNITS_PER_VREG);
1569 case PR_REGS:
1570 case PR_LO_REGS:
1571 case PR_HI_REGS:
1572 return 1;
1573 default:
1574 return CEIL (lowest_size, UNITS_PER_WORD);
1575 }
1576 gcc_unreachable ();
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1580
1581 static bool
1582 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1583 {
1584 if (GET_MODE_CLASS (mode) == MODE_CC)
1585 return regno == CC_REGNUM;
1586
1587 if (regno == VG_REGNUM)
1588 /* This must have the same size as _Unwind_Word. */
1589 return mode == DImode;
1590
1591 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1592 if (vec_flags & VEC_SVE_PRED)
1593 return PR_REGNUM_P (regno);
1594
1595 if (PR_REGNUM_P (regno))
1596 return 0;
1597
1598 if (regno == SP_REGNUM)
1599 /* The purpose of comparing with ptr_mode is to support the
1600 global register variable associated with the stack pointer
1601 register via the syntax of asm ("wsp") in ILP32. */
1602 return mode == Pmode || mode == ptr_mode;
1603
1604 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1605 return mode == Pmode;
1606
1607 if (GP_REGNUM_P (regno))
1608 {
1609 if (known_le (GET_MODE_SIZE (mode), 8))
1610 return true;
1611 else if (known_le (GET_MODE_SIZE (mode), 16))
1612 return (regno & 1) == 0;
1613 }
1614 else if (FP_REGNUM_P (regno))
1615 {
1616 if (vec_flags & VEC_STRUCT)
1617 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1618 else
1619 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1620 }
1621
1622 return false;
1623 }
1624
1625 /* Return true if this is a definition of a vectorized simd function. */
1626
1627 static bool
1628 aarch64_simd_decl_p (tree fndecl)
1629 {
1630 tree fntype;
1631
1632 if (fndecl == NULL)
1633 return false;
1634 fntype = TREE_TYPE (fndecl);
1635 if (fntype == NULL)
1636 return false;
1637
1638 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1639 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1640 return true;
1641
1642 return false;
1643 }
1644
1645 /* Return the mode a register save/restore should use. DImode for integer
1646 registers, DFmode for FP registers in non-SIMD functions (they only save
1647 the bottom half of a 128 bit register), or TFmode for FP registers in
1648 SIMD functions. */
1649
1650 static machine_mode
1651 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1652 {
1653 return GP_REGNUM_P (regno)
1654 ? E_DImode
1655 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1656 }
1657
1658 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1659 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1660 clobbers the top 64 bits when restoring the bottom 64 bits. */
1661
1662 static bool
1663 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1664 {
1665 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1666 }
1667
1668 /* Implement REGMODE_NATURAL_SIZE. */
1669 poly_uint64
1670 aarch64_regmode_natural_size (machine_mode mode)
1671 {
1672 /* The natural size for SVE data modes is one SVE data vector,
1673 and similarly for predicates. We can't independently modify
1674 anything smaller than that. */
1675 /* ??? For now, only do this for variable-width SVE registers.
1676 Doing it for constant-sized registers breaks lower-subreg.c. */
1677 /* ??? And once that's fixed, we should probably have similar
1678 code for Advanced SIMD. */
1679 if (!aarch64_sve_vg.is_constant ())
1680 {
1681 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1682 if (vec_flags & VEC_SVE_PRED)
1683 return BYTES_PER_SVE_PRED;
1684 if (vec_flags & VEC_SVE_DATA)
1685 return BYTES_PER_SVE_VECTOR;
1686 }
1687 return UNITS_PER_WORD;
1688 }
1689
1690 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1691 machine_mode
1692 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1693 machine_mode mode)
1694 {
1695 /* The predicate mode determines which bits are significant and
1696 which are "don't care". Decreasing the number of lanes would
1697 lose data while increasing the number of lanes would make bits
1698 unnecessarily significant. */
1699 if (PR_REGNUM_P (regno))
1700 return mode;
1701 if (known_ge (GET_MODE_SIZE (mode), 4))
1702 return mode;
1703 else
1704 return SImode;
1705 }
1706
1707 /* Return true if I's bits are consecutive ones from the MSB. */
1708 bool
1709 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1710 {
1711 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1712 }
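/* Worked example for the test above: for i == 0xfffffffffffff000 (the top
   52 bits set), -i == 0x1000, exact_log2 (-i) == 12 and the function
   returns true.  If the set bits do not form one contiguous run starting
   at the MSB, -i is not a power of two, exact_log2 returns
   HOST_WIDE_INT_M1 (-1) and the function returns false.  */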
1713
1714 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1715 that strcpy from constants will be faster. */
1716
1717 static HOST_WIDE_INT
1718 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1719 {
1720 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1721 return MAX (align, BITS_PER_WORD);
1722 return align;
1723 }
1724
1725 /* Return true if calls to DECL should be treated as
1726    long-calls (i.e. called via a register).  */
1727 static bool
1728 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1729 {
1730 return false;
1731 }
1732
1733 /* Return true if calls to symbol-ref SYM should be treated as
1734    long-calls (i.e. called via a register).  */
1735 bool
1736 aarch64_is_long_call_p (rtx sym)
1737 {
1738 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1739 }
1740
1741 /* Return true if calls to symbol-ref SYM should not go through
1742 plt stubs. */
1743
1744 bool
1745 aarch64_is_noplt_call_p (rtx sym)
1746 {
1747 const_tree decl = SYMBOL_REF_DECL (sym);
1748
1749 if (flag_pic
1750 && decl
1751 && (!flag_plt
1752 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1753 && !targetm.binds_local_p (decl))
1754 return true;
1755
1756 return false;
1757 }
1758
1759 /* Return true if the offsets to a zero/sign-extract operation
1760 represent an expression that matches an extend operation. The
1761    operands represent the parameters from
1762
1763 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1764 bool
1765 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1766 rtx extract_imm)
1767 {
1768 HOST_WIDE_INT mult_val, extract_val;
1769
1770 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1771 return false;
1772
1773 mult_val = INTVAL (mult_imm);
1774 extract_val = INTVAL (extract_imm);
1775
1776 if (extract_val > 8
1777 && extract_val < GET_MODE_BITSIZE (mode)
1778 && exact_log2 (extract_val & ~7) > 0
1779 && (extract_val & 7) <= 4
1780 && mult_val == (1 << (extract_val & 7)))
1781 return true;
1782
1783 return false;
1784 }
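/* Worked example (values chosen purely for illustration): in DImode, with
   EXTRACT_IMM == 34 and MULT_IMM == 4, we have extract_val & ~7 == 32,
   exact_log2 (32) == 5 > 0, extract_val & 7 == 2 <= 4 and
   mult_val == 1 << 2, so the function above returns true; such an extract
   corresponds to a 32-bit extend combined with a left shift by 2.  Changing
   MULT_IMM to 8 violates the final test and the function returns false.  */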
1785
1786 /* Emit an insn that's a simple single-set. Both the operands must be
1787 known to be valid. */
1788 inline static rtx_insn *
1789 emit_set_insn (rtx x, rtx y)
1790 {
1791 return emit_insn (gen_rtx_SET (x, y));
1792 }
1793
1794 /* X and Y are two things to compare using CODE. Emit the compare insn and
1795 return the rtx for register 0 in the proper mode. */
1796 rtx
1797 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1798 {
1799 machine_mode mode = SELECT_CC_MODE (code, x, y);
1800 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1801
1802 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1803 return cc_reg;
1804 }
1805
1806 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1807
1808 static rtx
1809 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1810 machine_mode y_mode)
1811 {
1812 if (y_mode == E_QImode || y_mode == E_HImode)
1813 {
1814 if (CONST_INT_P (y))
1815 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1816 else
1817 {
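/* Zero-extend Y to SImode and compare it against X with the operands
swapped; CC_SWPmode records that the operands are swapped. */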
1818 rtx t, cc_reg;
1819 machine_mode cc_mode;
1820
1821 t = gen_rtx_ZERO_EXTEND (SImode, y);
1822 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1823 cc_mode = CC_SWPmode;
1824 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1825 emit_set_insn (cc_reg, t);
1826 return cc_reg;
1827 }
1828 }
1829
1830 return aarch64_gen_compare_reg (code, x, y);
1831 }
1832
1833 /* Build the SYMBOL_REF for __tls_get_addr. */
1834
1835 static GTY(()) rtx tls_get_addr_libfunc;
1836
1837 rtx
1838 aarch64_tls_get_addr (void)
1839 {
1840 if (!tls_get_addr_libfunc)
1841 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1842 return tls_get_addr_libfunc;
1843 }
1844
1845 /* Return the TLS model to use for ADDR. */
1846
1847 static enum tls_model
1848 tls_symbolic_operand_type (rtx addr)
1849 {
1850 enum tls_model tls_kind = TLS_MODEL_NONE;
1851 if (GET_CODE (addr) == CONST)
1852 {
1853 poly_int64 addend;
1854 rtx sym = strip_offset (addr, &addend);
1855 if (GET_CODE (sym) == SYMBOL_REF)
1856 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1857 }
1858 else if (GET_CODE (addr) == SYMBOL_REF)
1859 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1860
1861 return tls_kind;
1862 }
1863
1864 /* We allow lo_sum's in our legitimate addresses so that combine
1865 can take care of combining addresses where necessary, but for
1866 code generation purposes we generate the address as:
1867 
1868 RTL                                 Absolute
1869 tmp = hi (symbol_ref);              adrp x1, foo
1870 dest = lo_sum (tmp, symbol_ref);    add dest, x1, :lo_12:foo
1871                                     nop
1872 
1873 PIC                                 TLS
1874 adrp x1, :got:foo                   adrp tmp, :tlsgd:foo
1875 ldr x1, [:got_lo12:foo]             add dest, tmp, :tlsgd_lo12:foo
1876                                     bl __tls_get_addr
1877                                     nop
1878
1879 Load a TLS symbol, depending on the TLS mechanism and TLS access model.
1880
1881 Global Dynamic - Traditional TLS:
1882 adrp tmp, :tlsgd:imm
1883 add dest, tmp, #:tlsgd_lo12:imm
1884 bl __tls_get_addr
1885
1886 Global Dynamic - TLS Descriptors:
1887 adrp dest, :tlsdesc:imm
1888 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1889 add dest, dest, #:tlsdesc_lo12:imm
1890 blr tmp
1891 mrs tp, tpidr_el0
1892 add dest, dest, tp
1893
1894 Initial Exec:
1895 mrs tp, tpidr_el0
1896 adrp tmp, :gottprel:imm
1897 ldr dest, [tmp, #:gottprel_lo12:imm]
1898 add dest, dest, tp
1899
1900 Local Exec:
1901 mrs tp, tpidr_el0
1902 add t0, tp, #:tprel_hi12:imm, lsl #12
1903 add t0, t0, #:tprel_lo12_nc:imm
1904 */
1905
1906 static void
1907 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1908 enum aarch64_symbol_type type)
1909 {
1910 switch (type)
1911 {
1912 case SYMBOL_SMALL_ABSOLUTE:
1913 {
1914 /* In ILP32, the mode of dest can be either SImode or DImode. */
1915 rtx tmp_reg = dest;
1916 machine_mode mode = GET_MODE (dest);
1917
1918 gcc_assert (mode == Pmode || mode == ptr_mode);
1919
1920 if (can_create_pseudo_p ())
1921 tmp_reg = gen_reg_rtx (mode);
1922
1923 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1924 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1925 return;
1926 }
1927
1928 case SYMBOL_TINY_ABSOLUTE:
1929 emit_insn (gen_rtx_SET (dest, imm));
1930 return;
1931
1932 case SYMBOL_SMALL_GOT_28K:
1933 {
1934 machine_mode mode = GET_MODE (dest);
1935 rtx gp_rtx = pic_offset_table_rtx;
1936 rtx insn;
1937 rtx mem;
1938
1939 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1940 here before RTL expansion. Tree IVOPTS generates RTL patterns to
1941 compute rtx costs, in which case pic_offset_table_rtx is not yet
1942 initialized. In that case there is no need to generate the first adrp
1943 instruction, as the final cost of a global variable access is
1944 one instruction. */
1945 if (gp_rtx != NULL)
1946 {
1947 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1948 use the page base as the GOT base, the first page may be wasted; in
1949 the worst case there is only 28K of space for the GOT).
1950
1951 The instruction sequence generated to access a global variable
1952 is:
1953
1954 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1955
1956 Only one instruction is needed, but we must initialize
1957 pic_offset_table_rtx properly. We generate an initialization insn
1958 for every global access and let CSE remove the redundant ones.
1959
1960 The final instruction sequence for accessing multiple global
1961 variables will look like:
1962
1963 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1964
1965 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1966 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1967 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1968 ... */
1969
1970 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1971 crtl->uses_pic_offset_table = 1;
1972 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1973
1974 if (mode != GET_MODE (gp_rtx))
1975 gp_rtx = gen_lowpart (mode, gp_rtx);
1976
1977 }
1978
1979 if (mode == ptr_mode)
1980 {
1981 if (mode == DImode)
1982 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1983 else
1984 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1985
1986 mem = XVECEXP (SET_SRC (insn), 0, 0);
1987 }
1988 else
1989 {
1990 gcc_assert (mode == Pmode);
1991
1992 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1993 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1994 }
1995
1996 /* The operand is expected to be a MEM. Whenever the related insn
1997 pattern changes, the code above that extracts the MEM must be
1998 updated accordingly. */
1999 gcc_assert (GET_CODE (mem) == MEM);
2000 MEM_READONLY_P (mem) = 1;
2001 MEM_NOTRAP_P (mem) = 1;
2002 emit_insn (insn);
2003 return;
2004 }
2005
2006 case SYMBOL_SMALL_GOT_4G:
2007 {
2008 /* In ILP32, the mode of dest can be either SImode or DImode,
2009 while the got entry is always of SImode size. The mode of
2010 dest depends on how dest is used: if dest is assigned to a
2011 pointer (e.g. stored in memory), it has SImode; it may have
2012 DImode if dest is dereferenced to access memory.
2013 This is why we have to handle three different ldr_got_small
2014 patterns here (two patterns for ILP32). */
2015
2016 rtx insn;
2017 rtx mem;
2018 rtx tmp_reg = dest;
2019 machine_mode mode = GET_MODE (dest);
2020
2021 if (can_create_pseudo_p ())
2022 tmp_reg = gen_reg_rtx (mode);
2023
2024 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2025 if (mode == ptr_mode)
2026 {
2027 if (mode == DImode)
2028 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2029 else
2030 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2031
2032 mem = XVECEXP (SET_SRC (insn), 0, 0);
2033 }
2034 else
2035 {
2036 gcc_assert (mode == Pmode);
2037
2038 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2039 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2040 }
2041
2042 gcc_assert (GET_CODE (mem) == MEM);
2043 MEM_READONLY_P (mem) = 1;
2044 MEM_NOTRAP_P (mem) = 1;
2045 emit_insn (insn);
2046 return;
2047 }
2048
2049 case SYMBOL_SMALL_TLSGD:
2050 {
2051 rtx_insn *insns;
2052 machine_mode mode = GET_MODE (dest);
2053 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2054
2055 start_sequence ();
2056 if (TARGET_ILP32)
2057 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2058 else
2059 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2060 insns = get_insns ();
2061 end_sequence ();
2062
2063 RTL_CONST_CALL_P (insns) = 1;
2064 emit_libcall_block (insns, dest, result, imm);
2065 return;
2066 }
2067
2068 case SYMBOL_SMALL_TLSDESC:
2069 {
2070 machine_mode mode = GET_MODE (dest);
2071 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2072 rtx tp;
2073
2074 gcc_assert (mode == Pmode || mode == ptr_mode);
2075
2076 /* In ILP32, the got entry is always of SImode size. Unlike
2077 small GOT, the dest is fixed at reg 0. */
2078 if (TARGET_ILP32)
2079 emit_insn (gen_tlsdesc_small_si (imm));
2080 else
2081 emit_insn (gen_tlsdesc_small_di (imm));
2082 tp = aarch64_load_tp (NULL);
2083
2084 if (mode != Pmode)
2085 tp = gen_lowpart (mode, tp);
2086
2087 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2088 if (REG_P (dest))
2089 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2090 return;
2091 }
2092
2093 case SYMBOL_SMALL_TLSIE:
2094 {
2095 /* In ILP32, the mode of dest can be either SImode or DImode,
2096 while the got entry is always of SImode size. The mode of
2097 dest depends on how dest is used: if dest is assigned to a
2098 pointer (e.g. stored in memory), it has SImode; it may have
2099 DImode if dest is dereferenced to access memory.
2100 This is why we have to handle three different tlsie_small
2101 patterns here (two patterns for ILP32). */
2102 machine_mode mode = GET_MODE (dest);
2103 rtx tmp_reg = gen_reg_rtx (mode);
2104 rtx tp = aarch64_load_tp (NULL);
2105
2106 if (mode == ptr_mode)
2107 {
2108 if (mode == DImode)
2109 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2110 else
2111 {
2112 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2113 tp = gen_lowpart (mode, tp);
2114 }
2115 }
2116 else
2117 {
2118 gcc_assert (mode == Pmode);
2119 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2120 }
2121
2122 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2123 if (REG_P (dest))
2124 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2125 return;
2126 }
2127
2128 case SYMBOL_TLSLE12:
2129 case SYMBOL_TLSLE24:
2130 case SYMBOL_TLSLE32:
2131 case SYMBOL_TLSLE48:
2132 {
2133 machine_mode mode = GET_MODE (dest);
2134 rtx tp = aarch64_load_tp (NULL);
2135
2136 if (mode != Pmode)
2137 tp = gen_lowpart (mode, tp);
2138
2139 switch (type)
2140 {
2141 case SYMBOL_TLSLE12:
2142 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2143 (dest, tp, imm));
2144 break;
2145 case SYMBOL_TLSLE24:
2146 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2147 (dest, tp, imm));
2148 break;
2149 case SYMBOL_TLSLE32:
2150 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2151 (dest, imm));
2152 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2153 (dest, dest, tp));
2154 break;
2155 case SYMBOL_TLSLE48:
2156 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2157 (dest, imm));
2158 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2159 (dest, dest, tp));
2160 break;
2161 default:
2162 gcc_unreachable ();
2163 }
2164
2165 if (REG_P (dest))
2166 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2167 return;
2168 }
2169
2170 case SYMBOL_TINY_GOT:
2171 emit_insn (gen_ldr_got_tiny (dest, imm));
2172 return;
2173
2174 case SYMBOL_TINY_TLSIE:
2175 {
2176 machine_mode mode = GET_MODE (dest);
2177 rtx tp = aarch64_load_tp (NULL);
2178
2179 if (mode == ptr_mode)
2180 {
2181 if (mode == DImode)
2182 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2183 else
2184 {
2185 tp = gen_lowpart (mode, tp);
2186 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2187 }
2188 }
2189 else
2190 {
2191 gcc_assert (mode == Pmode);
2192 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2193 }
2194
2195 if (REG_P (dest))
2196 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2197 return;
2198 }
2199
2200 default:
2201 gcc_unreachable ();
2202 }
2203 }
2204
2205 /* Emit a move from SRC to DEST. Assume that the move expanders can
2206 handle all moves if !can_create_pseudo_p (). The distinction is
2207 important because, unlike emit_move_insn, the move expanders know
2208 how to force Pmode objects into the constant pool even when the
2209 constant pool address is not itself legitimate. */
2210 static rtx
2211 aarch64_emit_move (rtx dest, rtx src)
2212 {
2213 return (can_create_pseudo_p ()
2214 ? emit_move_insn (dest, src)
2215 : emit_move_insn_1 (dest, src));
2216 }
2217
2218 /* Apply UNOPTAB to OP and store the result in DEST. */
2219
2220 static void
2221 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2222 {
2223 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2224 if (dest != tmp)
2225 emit_move_insn (dest, tmp);
2226 }
2227
2228 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2229
2230 static void
2231 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2232 {
2233 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2234 OPTAB_DIRECT);
2235 if (dest != tmp)
2236 emit_move_insn (dest, tmp);
2237 }
2238
2239 /* Split a 128-bit move operation into two 64-bit move operations,
2240 taking care to handle partial overlap of register to register
2241 copies. Special cases are needed when moving between GP regs and
2242 FP regs. SRC can be a register, constant or memory; DST a register
2243 or memory. If either operand is memory it must not have any side
2244 effects. */
2245 void
2246 aarch64_split_128bit_move (rtx dst, rtx src)
2247 {
2248 rtx dst_lo, dst_hi;
2249 rtx src_lo, src_hi;
2250
2251 machine_mode mode = GET_MODE (dst);
2252
2253 gcc_assert (mode == TImode || mode == TFmode);
2254 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2255 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2256
2257 if (REG_P (dst) && REG_P (src))
2258 {
2259 int src_regno = REGNO (src);
2260 int dst_regno = REGNO (dst);
2261
2262 /* Handle FP <-> GP regs. */
2263 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2264 {
2265 src_lo = gen_lowpart (word_mode, src);
2266 src_hi = gen_highpart (word_mode, src);
2267
2268 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2269 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2270 return;
2271 }
2272 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2273 {
2274 dst_lo = gen_lowpart (word_mode, dst);
2275 dst_hi = gen_highpart (word_mode, dst);
2276
2277 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2278 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2279 return;
2280 }
2281 }
2282
2283 dst_lo = gen_lowpart (word_mode, dst);
2284 dst_hi = gen_highpart (word_mode, dst);
2285 src_lo = gen_lowpart (word_mode, src);
2286 src_hi = gen_highpart_mode (word_mode, mode, src);
2287
2288 /* At most one pairing may overlap. */
2289 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2290 {
2291 aarch64_emit_move (dst_hi, src_hi);
2292 aarch64_emit_move (dst_lo, src_lo);
2293 }
2294 else
2295 {
2296 aarch64_emit_move (dst_lo, src_lo);
2297 aarch64_emit_move (dst_hi, src_hi);
2298 }
2299 }
2300
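/* Return true if a 128-bit move from SRC to DST needs to be split into
two 64-bit moves; a move in which both DST and SRC are FP registers
can instead be done with a single instruction. */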
2301 bool
2302 aarch64_split_128bit_move_p (rtx dst, rtx src)
2303 {
2304 return (! REG_P (src)
2305 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2306 }
2307
2308 /* Split a complex SIMD combine. */
2309
2310 void
2311 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2312 {
2313 machine_mode src_mode = GET_MODE (src1);
2314 machine_mode dst_mode = GET_MODE (dst);
2315
2316 gcc_assert (VECTOR_MODE_P (dst_mode));
2317 gcc_assert (register_operand (dst, dst_mode)
2318 && register_operand (src1, src_mode)
2319 && register_operand (src2, src_mode));
2320
2321 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2322 return;
2323 }
2324
2325 /* Split a complex SIMD move. */
2326
2327 void
2328 aarch64_split_simd_move (rtx dst, rtx src)
2329 {
2330 machine_mode src_mode = GET_MODE (src);
2331 machine_mode dst_mode = GET_MODE (dst);
2332
2333 gcc_assert (VECTOR_MODE_P (dst_mode));
2334
2335 if (REG_P (dst) && REG_P (src))
2336 {
2337 gcc_assert (VECTOR_MODE_P (src_mode));
2338 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2339 }
2340 }
2341
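/* Return true if the constant X, of mode XMODE, is equal to the
constant Y, of mode YMODE, zero-extended to XMODE. */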
2342 bool
2343 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2344 machine_mode ymode, rtx y)
2345 {
2346 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2347 gcc_assert (r != NULL);
2348 return rtx_equal_p (x, r);
2349 }
2350
2351
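/* Return a register that holds VALUE, which has mode MODE. Use a fresh
pseudo if we can still create one, otherwise copy VALUE into the
existing register X. */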
2352 static rtx
2353 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2354 {
2355 if (can_create_pseudo_p ())
2356 return force_reg (mode, value);
2357 else
2358 {
2359 gcc_assert (x);
2360 aarch64_emit_move (x, value);
2361 return x;
2362 }
2363 }
2364
2365 /* Return true if we can move VALUE into a register using a single
2366 CNT[BHWD] instruction. */
2367
2368 static bool
2369 aarch64_sve_cnt_immediate_p (poly_int64 value)
2370 {
2371 HOST_WIDE_INT factor = value.coeffs[0];
2372 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2373 return (value.coeffs[1] == factor
2374 && IN_RANGE (factor, 2, 16 * 16)
2375 && (factor & 1) == 0
2376 && factor <= 16 * (factor & -factor));
2377 }
2378
2379 /* Likewise for rtx X. */
2380
2381 bool
2382 aarch64_sve_cnt_immediate_p (rtx x)
2383 {
2384 poly_int64 value;
2385 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2386 }
2387
2388 /* Return the asm string for an instruction with a CNT-like vector size
2389 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2390 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2391 first part of the operands template (the part that comes before the
2392 vector size itself). FACTOR is the number of quadwords.
2393 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2394 If it is zero, we can use any element size. */
2395
2396 static char *
2397 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2398 unsigned int factor,
2399 unsigned int nelts_per_vq)
2400 {
2401 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2402
2403 if (nelts_per_vq == 0)
2404 /* There is some overlap in the ranges of the four CNT instructions.
2405 Here we always use the smallest possible element size, so that the
2406 multiplier is 1 wherever possible. */
2407 nelts_per_vq = factor & -factor;
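/* Pick the size suffix from the number of elements per quadword:
2 -> d, 4 -> w, 8 -> h, 16 or more -> b. */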
2408 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2409 gcc_assert (IN_RANGE (shift, 1, 4));
2410 char suffix = "dwhb"[shift - 1];
2411
2412 factor >>= shift;
2413 unsigned int written;
2414 if (factor == 1)
2415 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2416 prefix, suffix, operands);
2417 else
2418 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2419 prefix, suffix, operands, factor);
2420 gcc_assert (written < sizeof (buffer));
2421 return buffer;
2422 }
2423
2424 /* Return the asm string for an instruction with a CNT-like vector size
2425 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2426 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2427 first part of the operands template (the part that comes before the
2428 vector size itself). X is the value of the vector size operand,
2429 as a polynomial integer rtx. */
2430
2431 char *
2432 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2433 rtx x)
2434 {
2435 poly_int64 value = rtx_to_poly_int64 (x);
2436 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2437 return aarch64_output_sve_cnt_immediate (prefix, operands,
2438 value.coeffs[1], 0);
2439 }
2440
2441 /* Return true if we can add VALUE to a register using a single ADDVL
2442 or ADDPL instruction. */
2443
2444 static bool
2445 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2446 {
2447 HOST_WIDE_INT factor = value.coeffs[0];
2448 if (factor == 0 || value.coeffs[1] != factor)
2449 return false;
2450 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2451 and a value of 16 is one vector width. */
2452 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2453 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2454 }
2455
2456 /* Likewise for rtx X. */
2457
2458 bool
2459 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2460 {
2461 poly_int64 value;
2462 return (poly_int_rtx_p (x, &value)
2463 && aarch64_sve_addvl_addpl_immediate_p (value));
2464 }
2465
2466 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2467 and storing the result in operand 0. */
2468
2469 char *
2470 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2471 {
2472 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2473 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2474 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2475
2476 /* Use INC or DEC if possible. */
2477 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2478 {
2479 if (aarch64_sve_cnt_immediate_p (offset_value))
2480 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2481 offset_value.coeffs[1], 0);
2482 if (aarch64_sve_cnt_immediate_p (-offset_value))
2483 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2484 -offset_value.coeffs[1], 0);
2485 }
2486
2487 int factor = offset_value.coeffs[1];
2488 if ((factor & 15) == 0)
2489 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2490 else
2491 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2492 return buffer;
2493 }
2494
2495 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2496 instruction. If it is, store the number of elements in each vector
2497 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2498 factor in *FACTOR_OUT (if nonnull). */
2499
2500 bool
2501 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2502 unsigned int *nelts_per_vq_out)
2503 {
2504 rtx elt;
2505 poly_int64 value;
2506
2507 if (!const_vec_duplicate_p (x, &elt)
2508 || !poly_int_rtx_p (elt, &value))
2509 return false;
2510
2511 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2512 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2513 /* There's no vector INCB. */
2514 return false;
2515
2516 HOST_WIDE_INT factor = value.coeffs[0];
2517 if (value.coeffs[1] != factor)
2518 return false;
2519
2520 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2521 if ((factor % nelts_per_vq) != 0
2522 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2523 return false;
2524
2525 if (factor_out)
2526 *factor_out = factor;
2527 if (nelts_per_vq_out)
2528 *nelts_per_vq_out = nelts_per_vq;
2529 return true;
2530 }
2531
2532 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2533 instruction. */
2534
2535 bool
2536 aarch64_sve_inc_dec_immediate_p (rtx x)
2537 {
2538 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2539 }
2540
2541 /* Return the asm template for an SVE vector INC or DEC instruction.
2542 OPERANDS gives the operands before the vector count and X is the
2543 value of the vector count operand itself. */
2544
2545 char *
2546 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2547 {
2548 int factor;
2549 unsigned int nelts_per_vq;
2550 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2551 gcc_unreachable ();
2552 if (factor < 0)
2553 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2554 nelts_per_vq);
2555 else
2556 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2557 nelts_per_vq);
2558 }
2559
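/* Return the number of instructions required to load immediate IMM,
which has mode MODE, into DEST, and emit those instructions if
GENERATE is true. */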
2560 static int
2561 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2562 scalar_int_mode mode)
2563 {
2564 int i;
2565 unsigned HOST_WIDE_INT val, val2, mask;
2566 int one_match, zero_match;
2567 int num_insns;
2568
2569 val = INTVAL (imm);
2570
2571 if (aarch64_move_imm (val, mode))
2572 {
2573 if (generate)
2574 emit_insn (gen_rtx_SET (dest, imm));
2575 return 1;
2576 }
2577
2578 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2579 (with XXXX non-zero). In that case check to see if the move can be done in
2580 a smaller mode. */
2581 val2 = val & 0xffffffff;
2582 if (mode == DImode
2583 && aarch64_move_imm (val2, SImode)
2584 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2585 {
2586 if (generate)
2587 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2588
2589 /* Check whether we have to emit a second instruction by seeing
2590 whether any of the upper 32 bits of the original DImode value are set. */
2591 if (val == val2)
2592 return 1;
2593
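/* Exactly one of the two upper 16-bit halves is nonzero; insert that
half with a second instruction. */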
2594 i = (val >> 48) ? 48 : 32;
2595
2596 if (generate)
2597 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2598 GEN_INT ((val >> i) & 0xffff)));
2599
2600 return 2;
2601 }
2602
2603 if ((val >> 32) == 0 || mode == SImode)
2604 {
2605 if (generate)
2606 {
2607 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2608 if (mode == SImode)
2609 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2610 GEN_INT ((val >> 16) & 0xffff)));
2611 else
2612 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2613 GEN_INT ((val >> 16) & 0xffff)));
2614 }
2615 return 2;
2616 }
2617
2618 /* Remaining cases are all for DImode. */
2619
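/* Count how many of the four 16-bit halves of VAL are all zeros and
how many are all ones. */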
2620 mask = 0xffff;
2621 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2622 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2623 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2624 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2625
2626 if (zero_match != 2 && one_match != 2)
2627 {
2628 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2629 For a 64-bit bitmask try whether changing 16 bits to all ones or
2630 zeroes creates a valid bitmask. To check any repeated bitmask,
2631 try using 16 bits from the other 32-bit half of val. */
2632
2633 for (i = 0; i < 64; i += 16, mask <<= 16)
2634 {
2635 val2 = val & ~mask;
2636 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2637 break;
2638 val2 = val | mask;
2639 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2640 break;
2641 val2 = val2 & ~mask;
2642 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2643 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2644 break;
2645 }
2646 if (i != 64)
2647 {
2648 if (generate)
2649 {
2650 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2651 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2652 GEN_INT ((val >> i) & 0xffff)));
2653 }
2654 return 2;
2655 }
2656 }
2657
2658 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2659 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2660 otherwise skip zero bits. */
2661
2662 num_insns = 1;
2663 mask = 0xffff;
2664 val2 = one_match > zero_match ? ~val : val;
2665 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2666
2667 if (generate)
2668 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2669 ? (val | ~(mask << i))
2670 : (val & (mask << i)))));
2671 for (i += 16; i < 64; i += 16)
2672 {
2673 if ((val2 & (mask << i)) == 0)
2674 continue;
2675 if (generate)
2676 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2677 GEN_INT ((val >> i) & 0xffff)));
2678 num_insns++;
2679 }
2680
2681 return num_insns;
2682 }
2683
2684 /* Return whether imm is a 128-bit immediate which is simple enough to
2685 expand inline. */
2686 bool
2687 aarch64_mov128_immediate (rtx imm)
2688 {
2689 if (GET_CODE (imm) == CONST_INT)
2690 return true;
2691
2692 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2693
2694 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2695 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2696
2697 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2698 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2699 }
2700
2701
2702 /* Return the number of temporary registers that aarch64_add_offset_1
2703 would need to add OFFSET to a register. */
2704
2705 static unsigned int
2706 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2707 {
2708 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2709 }
2710
2711 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2712 a non-polynomial OFFSET. MODE is the mode of the addition.
2713 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2714 be set and CFA adjustments added to the generated instructions.
2715
2716 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2717 temporary if register allocation is already complete. This temporary
2718 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2719 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2720 the immediate again.
2721
2722 Since this function may be used to adjust the stack pointer, we must
2723 ensure that it cannot cause transient stack deallocation (for example
2724 by first incrementing SP and then decrementing when adjusting by a
2725 large immediate). */
2726
2727 static void
2728 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2729 rtx src, HOST_WIDE_INT offset, rtx temp1,
2730 bool frame_related_p, bool emit_move_imm)
2731 {
2732 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2733 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2734
2735 HOST_WIDE_INT moffset = abs_hwi (offset);
2736 rtx_insn *insn;
2737
2738 if (!moffset)
2739 {
2740 if (!rtx_equal_p (dest, src))
2741 {
2742 insn = emit_insn (gen_rtx_SET (dest, src));
2743 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2744 }
2745 return;
2746 }
2747
2748 /* Single instruction adjustment. */
2749 if (aarch64_uimm12_shift (moffset))
2750 {
2751 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2752 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2753 return;
2754 }
2755
2756 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2757 and either:
2758
2759 a) the offset cannot be loaded by a 16-bit move or
2760 b) there is no spare register into which we can move it. */
2761 if (moffset < 0x1000000
2762 && ((!temp1 && !can_create_pseudo_p ())
2763 || !aarch64_move_imm (moffset, mode)))
2764 {
2765 HOST_WIDE_INT low_off = moffset & 0xfff;
2766
2767 low_off = offset < 0 ? -low_off : low_off;
2768 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2769 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2770 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2771 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2772 return;
2773 }
2774
2775 /* Emit a move immediate if required and an addition/subtraction. */
2776 if (emit_move_imm)
2777 {
2778 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2779 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2780 }
2781 insn = emit_insn (offset < 0
2782 ? gen_sub3_insn (dest, src, temp1)
2783 : gen_add3_insn (dest, src, temp1));
2784 if (frame_related_p)
2785 {
2786 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2787 rtx adj = plus_constant (mode, src, offset);
2788 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2789 }
2790 }
2791
2792 /* Return the number of temporary registers that aarch64_add_offset
2793 would need to move OFFSET into a register or add OFFSET to a register;
2794 ADD_P is true if we want the latter rather than the former. */
2795
2796 static unsigned int
2797 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2798 {
2799 /* This follows the same structure as aarch64_add_offset. */
2800 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2801 return 0;
2802
2803 unsigned int count = 0;
2804 HOST_WIDE_INT factor = offset.coeffs[1];
2805 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2806 poly_int64 poly_offset (factor, factor);
2807 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2808 /* Need one register for the ADDVL/ADDPL result. */
2809 count += 1;
2810 else if (factor != 0)
2811 {
2812 factor = abs (factor);
2813 if (factor > 16 * (factor & -factor))
2814 /* Need one register for the CNT result and one for the multiplication
2815 factor. If necessary, the second temporary can be reused for the
2816 constant part of the offset. */
2817 return 2;
2818 /* Need one register for the CNT result (which might then
2819 be shifted). */
2820 count += 1;
2821 }
2822 return count + aarch64_add_offset_1_temporaries (constant);
2823 }
2824
2825 /* If X can be represented as a poly_int64, return the number
2826 of temporaries that are required to add it to a register.
2827 Return -1 otherwise. */
2828
2829 int
2830 aarch64_add_offset_temporaries (rtx x)
2831 {
2832 poly_int64 offset;
2833 if (!poly_int_rtx_p (x, &offset))
2834 return -1;
2835 return aarch64_offset_temporaries (true, offset);
2836 }
2837
2838 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2839 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2840 be set and CFA adjustments added to the generated instructions.
2841
2842 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2843 temporary if register allocation is already complete. This temporary
2844 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2845 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2846 false to avoid emitting the immediate again.
2847
2848 TEMP2, if nonnull, is a second temporary register that doesn't
2849 overlap either DEST or SRC.
2850
2851 Since this function may be used to adjust the stack pointer, we must
2852 ensure that it cannot cause transient stack deallocation (for example
2853 by first incrementing SP and then decrementing when adjusting by a
2854 large immediate). */
2855
2856 static void
2857 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2858 poly_int64 offset, rtx temp1, rtx temp2,
2859 bool frame_related_p, bool emit_move_imm = true)
2860 {
2861 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2862 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2863 gcc_assert (temp1 == NULL_RTX
2864 || !frame_related_p
2865 || !reg_overlap_mentioned_p (temp1, dest));
2866 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2867
2868 /* Try using ADDVL or ADDPL to add the whole value. */
2869 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2870 {
2871 rtx offset_rtx = gen_int_mode (offset, mode);
2872 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2873 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2874 return;
2875 }
2876
2877 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2878 SVE vector register, over and above the minimum size of 128 bits.
2879 This is equivalent to half the value returned by CNTD with a
2880 vector shape of ALL. */
2881 HOST_WIDE_INT factor = offset.coeffs[1];
2882 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2883
2884 /* Try using ADDVL or ADDPL to add the VG-based part. */
2885 poly_int64 poly_offset (factor, factor);
2886 if (src != const0_rtx
2887 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2888 {
2889 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2890 if (frame_related_p)
2891 {
2892 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2893 RTX_FRAME_RELATED_P (insn) = true;
2894 src = dest;
2895 }
2896 else
2897 {
2898 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2899 src = aarch64_force_temporary (mode, temp1, addr);
2900 temp1 = temp2;
2901 temp2 = NULL_RTX;
2902 }
2903 }
2904 /* Otherwise use a CNT-based sequence. */
2905 else if (factor != 0)
2906 {
2907 /* Use a subtraction if we have a negative factor. */
2908 rtx_code code = PLUS;
2909 if (factor < 0)
2910 {
2911 factor = -factor;
2912 code = MINUS;
2913 }
2914
2915 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2916 into the multiplication. */
2917 rtx val;
2918 int shift = 0;
2919 if (factor & 1)
2920 /* Use a right shift by 1. */
2921 shift = -1;
2922 else
2923 factor /= 2;
2924 HOST_WIDE_INT low_bit = factor & -factor;
2925 if (factor <= 16 * low_bit)
2926 {
2927 if (factor > 16 * 8)
2928 {
2929 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2930 the value with the minimum multiplier and shift it into
2931 position. */
2932 int extra_shift = exact_log2 (low_bit);
2933 shift += extra_shift;
2934 factor >>= extra_shift;
2935 }
2936 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2937 }
2938 else
2939 {
2940 /* Use CNTD, then multiply it by FACTOR. */
2941 val = gen_int_mode (poly_int64 (2, 2), mode);
2942 val = aarch64_force_temporary (mode, temp1, val);
2943
2944 /* Go back to using a negative multiplication factor if we have
2945 no register from which to subtract. */
2946 if (code == MINUS && src == const0_rtx)
2947 {
2948 factor = -factor;
2949 code = PLUS;
2950 }
2951 rtx coeff1 = gen_int_mode (factor, mode);
2952 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2953 val = gen_rtx_MULT (mode, val, coeff1);
2954 }
2955
2956 if (shift > 0)
2957 {
2958 /* Multiply by 1 << SHIFT. */
2959 val = aarch64_force_temporary (mode, temp1, val);
2960 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2961 }
2962 else if (shift == -1)
2963 {
2964 /* Divide by 2. */
2965 val = aarch64_force_temporary (mode, temp1, val);
2966 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2967 }
2968
2969 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2970 if (src != const0_rtx)
2971 {
2972 val = aarch64_force_temporary (mode, temp1, val);
2973 val = gen_rtx_fmt_ee (code, mode, src, val);
2974 }
2975 else if (code == MINUS)
2976 {
2977 val = aarch64_force_temporary (mode, temp1, val);
2978 val = gen_rtx_NEG (mode, val);
2979 }
2980
2981 if (constant == 0 || frame_related_p)
2982 {
2983 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2984 if (frame_related_p)
2985 {
2986 RTX_FRAME_RELATED_P (insn) = true;
2987 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2988 gen_rtx_SET (dest, plus_constant (Pmode, src,
2989 poly_offset)));
2990 }
2991 src = dest;
2992 if (constant == 0)
2993 return;
2994 }
2995 else
2996 {
2997 src = aarch64_force_temporary (mode, temp1, val);
2998 temp1 = temp2;
2999 temp2 = NULL_RTX;
3000 }
3001
3002 emit_move_imm = true;
3003 }
3004
3005 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3006 frame_related_p, emit_move_imm);
3007 }
3008
3009 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3010 than a poly_int64. */
3011
3012 void
3013 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3014 rtx offset_rtx, rtx temp1, rtx temp2)
3015 {
3016 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3017 temp1, temp2, false);
3018 }
3019
3020 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3021 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3022 if TEMP1 already contains abs (DELTA). */
3023
3024 static inline void
3025 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3026 {
3027 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3028 temp1, temp2, true, emit_move_imm);
3029 }
3030
3031 /* Subtract DELTA from the stack pointer, marking the instructions
3032 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3033 if nonnull. */
3034
3035 static inline void
3036 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3037 bool emit_move_imm = true)
3038 {
3039 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3040 temp1, temp2, frame_related_p, emit_move_imm);
3041 }
3042
3043 /* Set DEST to (vec_series BASE STEP). */
3044
3045 static void
3046 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3047 {
3048 machine_mode mode = GET_MODE (dest);
3049 scalar_mode inner = GET_MODE_INNER (mode);
3050
3051 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3052 if (!aarch64_sve_index_immediate_p (base))
3053 base = force_reg (inner, base);
3054 if (!aarch64_sve_index_immediate_p (step))
3055 step = force_reg (inner, step);
3056
3057 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3058 }
3059
3060 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3061 integer of mode SRC_MODE. Return true on success. */
3062
3063 static bool
3064 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3065 rtx src)
3066 {
3067 /* If the constant is smaller than 128 bits, we can do the move
3068 using a vector of SRC_MODEs. */
3069 if (src_mode != TImode)
3070 {
3071 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3072 GET_MODE_SIZE (src_mode));
3073 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3074 emit_move_insn (gen_lowpart (dup_mode, dest),
3075 gen_const_vec_duplicate (dup_mode, src));
3076 return true;
3077 }
3078
3079 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3080 src = force_const_mem (src_mode, src);
3081 if (!src)
3082 return false;
3083
3084 /* Make sure that the address is legitimate. */
3085 if (!aarch64_sve_ld1r_operand_p (src))
3086 {
3087 rtx addr = force_reg (Pmode, XEXP (src, 0));
3088 src = replace_equiv_address (src, addr);
3089 }
3090
3091 machine_mode mode = GET_MODE (dest);
3092 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3093 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3094 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3095 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3096 emit_insn (gen_rtx_SET (dest, src));
3097 return true;
3098 }
3099
3100 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3101 isn't a simple duplicate or series. */
3102
3103 static void
3104 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3105 {
3106 machine_mode mode = GET_MODE (src);
3107 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3108 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3109 gcc_assert (npatterns > 1);
3110
3111 if (nelts_per_pattern == 1)
3112 {
3113 /* The constant is a repeating sequence of at least two elements,
3114 where the repeating elements occupy no more than 128 bits.
3115 Get an integer representation of the replicated value. */
3116 scalar_int_mode int_mode;
3117 if (BYTES_BIG_ENDIAN)
3118 /* For now, always use LD1RQ to load the value on big-endian
3119 targets, since the handling of smaller integers includes a
3120 subreg that is semantically an element reverse. */
3121 int_mode = TImode;
3122 else
3123 {
3124 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3125 gcc_assert (int_bits <= 128);
3126 int_mode = int_mode_for_size (int_bits, 0).require ();
3127 }
3128 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3129 if (int_value
3130 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3131 return;
3132 }
3133
3134 /* Expand each pattern individually. */
3135 rtx_vector_builder builder;
3136 auto_vec<rtx, 16> vectors (npatterns);
3137 for (unsigned int i = 0; i < npatterns; ++i)
3138 {
3139 builder.new_vector (mode, 1, nelts_per_pattern);
3140 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3141 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3142 vectors.quick_push (force_reg (mode, builder.build ()));
3143 }
3144
3145 /* Use permutes to interleave the separate vectors. */
3146 while (npatterns > 1)
3147 {
3148 npatterns /= 2;
3149 for (unsigned int i = 0; i < npatterns; ++i)
3150 {
3151 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3152 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3153 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3154 vectors[i] = tmp;
3155 }
3156 }
3157 gcc_assert (vectors[0] == dest);
3158 }
3159
3160 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3161 is a pattern that can be used to set DEST to a replicated scalar
3162 element. */
3163
3164 void
3165 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3166 rtx (*gen_vec_duplicate) (rtx, rtx))
3167 {
3168 machine_mode mode = GET_MODE (dest);
3169
3170 /* Check on what type of symbol it is. */
3171 scalar_int_mode int_mode;
3172 if ((GET_CODE (imm) == SYMBOL_REF
3173 || GET_CODE (imm) == LABEL_REF
3174 || GET_CODE (imm) == CONST
3175 || GET_CODE (imm) == CONST_POLY_INT)
3176 && is_a <scalar_int_mode> (mode, &int_mode))
3177 {
3178 rtx mem;
3179 poly_int64 offset;
3180 HOST_WIDE_INT const_offset;
3181 enum aarch64_symbol_type sty;
3182
3183 /* If we have (const (plus symbol offset)), separate out the offset
3184 before we start classifying the symbol. */
3185 rtx base = strip_offset (imm, &offset);
3186
3187 /* We must always add an offset involving VL separately, rather than
3188 folding it into the relocation. */
3189 if (!offset.is_constant (&const_offset))
3190 {
3191 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3192 emit_insn (gen_rtx_SET (dest, imm));
3193 else
3194 {
3195 /* Do arithmetic on 32-bit values if the result is smaller
3196 than that. */
3197 if (partial_subreg_p (int_mode, SImode))
3198 {
3199 /* It is invalid to do symbol calculations in modes
3200 narrower than SImode. */
3201 gcc_assert (base == const0_rtx);
3202 dest = gen_lowpart (SImode, dest);
3203 int_mode = SImode;
3204 }
3205 if (base != const0_rtx)
3206 {
3207 base = aarch64_force_temporary (int_mode, dest, base);
3208 aarch64_add_offset (int_mode, dest, base, offset,
3209 NULL_RTX, NULL_RTX, false);
3210 }
3211 else
3212 aarch64_add_offset (int_mode, dest, base, offset,
3213 dest, NULL_RTX, false);
3214 }
3215 return;
3216 }
3217
3218 sty = aarch64_classify_symbol (base, const_offset);
3219 switch (sty)
3220 {
3221 case SYMBOL_FORCE_TO_MEM:
3222 if (const_offset != 0
3223 && targetm.cannot_force_const_mem (int_mode, imm))
3224 {
3225 gcc_assert (can_create_pseudo_p ());
3226 base = aarch64_force_temporary (int_mode, dest, base);
3227 aarch64_add_offset (int_mode, dest, base, const_offset,
3228 NULL_RTX, NULL_RTX, false);
3229 return;
3230 }
3231
3232 mem = force_const_mem (ptr_mode, imm);
3233 gcc_assert (mem);
3234
3235 /* If we aren't generating PC relative literals, then
3236 we need to expand the literal pool access carefully.
3237 This is something that needs to be done in a number
3238 of places, so could well live as a separate function. */
3239 if (!aarch64_pcrelative_literal_loads)
3240 {
3241 gcc_assert (can_create_pseudo_p ());
3242 base = gen_reg_rtx (ptr_mode);
3243 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3244 if (ptr_mode != Pmode)
3245 base = convert_memory_address (Pmode, base);
3246 mem = gen_rtx_MEM (ptr_mode, base);
3247 }
3248
3249 if (int_mode != ptr_mode)
3250 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3251
3252 emit_insn (gen_rtx_SET (dest, mem));
3253
3254 return;
3255
3256 case SYMBOL_SMALL_TLSGD:
3257 case SYMBOL_SMALL_TLSDESC:
3258 case SYMBOL_SMALL_TLSIE:
3259 case SYMBOL_SMALL_GOT_28K:
3260 case SYMBOL_SMALL_GOT_4G:
3261 case SYMBOL_TINY_GOT:
3262 case SYMBOL_TINY_TLSIE:
3263 if (const_offset != 0)
3264 {
3265 gcc_assert (can_create_pseudo_p ());
3266 base = aarch64_force_temporary (int_mode, dest, base);
3267 aarch64_add_offset (int_mode, dest, base, const_offset,
3268 NULL_RTX, NULL_RTX, false);
3269 return;
3270 }
3271 /* FALLTHRU */
3272
3273 case SYMBOL_SMALL_ABSOLUTE:
3274 case SYMBOL_TINY_ABSOLUTE:
3275 case SYMBOL_TLSLE12:
3276 case SYMBOL_TLSLE24:
3277 case SYMBOL_TLSLE32:
3278 case SYMBOL_TLSLE48:
3279 aarch64_load_symref_appropriately (dest, imm, sty);
3280 return;
3281
3282 default:
3283 gcc_unreachable ();
3284 }
3285 }
3286
3287 if (!CONST_INT_P (imm))
3288 {
3289 rtx base, step, value;
3290 if (GET_CODE (imm) == HIGH
3291 || aarch64_simd_valid_immediate (imm, NULL))
3292 emit_insn (gen_rtx_SET (dest, imm));
3293 else if (const_vec_series_p (imm, &base, &step))
3294 aarch64_expand_vec_series (dest, base, step);
3295 else if (const_vec_duplicate_p (imm, &value))
3296 {
3297 /* If the constant is out of range of an SVE vector move,
3298 load it from memory if we can, otherwise move it into
3299 a register and use a DUP. */
3300 scalar_mode inner_mode = GET_MODE_INNER (mode);
3301 rtx op = force_const_mem (inner_mode, value);
3302 if (!op)
3303 op = force_reg (inner_mode, value);
3304 else if (!aarch64_sve_ld1r_operand_p (op))
3305 {
3306 rtx addr = force_reg (Pmode, XEXP (op, 0));
3307 op = replace_equiv_address (op, addr);
3308 }
3309 emit_insn (gen_vec_duplicate (dest, op));
3310 }
3311 else if (GET_CODE (imm) == CONST_VECTOR
3312 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3313 aarch64_expand_sve_const_vector (dest, imm);
3314 else
3315 {
3316 rtx mem = force_const_mem (mode, imm);
3317 gcc_assert (mem);
3318 emit_move_insn (dest, mem);
3319 }
3320
3321 return;
3322 }
3323
3324 aarch64_internal_mov_immediate (dest, imm, true,
3325 as_a <scalar_int_mode> (mode));
3326 }
3327
3328 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3329 that is known to contain PTRUE. */
3330
3331 void
3332 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3333 {
3334 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3335 gen_rtvec (2, pred, src),
3336 UNSPEC_MERGE_PTRUE)));
3337 }
3338
3339 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3340 operand is in memory. In this case we need to use the predicated LD1
3341 and ST1 instead of LDR and STR, both for correctness on big-endian
3342 targets and because LD1 and ST1 support a wider range of addressing modes.
3343 PRED_MODE is the mode of the predicate.
3344
3345 See the comment at the head of aarch64-sve.md for details about the
3346 big-endian handling. */
3347
3348 void
3349 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3350 {
3351 machine_mode mode = GET_MODE (dest);
3352 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3353 if (!register_operand (src, mode)
3354 && !register_operand (dest, mode))
3355 {
3356 rtx tmp = gen_reg_rtx (mode);
3357 if (MEM_P (src))
3358 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3359 else
3360 emit_move_insn (tmp, src);
3361 src = tmp;
3362 }
3363 aarch64_emit_sve_pred_move (dest, ptrue, src);
3364 }
3365
3366 /* Called only on big-endian targets. See whether an SVE vector move
3367 from SRC to DEST is effectively a REV[BHW] instruction, because at
3368 least one operand is a subreg of an SVE vector that has wider or
3369 narrower elements. Return true and emit the instruction if so.
3370
3371 For example:
3372
3373 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3374
3375 represents a VIEW_CONVERT between the following vectors, viewed
3376 in memory order:
3377
3378 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3379 R1: { [0], [1], [2], [3], ... }
3380
3381 The high part of lane X in R2 should therefore correspond to lane X*2
3382 of R1, but the register representations are:
3383
3384 msb                                         lsb
3385 R2: ...... [1].high  [1].low   [0].high  [0].low
3386 R1: ...... [3]       [2]       [1]       [0]
3387
3388 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3389 We therefore need a reverse operation to swap the high and low values
3390 around.
3391
3392 This is purely an optimization. Without it we would spill the
3393 subreg operand to the stack in one mode and reload it in the
3394 other mode, which has the same effect as the REV. */
3395
3396 bool
3397 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3398 {
3399 gcc_assert (BYTES_BIG_ENDIAN);
3400 if (GET_CODE (dest) == SUBREG)
3401 dest = SUBREG_REG (dest);
3402 if (GET_CODE (src) == SUBREG)
3403 src = SUBREG_REG (src);
3404
3405 /* The optimization handles two single SVE REGs with different element
3406 sizes. */
3407 if (!REG_P (dest)
3408 || !REG_P (src)
3409 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3410 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3411 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3412 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3413 return false;
3414
3415 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3416 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3417 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3418 UNSPEC_REV_SUBREG);
3419 emit_insn (gen_rtx_SET (dest, unspec));
3420 return true;
3421 }
3422
3423 /* Return a copy of X with mode MODE, without changing its other
3424 attributes. Unlike gen_lowpart, this doesn't care whether the
3425 mode change is valid. */
3426
3427 static rtx
3428 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3429 {
3430 if (GET_MODE (x) == mode)
3431 return x;
3432
3433 x = shallow_copy_rtx (x);
3434 set_mode_and_regno (x, mode, REGNO (x));
3435 return x;
3436 }
3437
3438 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3439 operands. */
3440
3441 void
3442 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3443 {
3444 /* Decide which REV operation we need. The mode with narrower elements
3445 determines the mode of the operands and the mode with the wider
3446 elements determines the reverse width. */
3447 machine_mode mode_with_wider_elts = GET_MODE (dest);
3448 machine_mode mode_with_narrower_elts = GET_MODE (src);
3449 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3450 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3451 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3452
3453 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3454 unsigned int unspec;
3455 if (wider_bytes == 8)
3456 unspec = UNSPEC_REV64;
3457 else if (wider_bytes == 4)
3458 unspec = UNSPEC_REV32;
3459 else if (wider_bytes == 2)
3460 unspec = UNSPEC_REV16;
3461 else
3462 gcc_unreachable ();
3463 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3464
3465 /* Emit:
3466
3467 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3468 UNSPEC_MERGE_PTRUE))
3469
3470 with the appropriate modes. */
3471 ptrue = gen_lowpart (pred_mode, ptrue);
3472 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3473 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3474 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3475 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3476 UNSPEC_MERGE_PTRUE);
3477 emit_insn (gen_rtx_SET (dest, src));
3478 }
3479
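/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */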
3480 static bool
3481 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3482 tree exp ATTRIBUTE_UNUSED)
3483 {
3484 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3485 return false;
3486
3487 return true;
3488 }
3489
3490 /* Implement TARGET_PASS_BY_REFERENCE. */
3491
3492 static bool
3493 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3494 machine_mode mode,
3495 const_tree type,
3496 bool named ATTRIBUTE_UNUSED)
3497 {
3498 HOST_WIDE_INT size;
3499 machine_mode dummymode;
3500 int nregs;
3501
3502 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3503 if (mode == BLKmode && type)
3504 size = int_size_in_bytes (type);
3505 else
3506 /* No frontends can create types with variable-sized modes, so we
3507 shouldn't be asked to pass or return them. */
3508 size = GET_MODE_SIZE (mode).to_constant ();
3509
3510 /* Aggregates are passed by reference based on their size. */
3511 if (type && AGGREGATE_TYPE_P (type))
3512 {
3513 size = int_size_in_bytes (type);
3514 }
3515
3516 /* Variable-sized arguments are always passed by reference. */
3517 if (size < 0)
3518 return true;
3519
3520 /* Can this be a candidate to be passed in fp/simd register(s)? */
3521 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3522 &dummymode, &nregs,
3523 NULL))
3524 return false;
3525
3526 /* Arguments which are variable sized or larger than 2 registers are
3527 passed by reference unless they are a homogeneous floating-point
3528 aggregate. */
3529 return size > 2 * UNITS_PER_WORD;
3530 }
3531
3532 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3533 static bool
3534 aarch64_return_in_msb (const_tree valtype)
3535 {
3536 machine_mode dummy_mode;
3537 int dummy_int;
3538
3539 /* Never happens in little-endian mode. */
3540 if (!BYTES_BIG_ENDIAN)
3541 return false;
3542
3543 /* Only composite types smaller than or equal to 16 bytes can
3544 be potentially returned in registers. */
3545 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3546 || int_size_in_bytes (valtype) <= 0
3547 || int_size_in_bytes (valtype) > 16)
3548 return false;
3549
3550 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3551 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3552 is always passed/returned in the least significant bits of fp/simd
3553 register(s). */
3554 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3555 &dummy_mode, &dummy_int, NULL))
3556 return false;
3557
3558 return true;
3559 }
3560
3561 /* Implement TARGET_FUNCTION_VALUE.
3562 Define how to find the value returned by a function. */
3563
3564 static rtx
3565 aarch64_function_value (const_tree type, const_tree func,
3566 bool outgoing ATTRIBUTE_UNUSED)
3567 {
3568 machine_mode mode;
3569 int unsignedp;
3570 int count;
3571 machine_mode ag_mode;
3572
3573 mode = TYPE_MODE (type);
3574 if (INTEGRAL_TYPE_P (type))
3575 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3576
3577 if (aarch64_return_in_msb (type))
3578 {
3579 HOST_WIDE_INT size = int_size_in_bytes (type);
3580
3581 if (size % UNITS_PER_WORD != 0)
3582 {
3583 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3584 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3585 }
3586 }
3587
3588 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3589 &ag_mode, &count, NULL))
3590 {
3591 if (!aarch64_composite_type_p (type, mode))
3592 {
3593 gcc_assert (count == 1 && mode == ag_mode);
3594 return gen_rtx_REG (mode, V0_REGNUM);
3595 }
3596 else
3597 {
3598 int i;
3599 rtx par;
3600
3601 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3602 for (i = 0; i < count; i++)
3603 {
3604 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3605 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3606 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3607 XVECEXP (par, 0, i) = tmp;
3608 }
3609 return par;
3610 }
3611 }
3612 else
3613 return gen_rtx_REG (mode, R0_REGNUM);
3614 }
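/* For example (a sketch, not an exhaustive description of the hook): an
   HFA such as struct { float x, y, z; } is returned as a PARALLEL of three
   SFmode registers v0-v2 at byte offsets 0, 4 and 8, while a 16-byte
   non-HFA structure typically comes back in x0/x1 via the R0_REGNUM case
   above. */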
3615
3616 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3617 Return true if REGNO is the number of a hard register in which the value
3618 of a called function may come back. */
3619
3620 static bool
3621 aarch64_function_value_regno_p (const unsigned int regno)
3622 {
3623 /* Maximum of 16 bytes can be returned in the general registers. Examples
3624 of 16-byte return values are: 128-bit integers and 16-byte small
3625 structures (excluding homogeneous floating-point aggregates). */
3626 if (regno == R0_REGNUM || regno == R1_REGNUM)
3627 return true;
3628
3629 /* Up to four fp/simd registers can return a function value, e.g. a
3630 homogeneous floating-point aggregate having four members. */
3631 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3632 return TARGET_FLOAT;
3633
3634 return false;
3635 }
3636
3637 /* Implement TARGET_RETURN_IN_MEMORY.
3638
3639 If the type T of the result of a function is such that
3640 void func (T arg)
3641 would require that arg be passed as a value in a register (or set of
3642 registers) according to the parameter passing rules, then the result
3643 is returned in the same registers as would be used for such an
3644 argument. */
3645
3646 static bool
3647 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3648 {
3649 HOST_WIDE_INT size;
3650 machine_mode ag_mode;
3651 int count;
3652
3653 if (!AGGREGATE_TYPE_P (type)
3654 && TREE_CODE (type) != COMPLEX_TYPE
3655 && TREE_CODE (type) != VECTOR_TYPE)
3656 /* Simple scalar types are always returned in registers. */
3657 return false;
3658
3659 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3660 type,
3661 &ag_mode,
3662 &count,
3663 NULL))
3664 return false;
3665
3666 /* Types larger than 2 registers are returned in memory. */
3667 size = int_size_in_bytes (type);
3668 return (size < 0 || size > 2 * UNITS_PER_WORD);
3669 }
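/* For example: a 24-byte struct of three pointers is returned in memory
   (the AAPCS64 indirect result register x8 carries its address), whereas
   struct { double d[4]; } is an HFA and is returned in v0-v3 rather than
   in memory. */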
3670
3671 static bool
3672 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3673 const_tree type, int *nregs)
3674 {
3675 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3676 return aarch64_vfp_is_call_or_return_candidate (mode,
3677 type,
3678 &pcum->aapcs_vfp_rmode,
3679 nregs,
3680 NULL);
3681 }
3682
3683 /* Given MODE and TYPE of a function argument, return the alignment in
3684 bits. The idea is to suppress any stronger alignment requested by
3685 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3686 This is a helper function for local use only. */
3687
3688 static unsigned int
3689 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3690 {
3691 if (!type)
3692 return GET_MODE_ALIGNMENT (mode);
3693
3694 if (integer_zerop (TYPE_SIZE (type)))
3695 return 0;
3696
3697 gcc_assert (TYPE_MODE (type) == mode);
3698
3699 if (!AGGREGATE_TYPE_P (type))
3700 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3701
3702 if (TREE_CODE (type) == ARRAY_TYPE)
3703 return TYPE_ALIGN (TREE_TYPE (type));
3704
3705 unsigned int alignment = 0;
3706 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3707 if (TREE_CODE (field) == FIELD_DECL)
3708 alignment = std::max (alignment, DECL_ALIGN (field));
3709
3710 return alignment;
3711 }
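/* As a rough example: for struct { int a; long long b; } the maximum field
   alignment is 64 bits, so 64 is returned even if the struct type itself
   has been over-aligned, e.g. with __attribute__ ((aligned (16)));
   only the field alignments are inspected. */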
3712
3713 /* Layout a function argument according to the AAPCS64 rules. The rule
3714 numbers refer to the rule numbers in the AAPCS64. */
3715
3716 static void
3717 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3718 const_tree type,
3719 bool named ATTRIBUTE_UNUSED)
3720 {
3721 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3722 int ncrn, nvrn, nregs;
3723 bool allocate_ncrn, allocate_nvrn;
3724 HOST_WIDE_INT size;
3725
3726 /* We need to do this once per argument. */
3727 if (pcum->aapcs_arg_processed)
3728 return;
3729
3730 pcum->aapcs_arg_processed = true;
3731
3732 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3733 if (type)
3734 size = int_size_in_bytes (type);
3735 else
3736 /* No frontends can create types with variable-sized modes, so we
3737 shouldn't be asked to pass or return them. */
3738 size = GET_MODE_SIZE (mode).to_constant ();
3739 size = ROUND_UP (size, UNITS_PER_WORD);
3740
3741 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3742 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3743 mode,
3744 type,
3745 &nregs);
3746
3747 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3748 The following code thus handles passing by SIMD/FP registers first. */
3749
3750 nvrn = pcum->aapcs_nvrn;
3751
3752 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3753 and homogeneous short-vector aggregates (HVA). */
3754 if (allocate_nvrn)
3755 {
3756 if (!TARGET_FLOAT)
3757 aarch64_err_no_fpadvsimd (mode);
3758
3759 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3760 {
3761 pcum->aapcs_nextnvrn = nvrn + nregs;
3762 if (!aarch64_composite_type_p (type, mode))
3763 {
3764 gcc_assert (nregs == 1);
3765 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3766 }
3767 else
3768 {
3769 rtx par;
3770 int i;
3771 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3772 for (i = 0; i < nregs; i++)
3773 {
3774 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3775 V0_REGNUM + nvrn + i);
3776 rtx offset = gen_int_mode
3777 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3778 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3779 XVECEXP (par, 0, i) = tmp;
3780 }
3781 pcum->aapcs_reg = par;
3782 }
3783 return;
3784 }
3785 else
3786 {
3787 /* C.3 NSRN is set to 8. */
3788 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3789 goto on_stack;
3790 }
3791 }
3792
3793 ncrn = pcum->aapcs_ncrn;
3794 nregs = size / UNITS_PER_WORD;
3795
3796 /* C6 - C9, though the sign and zero extension semantics are
3797 handled elsewhere. This is the case where the argument fits
3798 entirely in general registers. */
3799 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3800 {
3801
3802 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3803
3804 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3805 rounded up to the next even number. */
3806 if (nregs == 2
3807 && ncrn % 2
3808 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3809 comparison is there because for > 16 * BITS_PER_UNIT
3810 alignment nregs should be > 2 and therefore it should be
3811 passed by reference rather than value. */
3812 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3813 {
3814 ++ncrn;
3815 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3816 }
3817
3818 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3819 A reg is still generated for it, but the caller should be smart
3820 enough not to use it. */
3821 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3822 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3823 else
3824 {
3825 rtx par;
3826 int i;
3827
3828 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3829 for (i = 0; i < nregs; i++)
3830 {
3831 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3832 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3833 GEN_INT (i * UNITS_PER_WORD));
3834 XVECEXP (par, 0, i) = tmp;
3835 }
3836 pcum->aapcs_reg = par;
3837 }
3838
3839 pcum->aapcs_nextncrn = ncrn + nregs;
3840 return;
3841 }
3842
3843 /* C.11 */
3844 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3845
3846 /* The argument is passed on stack; record the needed number of words for
3847 this argument and align the total size if necessary. */
3848 on_stack:
3849 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3850
3851 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3852 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3853 16 / UNITS_PER_WORD);
3854 return;
3855 }
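/* A worked example of rule C.8 above (illustrative only): for a call such
   as f (int x, __int128 y), x occupies w0 and leaves the NGRN odd (1);
   y has 16-byte alignment and needs two registers, so the NGRN is rounded
   up to 2 and y is passed in x2/x3, leaving x1 unused. */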
3856
3857 /* Implement TARGET_FUNCTION_ARG. */
3858
3859 static rtx
3860 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3861 const_tree type, bool named)
3862 {
3863 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3864 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3865
3866 if (mode == VOIDmode)
3867 return NULL_RTX;
3868
3869 aarch64_layout_arg (pcum_v, mode, type, named);
3870 return pcum->aapcs_reg;
3871 }
3872
3873 void
3874 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3875 const_tree fntype ATTRIBUTE_UNUSED,
3876 rtx libname ATTRIBUTE_UNUSED,
3877 const_tree fndecl ATTRIBUTE_UNUSED,
3878 unsigned n_named ATTRIBUTE_UNUSED)
3879 {
3880 pcum->aapcs_ncrn = 0;
3881 pcum->aapcs_nvrn = 0;
3882 pcum->aapcs_nextncrn = 0;
3883 pcum->aapcs_nextnvrn = 0;
3884 pcum->pcs_variant = ARM_PCS_AAPCS64;
3885 pcum->aapcs_reg = NULL_RTX;
3886 pcum->aapcs_arg_processed = false;
3887 pcum->aapcs_stack_words = 0;
3888 pcum->aapcs_stack_size = 0;
3889
3890 if (!TARGET_FLOAT
3891 && fndecl && TREE_PUBLIC (fndecl)
3892 && fntype && fntype != error_mark_node)
3893 {
3894 const_tree type = TREE_TYPE (fntype);
3895 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3896 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3897 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3898 &mode, &nregs, NULL))
3899 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3900 }
3901 return;
3902 }
3903
3904 static void
3905 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3906 machine_mode mode,
3907 const_tree type,
3908 bool named)
3909 {
3910 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3911 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3912 {
3913 aarch64_layout_arg (pcum_v, mode, type, named);
3914 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3915 != (pcum->aapcs_stack_words != 0));
3916 pcum->aapcs_arg_processed = false;
3917 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3918 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3919 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3920 pcum->aapcs_stack_words = 0;
3921 pcum->aapcs_reg = NULL_RTX;
3922 }
3923 }
3924
3925 bool
3926 aarch64_function_arg_regno_p (unsigned regno)
3927 {
3928 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3929 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3930 }
3931
3932 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3933 PARM_BOUNDARY bits of alignment, but will be given anything up
3934 to STACK_BOUNDARY bits if the type requires it. This makes sure
3935 that both before and after the layout of each argument, the Next
3936 Stacked Argument Address (NSAA) will have a minimum alignment of
3937 8 bytes. */
3938
3939 static unsigned int
3940 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3941 {
3942 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3943 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3944 }
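/* For example, with the usual aarch64 values PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128, the result is either 64 or 128: a plain int gets
   the 64-bit minimum, while a 16-byte-aligned aggregate is clamped to 128. */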
3945
3946 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3947
3948 static fixed_size_mode
3949 aarch64_get_reg_raw_mode (int regno)
3950 {
3951 if (TARGET_SVE && FP_REGNUM_P (regno))
3952 /* Don't use the SVE part of the register for __builtin_apply and
3953 __builtin_return. The SVE registers aren't used by the normal PCS,
3954 so using them there would be a waste of time. The PCS extensions
3955 for SVE types are fundamentally incompatible with the
3956 __builtin_return/__builtin_apply interface. */
3957 return as_a <fixed_size_mode> (V16QImode);
3958 return default_get_reg_raw_mode (regno);
3959 }
3960
3961 /* Implement TARGET_FUNCTION_ARG_PADDING.
3962
3963 Small aggregate types are placed at the lowest memory address.
3964
3965 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3966
3967 static pad_direction
3968 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3969 {
3970 /* On little-endian targets, the least significant byte of every stack
3971 argument is passed at the lowest byte address of the stack slot. */
3972 if (!BYTES_BIG_ENDIAN)
3973 return PAD_UPWARD;
3974
3975 /* Otherwise, integral, floating-point and pointer types are padded downward:
3976 the least significant byte of a stack argument is passed at the highest
3977 byte address of the stack slot. */
3978 if (type
3979 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3980 || POINTER_TYPE_P (type))
3981 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3982 return PAD_DOWNWARD;
3983
3984 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3985 return PAD_UPWARD;
3986 }
3987
3988 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3989
3990 It specifies padding for the last (and possibly the only)
3991 element of a block move between registers and memory. Assuming
3992 the block is in memory, padding upward means that the last
3993 element is padded after its most significant byte, while with
3994 downward padding the last element is padded on its least
3995 significant byte side.
3996
3997 Small aggregates and small complex types are always padded
3998 upwards.
3999
4000 We don't need to worry about homogeneous floating-point or
4001 short-vector aggregates; their move is not affected by the
4002 padding direction determined here. Regardless of endianness,
4003 each element of such an aggregate is put in the least
4004 significant bits of a fp/simd register.
4005
4006 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4007 register has useful data, and return the opposite if the most
4008 significant byte does. */
4009
4010 bool
4011 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4012 bool first ATTRIBUTE_UNUSED)
4013 {
4014
4015 /* Small composite types are always padded upward. */
4016 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4017 {
4018 HOST_WIDE_INT size;
4019 if (type)
4020 size = int_size_in_bytes (type);
4021 else
4022 /* No frontends can create types with variable-sized modes, so we
4023 shouldn't be asked to pass or return them. */
4024 size = GET_MODE_SIZE (mode).to_constant ();
4025 if (size < 2 * UNITS_PER_WORD)
4026 return true;
4027 }
4028
4029 /* Otherwise, use the default padding. */
4030 return !BYTES_BIG_ENDIAN;
4031 }
4032
4033 static scalar_int_mode
4034 aarch64_libgcc_cmp_return_mode (void)
4035 {
4036 return SImode;
4037 }
4038
4039 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4040
4041 /* We use the 12-bit shifted immediate arithmetic instructions so values
4042 must be a multiple of (1 << 12), i.e. 4096. */
4043 #define ARITH_FACTOR 4096
4044
4045 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4046 #error Cannot use simple address calculation for stack probing
4047 #endif
4048
4049 /* The pair of scratch registers used for stack probing. */
4050 #define PROBE_STACK_FIRST_REG R9_REGNUM
4051 #define PROBE_STACK_SECOND_REG R10_REGNUM
4052
4053 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4054 inclusive. These are offsets from the current stack pointer. */
4055
4056 static void
4057 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4058 {
4059 HOST_WIDE_INT size;
4060 if (!poly_size.is_constant (&size))
4061 {
4062 sorry ("stack probes for SVE frames");
4063 return;
4064 }
4065
4066 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4067
4068 /* See the same assertion on PROBE_INTERVAL above. */
4069 gcc_assert ((first % ARITH_FACTOR) == 0);
4070
4071 /* See if we have a constant small number of probes to generate. If so,
4072 that's the easy case. */
4073 if (size <= PROBE_INTERVAL)
4074 {
4075 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4076
4077 emit_set_insn (reg1,
4078 plus_constant (Pmode,
4079 stack_pointer_rtx, -(first + base)));
4080 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4081 }
4082
4083 /* The run-time loop is made up of 8 insns in the generic case while the
4084 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4085 else if (size <= 4 * PROBE_INTERVAL)
4086 {
4087 HOST_WIDE_INT i, rem;
4088
4089 emit_set_insn (reg1,
4090 plus_constant (Pmode,
4091 stack_pointer_rtx,
4092 -(first + PROBE_INTERVAL)));
4093 emit_stack_probe (reg1);
4094
4095 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4096 it exceeds SIZE. If only two probes are needed, this will not
4097 generate any code. Then probe at FIRST + SIZE. */
4098 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4099 {
4100 emit_set_insn (reg1,
4101 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4102 emit_stack_probe (reg1);
4103 }
4104
4105 rem = size - (i - PROBE_INTERVAL);
4106 if (rem > 256)
4107 {
4108 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4109
4110 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4111 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4112 }
4113 else
4114 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4115 }
4116
4117 /* Otherwise, do the same as above, but in a loop. Note that we must be
4118 extra careful with variables wrapping around because we might be at
4119 the very top (or the very bottom) of the address space and we have
4120 to be able to handle this case properly; in particular, we use an
4121 equality test for the loop condition. */
4122 else
4123 {
4124 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4125
4126 /* Step 1: round SIZE to the previous multiple of the interval. */
4127
4128 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4129
4130
4131 /* Step 2: compute initial and final value of the loop counter. */
4132
4133 /* TEST_ADDR = SP + FIRST. */
4134 emit_set_insn (reg1,
4135 plus_constant (Pmode, stack_pointer_rtx, -first));
4136
4137 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4138 HOST_WIDE_INT adjustment = - (first + rounded_size);
4139 if (! aarch64_uimm12_shift (adjustment))
4140 {
4141 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4142 true, Pmode);
4143 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4144 }
4145 else
4146 emit_set_insn (reg2,
4147 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4148
4149 /* Step 3: the loop
4150
4151 do
4152 {
4153 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4154 probe at TEST_ADDR
4155 }
4156 while (TEST_ADDR != LAST_ADDR)
4157
4158 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4159 until it is equal to ROUNDED_SIZE. */
4160
4161 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4162
4163
4164 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4165 that SIZE is equal to ROUNDED_SIZE. */
4166
4167 if (size != rounded_size)
4168 {
4169 HOST_WIDE_INT rem = size - rounded_size;
4170
4171 if (rem > 256)
4172 {
4173 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4174
4175 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4176 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4177 }
4178 else
4179 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4180 }
4181 }
4182
4183 /* Make sure nothing is scheduled before we are done. */
4184 emit_insn (gen_blockage ());
4185 }
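/* A sketch of the simple case above (assuming PROBE_INTERVAL == 4096,
   FIRST == 0 and SIZE == 3000): BASE is rounded up to 4096, so we emit
   reg1 = sp - 4096 followed by a probe (a store of xzr) at reg1 + 1096,
   i.e. exactly at sp - SIZE. */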
4186
4187 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4188 absolute addresses. */
4189
4190 const char *
4191 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4192 {
4193 static int labelno = 0;
4194 char loop_lab[32];
4195 rtx xops[2];
4196
4197 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4198
4199 /* Loop. */
4200 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4201
4202 HOST_WIDE_INT stack_clash_probe_interval
4203 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4204
4205 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4206 xops[0] = reg1;
4207 HOST_WIDE_INT interval;
4208 if (flag_stack_clash_protection)
4209 interval = stack_clash_probe_interval;
4210 else
4211 interval = PROBE_INTERVAL;
4212
4213 gcc_assert (aarch64_uimm12_shift (interval));
4214 xops[1] = GEN_INT (interval);
4215
4216 output_asm_insn ("sub\t%0, %0, %1", xops);
4217
4218 /* If doing stack clash protection then we probe up by the ABI specified
4219 amount. We do this because we're dropping full pages at a time in the
4220 loop. But if we're doing non-stack-clash probing, probe at offset 0 from SP. */
4221 if (flag_stack_clash_protection)
4222 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4223 else
4224 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4225
4226 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4227 by this amount for each iteration. */
4228 output_asm_insn ("str\txzr, [%0, %1]", xops);
4229
4230 /* Test if TEST_ADDR == LAST_ADDR. */
4231 xops[1] = reg2;
4232 output_asm_insn ("cmp\t%0, %1", xops);
4233
4234 /* Branch. */
4235 fputs ("\tb.ne\t", asm_out_file);
4236 assemble_name_raw (asm_out_file, loop_lab);
4237 fputc ('\n', asm_out_file);
4238
4239 return "";
4240 }
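/* The emitted loop looks roughly like this (the store offset depends on
   whether stack clash protection is enabled):

	.LPSRL0:
		sub	reg1, reg1, #interval
		str	xzr, [reg1, #offset]
		cmp	reg1, reg2
		b.ne	.LPSRL0
*/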
4241
4242 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4243 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4244 of GUARD_SIZE. When a probe is emitted it is done at most
4245 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4246 at most MIN_PROBE_THRESHOLD. By the end of this function
4247 BASE = BASE - ADJUSTMENT. */
4248
4249 const char *
4250 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4251 rtx min_probe_threshold, rtx guard_size)
4252 {
4253 /* This function is not allowed to use any instruction generation function
4254 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4255 so instead emit the code you want using output_asm_insn. */
4256 gcc_assert (flag_stack_clash_protection);
4257 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4258 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4259
4260 /* The minimum required allocation before the residual requires probing. */
4261 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4262
4263 /* Clamp the value down to the nearest value that can be used with a cmp. */
4264 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4265 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4266
4267 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4268 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4269
4270 static int labelno = 0;
4271 char loop_start_lab[32];
4272 char loop_end_lab[32];
4273 rtx xops[2];
4274
4275 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4276 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4277
4278 /* Emit loop start label. */
4279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4280
4281 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4282 xops[0] = adjustment;
4283 xops[1] = probe_offset_value_rtx;
4284 output_asm_insn ("cmp\t%0, %1", xops);
4285
4286 /* Branch to end if not enough adjustment to probe. */
4287 fputs ("\tb.lt\t", asm_out_file);
4288 assemble_name_raw (asm_out_file, loop_end_lab);
4289 fputc ('\n', asm_out_file);
4290
4291 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4292 xops[0] = base;
4293 xops[1] = probe_offset_value_rtx;
4294 output_asm_insn ("sub\t%0, %0, %1", xops);
4295
4296 /* Probe at BASE. */
4297 xops[1] = const0_rtx;
4298 output_asm_insn ("str\txzr, [%0, %1]", xops);
4299
4300 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4301 xops[0] = adjustment;
4302 xops[1] = probe_offset_value_rtx;
4303 output_asm_insn ("sub\t%0, %0, %1", xops);
4304
4305 /* Branch to start if still more bytes to allocate. */
4306 fputs ("\tb\t", asm_out_file);
4307 assemble_name_raw (asm_out_file, loop_start_lab);
4308 fputc ('\n', asm_out_file);
4309
4310 /* No probe needed; leave the loop. */
4311 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4312
4313 /* BASE = BASE - ADJUSTMENT. */
4314 xops[0] = base;
4315 xops[1] = adjustment;
4316 output_asm_insn ("sub\t%0, %0, %1", xops);
4317 return "";
4318 }
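/* The emitted sequence is roughly:

	.SVLPSPL0:
		cmp	adjustment, #residual_probe_guard
		b.lt	.SVLPEND0
		sub	base, base, #residual_probe_guard
		str	xzr, [base, 0]
		sub	adjustment, adjustment, #residual_probe_guard
		b	.SVLPSPL0
	.SVLPEND0:
		sub	base, base, adjustment
*/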
4319
4320 /* Determine whether a frame chain needs to be generated. */
4321 static bool
4322 aarch64_needs_frame_chain (void)
4323 {
4324 /* Force a frame chain for EH returns so the return address is at FP+8. */
4325 if (frame_pointer_needed || crtl->calls_eh_return)
4326 return true;
4327
4328 /* A leaf function cannot have calls or write LR. */
4329 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4330
4331 /* Don't use a frame chain in leaf functions if leaf frame pointers
4332 are disabled. */
4333 if (flag_omit_leaf_frame_pointer && is_leaf)
4334 return false;
4335
4336 return aarch64_use_frame_pointer;
4337 }
4338
4339 /* Mark the registers that need to be saved by the callee and calculate
4340 the size of the callee-saved registers area and frame record (both FP
4341 and LR may be omitted). */
4342 static void
4343 aarch64_layout_frame (void)
4344 {
4345 HOST_WIDE_INT offset = 0;
4346 int regno, last_fp_reg = INVALID_REGNUM;
4347 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4348
4349 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4350
4351 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4352 the mid-end is doing. */
4353 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4354
4355 #define SLOT_NOT_REQUIRED (-2)
4356 #define SLOT_REQUIRED (-1)
4357
4358 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4359 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4360
4361 /* If this is a non-leaf simd function with calls, we assume that
4362 at least one of those calls is to a non-simd function and thus
4363 we must save V8 to V23 in the prologue. */
4364
4365 if (simd_function && !crtl->is_leaf)
4366 {
4367 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4368 if (FP_SIMD_SAVED_REGNUM_P (regno))
4369 df_set_regs_ever_live (regno, true);
4370 }
4371
4372 /* First mark all the registers that really need to be saved... */
4373 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4374 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4375
4376 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4377 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4378
4379 /* ... that includes the eh data registers (if needed)... */
4380 if (crtl->calls_eh_return)
4381 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4382 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4383 = SLOT_REQUIRED;
4384
4385 /* ... and any callee saved register that dataflow says is live. */
4386 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4387 if (df_regs_ever_live_p (regno)
4388 && (regno == R30_REGNUM
4389 || !call_used_regs[regno]))
4390 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4391
4392 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4393 if (df_regs_ever_live_p (regno)
4394 && (!call_used_regs[regno]
4395 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4396 {
4397 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4398 last_fp_reg = regno;
4399 }
4400
4401 if (cfun->machine->frame.emit_frame_chain)
4402 {
4403 /* FP and LR are placed in the linkage record. */
4404 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4405 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4406 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4407 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4408 offset = 2 * UNITS_PER_WORD;
4409 }
4410
4411 /* With stack-clash, LR must be saved in non-leaf functions. */
4412 gcc_assert (crtl->is_leaf
4413 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4414 != SLOT_NOT_REQUIRED));
4415
4416 /* Now assign stack slots for them. */
4417 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4418 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4419 {
4420 cfun->machine->frame.reg_offset[regno] = offset;
4421 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4422 cfun->machine->frame.wb_candidate1 = regno;
4423 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4424 cfun->machine->frame.wb_candidate2 = regno;
4425 offset += UNITS_PER_WORD;
4426 }
4427
4428 HOST_WIDE_INT max_int_offset = offset;
4429 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4430 bool has_align_gap = offset != max_int_offset;
4431
4432 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4433 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4434 {
4435 /* If there is an alignment gap between integer and fp callee-saves,
4436 allocate the last fp register to it if possible. */
4437 if (regno == last_fp_reg
4438 && has_align_gap
4439 && !simd_function
4440 && (offset & 8) == 0)
4441 {
4442 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4443 break;
4444 }
4445
4446 cfun->machine->frame.reg_offset[regno] = offset;
4447 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4448 cfun->machine->frame.wb_candidate1 = regno;
4449 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4450 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4451 cfun->machine->frame.wb_candidate2 = regno;
4452 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4453 }
4454
4455 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4456
4457 cfun->machine->frame.saved_regs_size = offset;
4458
4459 HOST_WIDE_INT varargs_and_saved_regs_size
4460 = offset + cfun->machine->frame.saved_varargs_size;
4461
4462 cfun->machine->frame.hard_fp_offset
4463 = aligned_upper_bound (varargs_and_saved_regs_size
4464 + get_frame_size (),
4465 STACK_BOUNDARY / BITS_PER_UNIT);
4466
4467 /* Both these values are already aligned. */
4468 gcc_assert (multiple_p (crtl->outgoing_args_size,
4469 STACK_BOUNDARY / BITS_PER_UNIT));
4470 cfun->machine->frame.frame_size
4471 = (cfun->machine->frame.hard_fp_offset
4472 + crtl->outgoing_args_size);
4473
4474 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4475
4476 cfun->machine->frame.initial_adjust = 0;
4477 cfun->machine->frame.final_adjust = 0;
4478 cfun->machine->frame.callee_adjust = 0;
4479 cfun->machine->frame.callee_offset = 0;
4480
4481 HOST_WIDE_INT max_push_offset = 0;
4482 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4483 max_push_offset = 512;
4484 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4485 max_push_offset = 256;
4486
4487 HOST_WIDE_INT const_size, const_fp_offset;
4488 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4489 && const_size < max_push_offset
4490 && known_eq (crtl->outgoing_args_size, 0))
4491 {
4492 /* Simple, small frame with no outgoing arguments:
4493 stp reg1, reg2, [sp, -frame_size]!
4494 stp reg3, reg4, [sp, 16] */
4495 cfun->machine->frame.callee_adjust = const_size;
4496 }
4497 else if (known_lt (crtl->outgoing_args_size
4498 + cfun->machine->frame.saved_regs_size, 512)
4499 && !(cfun->calls_alloca
4500 && known_lt (cfun->machine->frame.hard_fp_offset,
4501 max_push_offset)))
4502 {
4503 /* Frame with small outgoing arguments:
4504 sub sp, sp, frame_size
4505 stp reg1, reg2, [sp, outgoing_args_size]
4506 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4507 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4508 cfun->machine->frame.callee_offset
4509 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4510 }
4511 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4512 && const_fp_offset < max_push_offset)
4513 {
4514 /* Frame with large outgoing arguments but a small local area:
4515 stp reg1, reg2, [sp, -hard_fp_offset]!
4516 stp reg3, reg4, [sp, 16]
4517 sub sp, sp, outgoing_args_size */
4518 cfun->machine->frame.callee_adjust = const_fp_offset;
4519 cfun->machine->frame.final_adjust
4520 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4521 }
4522 else
4523 {
4524 /* Frame with large local area and outgoing arguments using frame pointer:
4525 sub sp, sp, hard_fp_offset
4526 stp x29, x30, [sp, 0]
4527 add x29, sp, 0
4528 stp reg3, reg4, [sp, 16]
4529 sub sp, sp, outgoing_args_size */
4530 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4531 cfun->machine->frame.final_adjust
4532 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4533 }
4534
4535 cfun->machine->frame.laid_out = true;
4536 }
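/* One illustrative layout (assuming no frame chain, no varargs and 16 bytes
   of locals, with only x19 and x20 live): the two saves get offsets 0 and 8,
   saved_regs_size == 16, hard_fp_offset == frame_size == 32 and, since the
   frame is small with no outgoing arguments, callee_adjust == 32, giving a
   prologue of the form stp x19, x20, [sp, -32]!. */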
4537
4538 /* Return true if the register REGNO is saved on entry to
4539 the current function. */
4540
4541 static bool
4542 aarch64_register_saved_on_entry (int regno)
4543 {
4544 return cfun->machine->frame.reg_offset[regno] >= 0;
4545 }
4546
4547 /* Return the next register, from REGNO up to and including LIMIT, that
4548 the callee needs to save. */
4549
4550 static unsigned
4551 aarch64_next_callee_save (unsigned regno, unsigned limit)
4552 {
4553 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4554 regno ++;
4555 return regno;
4556 }
4557
4558 /* Push the register number REGNO of mode MODE to the stack with write-back
4559 adjusting the stack by ADJUSTMENT. */
4560
4561 static void
4562 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4563 HOST_WIDE_INT adjustment)
4564 {
4565 rtx base_rtx = stack_pointer_rtx;
4566 rtx insn, reg, mem;
4567
4568 reg = gen_rtx_REG (mode, regno);
4569 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4570 plus_constant (Pmode, base_rtx, -adjustment));
4571 mem = gen_frame_mem (mode, mem);
4572
4573 insn = emit_move_insn (mem, reg);
4574 RTX_FRAME_RELATED_P (insn) = 1;
4575 }
4576
4577 /* Generate and return an instruction to store the pair of registers
4578 REG and REG2 of mode MODE to location BASE with write-back adjusting
4579 the stack location BASE by ADJUSTMENT. */
4580
4581 static rtx
4582 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4583 HOST_WIDE_INT adjustment)
4584 {
4585 switch (mode)
4586 {
4587 case E_DImode:
4588 return gen_storewb_pairdi_di (base, base, reg, reg2,
4589 GEN_INT (-adjustment),
4590 GEN_INT (UNITS_PER_WORD - adjustment));
4591 case E_DFmode:
4592 return gen_storewb_pairdf_di (base, base, reg, reg2,
4593 GEN_INT (-adjustment),
4594 GEN_INT (UNITS_PER_WORD - adjustment));
4595 case E_TFmode:
4596 return gen_storewb_pairtf_di (base, base, reg, reg2,
4597 GEN_INT (-adjustment),
4598 GEN_INT (UNITS_PER_VREG - adjustment));
4599 default:
4600 gcc_unreachable ();
4601 }
4602 }
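/* For E_DImode this expands to something like
	stp reg, reg2, [base, -adjustment]!
   i.e. a store pair with pre-index write-back of the base register. */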
4603
4604 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4605 stack pointer by ADJUSTMENT. */
4606
4607 static void
4608 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4609 {
4610 rtx_insn *insn;
4611 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4612
4613 if (regno2 == INVALID_REGNUM)
4614 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4615
4616 rtx reg1 = gen_rtx_REG (mode, regno1);
4617 rtx reg2 = gen_rtx_REG (mode, regno2);
4618
4619 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4620 reg2, adjustment));
4621 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4622 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4623 RTX_FRAME_RELATED_P (insn) = 1;
4624 }
4625
4626 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4627 adjusting it by ADJUSTMENT afterwards. */
4628
4629 static rtx
4630 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4631 HOST_WIDE_INT adjustment)
4632 {
4633 switch (mode)
4634 {
4635 case E_DImode:
4636 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4637 GEN_INT (UNITS_PER_WORD));
4638 case E_DFmode:
4639 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4640 GEN_INT (UNITS_PER_WORD));
4641 case E_TFmode:
4642 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4643 GEN_INT (UNITS_PER_VREG));
4644 default:
4645 gcc_unreachable ();
4646 }
4647 }
4648
4649 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4650 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4651 into CFI_OPS. */
4652
4653 static void
4654 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4655 rtx *cfi_ops)
4656 {
4657 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4658 rtx reg1 = gen_rtx_REG (mode, regno1);
4659
4660 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4661
4662 if (regno2 == INVALID_REGNUM)
4663 {
4664 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4665 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4666 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4667 }
4668 else
4669 {
4670 rtx reg2 = gen_rtx_REG (mode, regno2);
4671 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4672 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4673 reg2, adjustment));
4674 }
4675 }
4676
4677 /* Generate and return a store pair instruction of mode MODE to store
4678 register REG1 to MEM1 and register REG2 to MEM2. */
4679
4680 static rtx
4681 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4682 rtx reg2)
4683 {
4684 switch (mode)
4685 {
4686 case E_DImode:
4687 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4688
4689 case E_DFmode:
4690 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4691
4692 case E_TFmode:
4693 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4694
4695 default:
4696 gcc_unreachable ();
4697 }
4698 }
4699
4700 /* Generate and return a load pair instruction of mode MODE to load register
4701 REG1 from MEM1 and register REG2 from MEM2. */
4702
4703 static rtx
4704 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4705 rtx mem2)
4706 {
4707 switch (mode)
4708 {
4709 case E_DImode:
4710 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4711
4712 case E_DFmode:
4713 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4714
4715 case E_TFmode:
4716 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4717
4718 default:
4719 gcc_unreachable ();
4720 }
4721 }
4722
4723 /* Return TRUE if return address signing should be enabled for the current
4724 function, otherwise return FALSE. */
4725
4726 bool
4727 aarch64_return_address_signing_enabled (void)
4728 {
4729 /* This function should only be called after frame laid out. */
4730 gcc_assert (cfun->machine->frame.laid_out);
4731
4732 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4733 if its LR is pushed onto the stack. */
4734 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4735 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4736 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4737 }
4738
4739 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4740 bool
4741 aarch64_bti_enabled (void)
4742 {
4743 return (aarch64_enable_bti == 1);
4744 }
4745
4746 /* Emit code to save the callee-saved registers from register number START
4747 to LIMIT to the stack at the location starting at offset START_OFFSET,
4748 skipping any write-back candidates if SKIP_WB is true. */
4749
4750 static void
4751 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4752 unsigned start, unsigned limit, bool skip_wb)
4753 {
4754 rtx_insn *insn;
4755 unsigned regno;
4756 unsigned regno2;
4757
4758 for (regno = aarch64_next_callee_save (start, limit);
4759 regno <= limit;
4760 regno = aarch64_next_callee_save (regno + 1, limit))
4761 {
4762 rtx reg, mem;
4763 poly_int64 offset;
4764 int offset_diff;
4765
4766 if (skip_wb
4767 && (regno == cfun->machine->frame.wb_candidate1
4768 || regno == cfun->machine->frame.wb_candidate2))
4769 continue;
4770
4771 if (cfun->machine->reg_is_wrapped_separately[regno])
4772 continue;
4773
4774 reg = gen_rtx_REG (mode, regno);
4775 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4776 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4777 offset));
4778
4779 regno2 = aarch64_next_callee_save (regno + 1, limit);
4780 offset_diff = cfun->machine->frame.reg_offset[regno2]
4781 - cfun->machine->frame.reg_offset[regno];
4782
4783 if (regno2 <= limit
4784 && !cfun->machine->reg_is_wrapped_separately[regno2]
4785 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4786 {
4787 rtx reg2 = gen_rtx_REG (mode, regno2);
4788 rtx mem2;
4789
4790 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4791 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4792 offset));
4793 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4794 reg2));
4795
4796 /* The first part of a frame-related parallel insn is
4797 always assumed to be relevant to the frame
4798 calculations; subsequent parts are only
4799 frame-related if explicitly marked. */
4800 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4801 regno = regno2;
4802 }
4803 else
4804 insn = emit_move_insn (mem, reg);
4805
4806 RTX_FRAME_RELATED_P (insn) = 1;
4807 }
4808 }
4809
4810 /* Emit code to restore the callee registers of mode MODE from register
4811 number START up to and including LIMIT. Restore from the stack offset
4812 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4813 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4814
4815 static void
4816 aarch64_restore_callee_saves (machine_mode mode,
4817 poly_int64 start_offset, unsigned start,
4818 unsigned limit, bool skip_wb, rtx *cfi_ops)
4819 {
4820 rtx base_rtx = stack_pointer_rtx;
4821 unsigned regno;
4822 unsigned regno2;
4823 poly_int64 offset;
4824
4825 for (regno = aarch64_next_callee_save (start, limit);
4826 regno <= limit;
4827 regno = aarch64_next_callee_save (regno + 1, limit))
4828 {
4829 if (cfun->machine->reg_is_wrapped_separately[regno])
4830 continue;
4831
4832 rtx reg, mem;
4833 int offset_diff;
4834
4835 if (skip_wb
4836 && (regno == cfun->machine->frame.wb_candidate1
4837 || regno == cfun->machine->frame.wb_candidate2))
4838 continue;
4839
4840 reg = gen_rtx_REG (mode, regno);
4841 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4842 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4843
4844 regno2 = aarch64_next_callee_save (regno + 1, limit);
4845 offset_diff = cfun->machine->frame.reg_offset[regno2]
4846 - cfun->machine->frame.reg_offset[regno];
4847
4848 if (regno2 <= limit
4849 && !cfun->machine->reg_is_wrapped_separately[regno2]
4850 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4851 {
4852 rtx reg2 = gen_rtx_REG (mode, regno2);
4853 rtx mem2;
4854
4855 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4856 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4857 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4858
4859 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4860 regno = regno2;
4861 }
4862 else
4863 emit_move_insn (reg, mem);
4864 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4865 }
4866 }
4867
4868 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4869 of MODE. */
4870
4871 static inline bool
4872 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4873 {
4874 HOST_WIDE_INT multiple;
4875 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4876 && IN_RANGE (multiple, -8, 7));
4877 }
4878
4879 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4880 of MODE. */
4881
4882 static inline bool
4883 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4884 {
4885 HOST_WIDE_INT multiple;
4886 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4887 && IN_RANGE (multiple, 0, 63));
4888 }
4889
4890 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4891 of MODE. */
4892
4893 bool
4894 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4895 {
4896 HOST_WIDE_INT multiple;
4897 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4898 && IN_RANGE (multiple, -64, 63));
4899 }
4900
4901 /* Return true if OFFSET is a signed 9-bit value. */
4902
4903 bool
4904 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4905 poly_int64 offset)
4906 {
4907 HOST_WIDE_INT const_offset;
4908 return (offset.is_constant (&const_offset)
4909 && IN_RANGE (const_offset, -256, 255));
4910 }
4911
4912 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4913 of MODE. */
4914
4915 static inline bool
4916 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4917 {
4918 HOST_WIDE_INT multiple;
4919 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4920 && IN_RANGE (multiple, -256, 255));
4921 }
4922
4923 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4924 of MODE. */
4925
4926 static inline bool
4927 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4928 {
4929 HOST_WIDE_INT multiple;
4930 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4931 && IN_RANGE (multiple, 0, 4095));
4932 }
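/* As an example of the ranges above for DImode (8-byte) accesses:
   aarch64_offset_7bit_signed_scaled_p accepts offsets in [-512, 504] in
   steps of 8 (the LDP/STP range), while offset_12bit_unsigned_scaled_p
   accepts [0, 32760] in steps of 8 (the unsigned LDR/STR offset range). */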
4933
4934 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4935
4936 static sbitmap
4937 aarch64_get_separate_components (void)
4938 {
4939 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4940 bitmap_clear (components);
4941
4942 /* The registers we need saved to the frame. */
4943 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4944 if (aarch64_register_saved_on_entry (regno))
4945 {
4946 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4947 if (!frame_pointer_needed)
4948 offset += cfun->machine->frame.frame_size
4949 - cfun->machine->frame.hard_fp_offset;
4950 /* Check that we can access the stack slot of the register with one
4951 direct load with no adjustments needed. */
4952 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4953 bitmap_set_bit (components, regno);
4954 }
4955
4956 /* Don't mess with the hard frame pointer. */
4957 if (frame_pointer_needed)
4958 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4959
4960 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4961 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4962 /* If registers have been chosen to be stored/restored with
4963 writeback don't interfere with them to avoid having to output explicit
4964 stack adjustment instructions. */
4965 if (reg2 != INVALID_REGNUM)
4966 bitmap_clear_bit (components, reg2);
4967 if (reg1 != INVALID_REGNUM)
4968 bitmap_clear_bit (components, reg1);
4969
4970 bitmap_clear_bit (components, LR_REGNUM);
4971 bitmap_clear_bit (components, SP_REGNUM);
4972
4973 return components;
4974 }
4975
4976 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4977
4978 static sbitmap
4979 aarch64_components_for_bb (basic_block bb)
4980 {
4981 bitmap in = DF_LIVE_IN (bb);
4982 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4983 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4984 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4985
4986 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4987 bitmap_clear (components);
4988
4989 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4990 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4991 if ((!call_used_regs[regno]
4992 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
4993 && (bitmap_bit_p (in, regno)
4994 || bitmap_bit_p (gen, regno)
4995 || bitmap_bit_p (kill, regno)))
4996 {
4997 unsigned regno2, offset, offset2;
4998 bitmap_set_bit (components, regno);
4999
5000 /* If there is a callee-save at an adjacent offset, add it as well
5001 to increase the use of LDP/STP. */
5002 offset = cfun->machine->frame.reg_offset[regno];
5003 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5004
5005 if (regno2 <= LAST_SAVED_REGNUM)
5006 {
5007 offset2 = cfun->machine->frame.reg_offset[regno2];
5008 if ((offset & ~8) == (offset2 & ~8))
5009 bitmap_set_bit (components, regno2);
5010 }
5011 }
5012
5013 return components;
5014 }
5015
5016 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5017 Nothing to do for aarch64. */
5018
5019 static void
5020 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5021 {
5022 }
5023
5024 /* Return the next set bit in BMP from START onwards. Return the total number
5025 of bits in BMP if no set bit is found at or after START. */
5026
5027 static unsigned int
5028 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5029 {
5030 unsigned int nbits = SBITMAP_SIZE (bmp);
5031 if (start == nbits)
5032 return start;
5033
5034 gcc_assert (start < nbits);
5035 for (unsigned int i = start; i < nbits; i++)
5036 if (bitmap_bit_p (bmp, i))
5037 return i;
5038
5039 return nbits;
5040 }
5041
5042 /* Do the work for aarch64_emit_prologue_components and
5043 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5044 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5045 for these components or the epilogue sequence. That is, it determines
5046 whether we should emit stores or loads and what kind of CFA notes to attach
5047 to the insns. Otherwise the logic for the two sequences is very
5048 similar. */
5049
5050 static void
5051 aarch64_process_components (sbitmap components, bool prologue_p)
5052 {
5053 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5054 ? HARD_FRAME_POINTER_REGNUM
5055 : STACK_POINTER_REGNUM);
5056
5057 unsigned last_regno = SBITMAP_SIZE (components);
5058 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5059 rtx_insn *insn = NULL;
5060
5061 while (regno != last_regno)
5062 {
5063 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5064 so DFmode for the vector registers is enough. For simd functions
5065 we want to save the low 128 bits. */
5066 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5067
5068 rtx reg = gen_rtx_REG (mode, regno);
5069 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5070 if (!frame_pointer_needed)
5071 offset += cfun->machine->frame.frame_size
5072 - cfun->machine->frame.hard_fp_offset;
5073 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5074 rtx mem = gen_frame_mem (mode, addr);
5075
5076 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5077 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5078 /* No more registers to handle after REGNO.
5079 Emit a single save/restore and exit. */
5080 if (regno2 == last_regno)
5081 {
5082 insn = emit_insn (set);
5083 RTX_FRAME_RELATED_P (insn) = 1;
5084 if (prologue_p)
5085 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5086 else
5087 add_reg_note (insn, REG_CFA_RESTORE, reg);
5088 break;
5089 }
5090
5091 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5092 /* The next register is not of the same class or its offset is not
5093 mergeable with the current one into a pair. */
5094 if (!satisfies_constraint_Ump (mem)
5095 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5096 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5097 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5098 GET_MODE_SIZE (mode)))
5099 {
5100 insn = emit_insn (set);
5101 RTX_FRAME_RELATED_P (insn) = 1;
5102 if (prologue_p)
5103 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5104 else
5105 add_reg_note (insn, REG_CFA_RESTORE, reg);
5106
5107 regno = regno2;
5108 continue;
5109 }
5110
5111 /* REGNO2 can be saved/restored in a pair with REGNO. */
5112 rtx reg2 = gen_rtx_REG (mode, regno2);
5113 if (!frame_pointer_needed)
5114 offset2 += cfun->machine->frame.frame_size
5115 - cfun->machine->frame.hard_fp_offset;
5116 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5117 rtx mem2 = gen_frame_mem (mode, addr2);
5118 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5119 : gen_rtx_SET (reg2, mem2);
5120
5121 if (prologue_p)
5122 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5123 else
5124 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5125
5126 RTX_FRAME_RELATED_P (insn) = 1;
5127 if (prologue_p)
5128 {
5129 add_reg_note (insn, REG_CFA_OFFSET, set);
5130 add_reg_note (insn, REG_CFA_OFFSET, set2);
5131 }
5132 else
5133 {
5134 add_reg_note (insn, REG_CFA_RESTORE, reg);
5135 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5136 }
5137
5138 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5139 }
5140 }
5141
5142 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5143
5144 static void
5145 aarch64_emit_prologue_components (sbitmap components)
5146 {
5147 aarch64_process_components (components, true);
5148 }
5149
5150 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5151
5152 static void
5153 aarch64_emit_epilogue_components (sbitmap components)
5154 {
5155 aarch64_process_components (components, false);
5156 }
5157
5158 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5159
5160 static void
5161 aarch64_set_handled_components (sbitmap components)
5162 {
5163 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5164 if (bitmap_bit_p (components, regno))
5165 cfun->machine->reg_is_wrapped_separately[regno] = true;
5166 }
5167
5168 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5169 determine the probe offset for alloca. */
5170
5171 static HOST_WIDE_INT
5172 aarch64_stack_clash_protection_alloca_probe_range (void)
5173 {
5174 return STACK_CLASH_CALLER_GUARD;
5175 }
5176
5177
5178 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5179 registers. If POLY_SIZE is not large enough to require a probe this function
5180 will only adjust the stack. When allocating the stack space
5181 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5182 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5183 arguments. If we are then we ensure that any allocation larger than the ABI
5184 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5185 maintained.
5186
5187 We emit barriers after each stack adjustment to prevent optimizations from
5188 breaking the invariant that we never drop the stack more than a page. This
5189 invariant is needed to make it easier to handle asynchronous events
5190 correctly: if we were to allow the stack to be dropped by more than a page
5191 with multiple probes still pending and we took a signal somewhere in
5192 between, the signal handler would not know the state of the stack and
5193 could make no assumptions about which pages have been probed. */
5194
5195 static void
5196 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5197 poly_int64 poly_size,
5198 bool frame_related_p,
5199 bool final_adjustment_p)
5200 {
5201 HOST_WIDE_INT guard_size
5202 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5203 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5204 /* When doing the final adjustment for the outgoing argument size we can't
5205 assume that LR was saved at position 0. So subtract its offset from the
5206 ABI safe buffer so that we don't accidentally allow an adjustment that
5207 would result in an allocation larger than the ABI buffer without
5208 probing. */
5209 HOST_WIDE_INT min_probe_threshold
5210 = final_adjustment_p
5211 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5212 : guard_size - guard_used_by_caller;
5213
5214 poly_int64 frame_size = cfun->machine->frame.frame_size;
5215
5216 /* We should always have a positive probe threshold. */
5217 gcc_assert (min_probe_threshold > 0);
5218
5219 if (flag_stack_clash_protection && !final_adjustment_p)
5220 {
5221 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5222 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5223
5224 if (known_eq (frame_size, 0))
5225 {
5226 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5227 }
5228 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5229 && known_lt (final_adjust, guard_used_by_caller))
5230 {
5231 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5232 }
5233 }
5234
5235 /* If SIZE is not large enough to require probing, just adjust the stack and
5236 exit. */
5237 if (known_lt (poly_size, min_probe_threshold)
5238 || !flag_stack_clash_protection)
5239 {
5240 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5241 return;
5242 }
5243
5244 HOST_WIDE_INT size;
5245 /* Handle the SVE non-constant case first. */
5246 if (!poly_size.is_constant (&size))
5247 {
5248 if (dump_file)
5249 {
5250 fprintf (dump_file, "Stack clash SVE prologue: ");
5251 print_dec (poly_size, dump_file);
5252 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5253 }
5254
5255 /* First calculate the amount of bytes we're actually spilling. */
5256 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5257 poly_size, temp1, temp2, false, true);
5258
5259 rtx_insn *insn = get_last_insn ();
5260
5261 if (frame_related_p)
5262 {
5263 /* This is done to provide unwinding information for the stack
5264 adjustments we're about to do. However, to prevent the optimizers
5265 from removing the R15 move and leaving the CFA note (which would be
5266 very wrong) we tie the old and new stack pointer together.
5267 The tie will expand to nothing but the optimizers will not touch
5268 the instruction. */
5269 rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
5270 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5271 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5272
5273 /* We want the CFA independent of the stack pointer for the
5274 duration of the loop. */
5275 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5276 RTX_FRAME_RELATED_P (insn) = 1;
5277 }
5278
5279 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5280 rtx guard_const = gen_int_mode (guard_size, Pmode);
5281
5282 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5283 stack_pointer_rtx, temp1,
5284 probe_const, guard_const));
5285
5286 /* Now reset the CFA register if needed. */
5287 if (frame_related_p)
5288 {
5289 add_reg_note (insn, REG_CFA_DEF_CFA,
5290 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5291 gen_int_mode (poly_size, Pmode)));
5292 RTX_FRAME_RELATED_P (insn) = 1;
5293 }
5294
5295 return;
5296 }
5297
5298 if (dump_file)
5299 fprintf (dump_file,
5300 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5301 " bytes, probing will be required.\n", size);
5302
5303 /* Round size to the nearest multiple of guard_size, and calculate the
5304 residual as the difference between the original size and the rounded
5305 size. */
5306 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5307 HOST_WIDE_INT residual = size - rounded_size;
5308
5309 /* We can handle a small number of allocations/probes inline. Otherwise
5310 punt to a loop. */
5311 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5312 {
5313 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5314 {
5315 aarch64_sub_sp (NULL, temp2, guard_size, true);
5316 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5317 guard_used_by_caller));
5318 emit_insn (gen_blockage ());
5319 }
5320 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5321 }
5322 else
5323 {
5324 /* Compute the ending address. */
5325 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5326 temp1, NULL, false, true);
5327 rtx_insn *insn = get_last_insn ();
5328
5329 /* For the initial allocation, we don't have a frame pointer
5330 set up, so we always need CFI notes. If we're doing the
5331 final allocation, then we may have a frame pointer, in which
5332 case it is the CFA, otherwise we need CFI notes.
5333
5334 We can determine which allocation we are doing by looking at
5335 the value of FRAME_RELATED_P since the final allocations are not
5336 frame related. */
5337 if (frame_related_p)
5338 {
5339 /* We want the CFA independent of the stack pointer for the
5340 duration of the loop. */
5341 add_reg_note (insn, REG_CFA_DEF_CFA,
5342 plus_constant (Pmode, temp1, rounded_size));
5343 RTX_FRAME_RELATED_P (insn) = 1;
5344 }
5345
5346 /* This allocates and probes the stack. Note that this re-uses some of
5347 the existing Ada stack protection code. However, we are guaranteed not
5348 to enter the non-loop or residual branches of that code.
5349
5350 The non-loop part won't be entered because if our allocation amount
5351 doesn't require a loop, the case above would handle it.
5352
5353 The residual amount won't be entered because TEMP1 is a multiple of
5354 the allocation size. The residual will always be 0. As such, the only
5355 part we are actually using from that code is the loop setup. The
5356 actual probing is done in aarch64_output_probe_stack_range. */
5357 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5358 stack_pointer_rtx, temp1));
5359
5360 /* Now reset the CFA register if needed. */
5361 if (frame_related_p)
5362 {
5363 add_reg_note (insn, REG_CFA_DEF_CFA,
5364 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5365 RTX_FRAME_RELATED_P (insn) = 1;
5366 }
5367
5368 emit_insn (gen_blockage ());
5369 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5370 }
5371
5372 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5373 be probed. This maintains the requirement that each page is probed at
5374 least once. For initial probing we probe only if the allocation is
5375 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5376 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5377 GUARD_SIZE. This means that for any allocation that is large enough to
5378 trigger a probe here, we'll have at least one, and for allocations that are
5379 not large enough for this code to emit anything, the page would have been
5380 probed by the saving of FP/LR, either by this function or any callees. If
5381 we don't have any callees then we won't have more stack adjustments and so
5382 are still safe. */
5383 if (residual)
5384 {
5385 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5386 /* If we're doing final adjustments, and we've done any full page
5387 allocations then any residual needs to be probed. */
5388 if (final_adjustment_p && rounded_size != 0)
5389 min_probe_threshold = 0;
5390 /* If doing a small final adjustment, we always probe at offset 0.
5391 This is done to avoid issues when LR is not at position 0 or when
5392 the final adjustment is smaller than the probing offset. */
5393 else if (final_adjustment_p && rounded_size == 0)
5394 residual_probe_offset = 0;
5395
5396 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5397 if (residual >= min_probe_threshold)
5398 {
5399 if (dump_file)
5400 fprintf (dump_file,
5401 "Stack clash AArch64 prologue residuals: "
5402 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5403 "\n", residual);
5404
5405 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5406 residual_probe_offset));
5407 emit_insn (gen_blockage ());
5408 }
5409 }
5410 }
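/* The ROUND_DOWN/residual split above can be illustrated in isolation.  A
   minimal standalone sketch follows (illustrative only, not used by the
   compiler); the guard size and unroll limit below are example parameters
   rather than the values this target necessarily uses.  */
#if 0
#include <stdio.h>

static void
sketch_probe_plan (long long size, long long guard_size, int max_unroll_pages)
{
  /* Mirrors rounded_size = ROUND_DOWN (size, guard_size) for non-negative
     sizes, and the residual computed from it.  */
  long long rounded_size = (size / guard_size) * guard_size;
  long long residual = size - rounded_size;

  if (rounded_size <= max_unroll_pages * guard_size)
    printf ("inline: %lld probes of %lld bytes each, residual %lld bytes\n",
            rounded_size / guard_size, guard_size, residual);
  else
    printf ("loop: allocate/probe %lld bytes a page at a time, "
            "residual %lld bytes\n", rounded_size, residual);
}

int
main (void)
{
  /* E.g. a 200KB allocation with a 64KB guard: three full pages are probed
     inline and 8KB is left as the residual.  */
  sketch_probe_plan (200 * 1024, 64 * 1024, 4);
  return 0;
}
#endif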
5411
5412 /* Return 1 if the register is used by the epilogue. We need to say the
5413 return register is used, but only after epilogue generation is complete.
5414 Note that in the case of sibcalls, the values "used by the epilogue" are
5415 considered live at the start of the called function.
5416
5417 For SIMD functions we need to return 1 for FP registers that are saved and
5418 restored by a function but are not zero in call_used_regs. If we do not do
5419 this, optimizations may remove the restore of the register. */
5420
5421 int
5422 aarch64_epilogue_uses (int regno)
5423 {
5424 if (epilogue_completed)
5425 {
5426 if (regno == LR_REGNUM)
5427 return 1;
5428 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5429 return 1;
5430 }
5431 return 0;
5432 }
5433
5434 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5435 is saved at BASE + OFFSET. */
5436
5437 static void
5438 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5439 rtx base, poly_int64 offset)
5440 {
5441 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5442 add_reg_note (insn, REG_CFA_EXPRESSION,
5443 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5444 }
5445
5446 /* AArch64 stack frames generated by this compiler look like:
5447
5448 +-------------------------------+
5449 | |
5450 | incoming stack arguments |
5451 | |
5452 +-------------------------------+
5453 | | <-- incoming stack pointer (aligned)
5454 | callee-allocated save area |
5455 | for register varargs |
5456 | |
5457 +-------------------------------+
5458 | local variables | <-- frame_pointer_rtx
5459 | |
5460 +-------------------------------+
5461 | padding | \
5462 +-------------------------------+ |
5463 | callee-saved registers | | frame.saved_regs_size
5464 +-------------------------------+ |
5465 | LR' | |
5466 +-------------------------------+ |
5467 | FP' | / <- hard_frame_pointer_rtx (aligned)
5468 +-------------------------------+
5469 | dynamic allocation |
5470 +-------------------------------+
5471 | padding |
5472 +-------------------------------+
5473 | outgoing stack arguments | <-- arg_pointer
5474 | |
5475 +-------------------------------+
5476 | | <-- stack_pointer_rtx (aligned)
5477
5478 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5479 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5480 unchanged.
5481
5482 By default for stack-clash we assume the guard is at least 64KB, but this
5483 value is configurable to either 4KB or 64KB. We also force the guard size to
5484 be the same as the probing interval and both values are kept in sync.
5485
5486 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5487 on the guard size) of stack space without probing.
5488
5489 When probing is needed, we emit a probe at the start of the prologue
5490 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5491
5492 We have to track how much space has been allocated; the only stores to
5493 the stack that we track as implicit probes are the FP/LR stores.
5494
5495 For outgoing arguments we probe if the size is larger than 1KB, such that
5496 the ABI-specified buffer is maintained for the next callee. */
5497
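/* As a concrete example of the numbers above, assuming the default 64KB
   guard: with a 1KB caller-reserved buffer, a callee may allocate up to
   64KB - 1KB = 63KB without probing, and any outgoing argument area larger
   than 1KB is probed so that the 1KB buffer still holds for its own callees.
   With a 4KB guard the same arithmetic gives 4KB - 1KB = 3KB.  */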
5498 /* Generate the prologue instructions for entry into a function.
5499 Establish the stack frame by decreasing the stack pointer with a
5500 properly calculated size and, if necessary, create a frame record
5501 filled with the values of LR and previous frame pointer. The
5502 current FP is also set up if it is in use. */
5503
5504 void
5505 aarch64_expand_prologue (void)
5506 {
5507 poly_int64 frame_size = cfun->machine->frame.frame_size;
5508 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5509 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5510 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5511 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5512 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5513 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5514 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5515 rtx_insn *insn;
5516
5517 /* Sign return address for functions. */
5518 if (aarch64_return_address_signing_enabled ())
5519 {
5520 insn = emit_insn (gen_pacisp ());
5521 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5522 RTX_FRAME_RELATED_P (insn) = 1;
5523 }
5524
5525 if (flag_stack_usage_info)
5526 current_function_static_stack_size = constant_lower_bound (frame_size);
5527
5528 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5529 {
5530 if (crtl->is_leaf && !cfun->calls_alloca)
5531 {
5532 if (maybe_gt (frame_size, PROBE_INTERVAL)
5533 && maybe_gt (frame_size, get_stack_check_protect ()))
5534 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5535 (frame_size
5536 - get_stack_check_protect ()));
5537 }
5538 else if (maybe_gt (frame_size, 0))
5539 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5540 }
5541
5542 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5543 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5544
5545 /* In theory we should never have both an initial adjustment
5546 and a callee save adjustment. Verify that is the case since the
5547 code below does not handle it for -fstack-clash-protection. */
5548 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5549
5550 /* Will only probe if the initial adjustment is larger than the guard
5551 less the amount of the guard reserved for use by the caller's
5552 outgoing args. */
5553 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5554 true, false);
5555
5556 if (callee_adjust != 0)
5557 aarch64_push_regs (reg1, reg2, callee_adjust);
5558
5559 if (emit_frame_chain)
5560 {
5561 poly_int64 reg_offset = callee_adjust;
5562 if (callee_adjust == 0)
5563 {
5564 reg1 = R29_REGNUM;
5565 reg2 = R30_REGNUM;
5566 reg_offset = callee_offset;
5567 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5568 }
5569 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5570 stack_pointer_rtx, callee_offset,
5571 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5572 if (frame_pointer_needed && !frame_size.is_constant ())
5573 {
5574 /* Variable-sized frames need to describe the save slot
5575 address using DW_CFA_expression rather than DW_CFA_offset.
5576 This means that, without taking further action, the
5577 locations of the registers that we've already saved would
5578 remain based on the stack pointer even after we redefine
5579 the CFA based on the frame pointer. We therefore need new
5580 DW_CFA_expressions to re-express the save slots with addresses
5581 based on the frame pointer. */
5582 rtx_insn *insn = get_last_insn ();
5583 gcc_assert (RTX_FRAME_RELATED_P (insn));
5584
5585 /* Add an explicit CFA definition if this was previously
5586 implicit. */
5587 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5588 {
5589 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5590 callee_offset);
5591 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5592 gen_rtx_SET (hard_frame_pointer_rtx, src));
5593 }
5594
5595 /* Change the save slot expressions for the registers that
5596 we've already saved. */
5597 reg_offset -= callee_offset;
5598 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5599 reg_offset + UNITS_PER_WORD);
5600 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5601 reg_offset);
5602 }
5603 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5604 }
5605
5606 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5607 callee_adjust != 0 || emit_frame_chain);
5608 if (aarch64_simd_decl_p (cfun->decl))
5609 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5610 callee_adjust != 0 || emit_frame_chain);
5611 else
5612 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5613 callee_adjust != 0 || emit_frame_chain);
5614
5615 /* We may need to probe the final adjustment if it is larger than the guard
5616 that is assumed by the callee. */
5617 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5618 !frame_pointer_needed, true);
5619 }
5620
5621 /* Return TRUE if we can use a simple_return insn.
5622
5623 This function checks whether the callee saved stack is empty, which
5624 means no restore actions are needed. The pro_and_epilogue pass will use
5625 this to check whether the shrink-wrapping optimization is feasible. */
5626
5627 bool
5628 aarch64_use_return_insn_p (void)
5629 {
5630 if (!reload_completed)
5631 return false;
5632
5633 if (crtl->profile)
5634 return false;
5635
5636 return known_eq (cfun->machine->frame.frame_size, 0);
5637 }
5638
5639 /* Return false for non-leaf SIMD functions in order to avoid
5640 shrink-wrapping them, since shrink-wrapping would lose the necessary
5641 save/restore of the FP registers. */
5642
5643 bool
5644 aarch64_use_simple_return_insn_p (void)
5645 {
5646 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5647 return false;
5648
5649 return true;
5650 }
5651
5652 /* Generate the epilogue instructions for returning from a function.
5653 This is almost exactly the reverse of the prolog sequence, except
5654 that we need to insert barriers to avoid scheduling loads that read
5655 from a deallocated stack, and we optimize the unwind records by
5656 emitting them all together if possible. */
5657 void
5658 aarch64_expand_epilogue (bool for_sibcall)
5659 {
5660 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5661 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5662 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5663 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5664 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5665 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5666 rtx cfi_ops = NULL;
5667 rtx_insn *insn;
5668 /* A stack clash protection prologue may not have left EP0_REGNUM or
5669 EP1_REGNUM in a usable state. The same is true for allocations
5670 with an SVE component, since we then need both temporary registers
5671 for each allocation. For stack clash we are in a usable state if
5672 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5673 HOST_WIDE_INT guard_size
5674 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5675 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5676
5677 /* We can re-use the registers when the allocation amount is smaller than
5678 guard_size - guard_used_by_caller because we won't be doing any probes
5679 then. In such situations the register should remain live with the correct
5680 value. */
5681 bool can_inherit_p = (initial_adjust.is_constant ()
5682 && final_adjust.is_constant ())
5683 && (!flag_stack_clash_protection
5684 || known_lt (initial_adjust,
5685 guard_size - guard_used_by_caller));
5686
5687 /* We need a memory barrier to prevent reads from the deallocated stack. */
5688 bool need_barrier_p
5689 = maybe_ne (get_frame_size ()
5690 + cfun->machine->frame.saved_varargs_size, 0);
5691
5692 /* Emit a barrier to prevent loads from a deallocated stack. */
5693 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5694 || cfun->calls_alloca
5695 || crtl->calls_eh_return)
5696 {
5697 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5698 need_barrier_p = false;
5699 }
5700
5701 /* Restore the stack pointer from the frame pointer if it may not
5702 be the same as the stack pointer. */
5703 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5704 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5705 if (frame_pointer_needed
5706 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5707 /* If writeback is used when restoring callee-saves, the CFA
5708 is restored on the instruction doing the writeback. */
5709 aarch64_add_offset (Pmode, stack_pointer_rtx,
5710 hard_frame_pointer_rtx, -callee_offset,
5711 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5712 else
5713 /* The case where we need to re-use the register here is very rare, so
5714 avoid the complicated condition and just always emit a move if the
5715 immediate doesn't fit. */
5716 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5717
5718 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5719 callee_adjust != 0, &cfi_ops);
5720 if (aarch64_simd_decl_p (cfun->decl))
5721 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5722 callee_adjust != 0, &cfi_ops);
5723 else
5724 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5725 callee_adjust != 0, &cfi_ops);
5726
5727 if (need_barrier_p)
5728 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5729
5730 if (callee_adjust != 0)
5731 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5732
5733 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5734 {
5735 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5736 insn = get_last_insn ();
5737 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5738 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5739 RTX_FRAME_RELATED_P (insn) = 1;
5740 cfi_ops = NULL;
5741 }
5742
5743 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5744 restrict the emit_move optimization to leaf functions. */
5745 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5746 (!can_inherit_p || !crtl->is_leaf
5747 || df_regs_ever_live_p (EP0_REGNUM)));
5748
5749 if (cfi_ops)
5750 {
5751 /* Emit delayed restores and reset the CFA to be SP. */
5752 insn = get_last_insn ();
5753 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5754 REG_NOTES (insn) = cfi_ops;
5755 RTX_FRAME_RELATED_P (insn) = 1;
5756 }
5757
5758 /* We prefer to emit the combined return/authenticate instruction RETAA,
5759 however there are three cases in which we must instead emit an explicit
5760 authentication instruction.
5761
5762 1) Sibcalls don't return in a normal way, so if we're about to call one
5763 we must authenticate.
5764
5765 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5766 generating code for !TARGET_ARMV8_3 we can't use it and must
5767 explicitly authenticate.
5768
5769 3) On an eh_return path we make extra stack adjustments to update the
5770 canonical frame address to be the exception handler's CFA. We want
5771 to authenticate using the CFA of the function which calls eh_return.
5772 */
5773 if (aarch64_return_address_signing_enabled ()
5774 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5775 {
5776 insn = emit_insn (gen_autisp ());
5777 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5778 RTX_FRAME_RELATED_P (insn) = 1;
5779 }
5780
5781 /* Stack adjustment for exception handler. */
5782 if (crtl->calls_eh_return)
5783 {
5784 /* We need to unwind the stack by the offset computed by
5785 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5786 to be SP; letting the CFA move during this adjustment
5787 is just as correct as retaining the CFA from the body
5788 of the function. Therefore, do nothing special. */
5789 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5790 }
5791
5792 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5793 if (!for_sibcall)
5794 emit_jump_insn (ret_rtx);
5795 }
5796
5797 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5798 normally or return to a previous frame after unwinding.
5799
5800 An EH return uses a single shared return sequence. The epilogue is
5801 exactly like a normal epilogue except that it has an extra input
5802 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5803 that must be applied after the frame has been destroyed. An extra label
5804 is inserted before the epilogue which initializes this register to zero,
5805 and this is the entry point for a normal return.
5806
5807 An actual EH return updates the return address, initializes the stack
5808 adjustment and jumps directly into the epilogue (bypassing the zeroing
5809 of the adjustment). Since the return address is typically saved on the
5810 stack when a function makes a call, the saved LR must be updated outside
5811 the epilogue.
5812
5813 This poses problems as the store is generated well before the epilogue,
5814 so the offset of LR is not known yet. Also optimizations will remove the
5815 store as it appears dead, even after the epilogue is generated (as the
5816 base or offset for loading LR is different in many cases).
5817
5818 To avoid these problems this implementation forces the frame pointer
5819 in eh_return functions so that the location of LR is fixed and known early.
5820 It also marks the store volatile, so no optimization is permitted to
5821 remove the store. */
5822 rtx
5823 aarch64_eh_return_handler_rtx (void)
5824 {
5825 rtx tmp = gen_frame_mem (Pmode,
5826 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5827
5828 /* Mark the store volatile, so no optimization is permitted to remove it. */
5829 MEM_VOLATILE_P (tmp) = true;
5830 return tmp;
5831 }
5832
5833 /* Output code to add DELTA to the first argument, and then jump
5834 to FUNCTION. Used for C++ multiple inheritance. */
5835 static void
5836 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5837 HOST_WIDE_INT delta,
5838 HOST_WIDE_INT vcall_offset,
5839 tree function)
5840 {
5841 /* The this pointer is always in x0. Note that this differs from
5842 Arm where the this pointer may be bumped to r1 if r0 is required
5843 to return a pointer to an aggregate. On AArch64 a result value
5844 pointer will be in x8. */
5845 int this_regno = R0_REGNUM;
5846 rtx this_rtx, temp0, temp1, addr, funexp;
5847 rtx_insn *insn;
5848
5849 reload_completed = 1;
5850 emit_note (NOTE_INSN_PROLOGUE_END);
5851
5852 this_rtx = gen_rtx_REG (Pmode, this_regno);
5853 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5854 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5855
5856 if (vcall_offset == 0)
5857 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5858 else
5859 {
5860 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5861
5862 addr = this_rtx;
5863 if (delta != 0)
5864 {
5865 if (delta >= -256 && delta < 256)
5866 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5867 plus_constant (Pmode, this_rtx, delta));
5868 else
5869 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5870 temp1, temp0, false);
5871 }
5872
5873 if (Pmode == ptr_mode)
5874 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5875 else
5876 aarch64_emit_move (temp0,
5877 gen_rtx_ZERO_EXTEND (Pmode,
5878 gen_rtx_MEM (ptr_mode, addr)));
5879
5880 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5881 addr = plus_constant (Pmode, temp0, vcall_offset);
5882 else
5883 {
5884 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5885 Pmode);
5886 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5887 }
5888
5889 if (Pmode == ptr_mode)
5890 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5891 else
5892 aarch64_emit_move (temp1,
5893 gen_rtx_SIGN_EXTEND (Pmode,
5894 gen_rtx_MEM (ptr_mode, addr)));
5895
5896 emit_insn (gen_add2_insn (this_rtx, temp1));
5897 }
5898
5899 /* Generate a tail call to the target function. */
5900 if (!TREE_USED (function))
5901 {
5902 assemble_external (function);
5903 TREE_USED (function) = 1;
5904 }
5905 funexp = XEXP (DECL_RTL (function), 0);
5906 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5907 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5908 SIBLING_CALL_P (insn) = 1;
5909
5910 insn = get_insns ();
5911 shorten_branches (insn);
5912 final_start_function (insn, file, 1);
5913 final (insn, file, 1);
5914 final_end_function ();
5915
5916 /* Stop pretending to be a post-reload pass. */
5917 reload_completed = 0;
5918 }
5919
5920 static bool
5921 aarch64_tls_referenced_p (rtx x)
5922 {
5923 if (!TARGET_HAVE_TLS)
5924 return false;
5925 subrtx_iterator::array_type array;
5926 FOR_EACH_SUBRTX (iter, array, x, ALL)
5927 {
5928 const_rtx x = *iter;
5929 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5930 return true;
5931 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5932 TLS offsets, not real symbol references. */
5933 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5934 iter.skip_subrtxes ();
5935 }
5936 return false;
5937 }
5938
5939
5940 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5941 a left shift of 0 or 12 bits. */
5942 bool
5943 aarch64_uimm12_shift (HOST_WIDE_INT val)
5944 {
5945 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5946 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5947 );
5948 }
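/* For example, 0xfff and 0xabc000 both satisfy the check above (shift 0 and
   shift 12 respectively), whereas 0x1abc does not, because it has bits set
   in both halves of the 24-bit window.  */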
5949
5950 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5951 that can be created with a left shift of 0 or 12. */
5952 static HOST_WIDE_INT
5953 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
5954 {
5955 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5956 handle correctly. */
5957 gcc_assert ((val & 0xffffff) == val);
5958
5959 if (((val & 0xfff) << 0) == val)
5960 return val;
5961
5962 return val & (0xfff << 12);
5963 }
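/* For example, clamping 0x123456 returns 0x123000: the low 12 bits alone
   (0x456) do not reproduce the value, so only the bits that a 12-bit
   immediate shifted left by 12 can represent are kept.  */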
5964
5965 /* Return true if val is an immediate that can be loaded into a
5966 register by a MOVZ instruction. */
5967 static bool
5968 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5969 {
5970 if (GET_MODE_SIZE (mode) > 4)
5971 {
5972 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5973 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5974 return 1;
5975 }
5976 else
5977 {
5978 /* Ignore sign extension. */
5979 val &= (HOST_WIDE_INT) 0xffffffff;
5980 }
5981 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5982 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5983 }
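/* For example, 0xabcd0000 can be loaded with a single MOVZ (one 16-bit chunk
   shifted left by 16), whereas 0xabcd1234 cannot, because it has non-zero
   bits in more than one 16-bit chunk.  */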
5984
5985 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5986 64-bit (DImode) integer. */
5987
5988 static unsigned HOST_WIDE_INT
5989 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5990 {
5991 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5992 while (size < 64)
5993 {
5994 val &= (HOST_WIDE_INT_1U << size) - 1;
5995 val |= val << size;
5996 size *= 2;
5997 }
5998 return val;
5999 }
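/* A minimal standalone sketch of the replication loop above, taking the
   element width in bits directly instead of a machine_mode (illustrative
   only, not used by the compiler).  */
#if 0
#include <stdio.h>

static unsigned long long
sketch_replicate (unsigned long long val, unsigned int bits)
{
  while (bits < 64)
    {
      val &= (1ULL << bits) - 1;
      val |= val << bits;
      bits *= 2;
    }
  return val;
}

int
main (void)
{
  /* An 8-bit 0xab is replicated to 0xabababababababab.  */
  printf ("%llx\n", sketch_replicate (0xab, 8));
  return 0;
}
#endif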
6000
6001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6002
6003 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6004 {
6005 0x0000000100000001ull,
6006 0x0001000100010001ull,
6007 0x0101010101010101ull,
6008 0x1111111111111111ull,
6009 0x5555555555555555ull,
6010 };
6011
6012
6013 /* Return true if val is a valid bitmask immediate. */
6014
6015 bool
6016 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6017 {
6018 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6019 int bits;
6020
6021 /* Check for a single sequence of one bits and return quickly if so.
6022 The special cases of all ones and all zeroes return false. */
6023 val = aarch64_replicate_bitmask_imm (val_in, mode);
6024 tmp = val + (val & -val);
6025
6026 if (tmp == (tmp & -tmp))
6027 return (val + 1) > 1;
6028
6029 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6030 if (mode == SImode)
6031 val = (val << 32) | (val & 0xffffffff);
6032
6033 /* Invert if the immediate doesn't start with a zero bit - this means we
6034 only need to search for sequences of one bits. */
6035 if (val & 1)
6036 val = ~val;
6037
6038 /* Find the first set bit and set tmp to val with the first sequence of one
6039 bits removed. Return success if there is a single sequence of ones. */
6040 first_one = val & -val;
6041 tmp = val & (val + first_one);
6042
6043 if (tmp == 0)
6044 return true;
6045
6046 /* Find the next set bit and compute the difference in bit position. */
6047 next_one = tmp & -tmp;
6048 bits = clz_hwi (first_one) - clz_hwi (next_one);
6049 mask = val ^ tmp;
6050
6051 /* Check the bit position difference is a power of 2, and that the first
6052 sequence of one bits fits within 'bits' bits. */
6053 if ((mask >> bits) != 0 || bits != (bits & -bits))
6054 return false;
6055
6056 /* Check the sequence of one bits is repeated 64/bits times. */
6057 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6058 }
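/* As a worked example of the steps above: for 0x0f0f0f0f0f0f0f0f in DImode,
   tmp = val + (val & -val) ends in ...0f10 and is not a power of two, so the
   quick single-run test fails.  The value starts with a one bit, so it is
   inverted to 0xf0f0f0f0f0f0f0f0.  first_one is then 0x10 and the next set
   bit after the first run is 0x1000, giving bits = 8 and mask = 0xf0.  The
   final check multiplies 0xf0 by 0x0101010101010101 (the width-8 entry of
   bitmask_imm_mul) and gets back 0xf0f0f0f0f0f0f0f0, so the immediate is
   accepted.  */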
6059
6060 /* Create a mask of ones covering the range from the lowest to the highest
6061 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6062
6063 unsigned HOST_WIDE_INT
6064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6065 {
6066 int lowest_bit_set = ctz_hwi (val_in);
6067 int highest_bit_set = floor_log2 (val_in);
6068 gcc_assert (val_in != 0);
6069
6070 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6071 (HOST_WIDE_INT_1U << lowest_bit_set));
6072 }
6073
6074 /* Create a constant in which all bits outside the range from the lowest set
6075 bit to the highest set bit of VAL_IN are set to 1. */
6076
6077 unsigned HOST_WIDE_INT
6078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6079 {
6080 return val_in | ~aarch64_and_split_imm1 (val_in);
6081 }
6082
6083 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6084
6085 bool
6086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6087 {
6088 scalar_int_mode int_mode;
6089 if (!is_a <scalar_int_mode> (mode, &int_mode))
6090 return false;
6091
6092 if (aarch64_bitmask_imm (val_in, int_mode))
6093 return false;
6094
6095 if (aarch64_move_imm (val_in, int_mode))
6096 return false;
6097
6098 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6099
6100 return aarch64_bitmask_imm (imm2, int_mode);
6101 }
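/* As a worked example: for a DImode AND with 0xff00ff, which is neither a
   64-bit bitmask immediate nor a MOV immediate, aarch64_and_split_imm1
   returns 0xffffff (lowest set bit 0, highest set bit 23) and
   aarch64_and_split_imm2 returns 0xffffffffffff00ff.  ANDing with those two
   masks in sequence is equivalent to ANDing with 0xff00ff, and each of them
   is a valid bitmask immediate, so the function above accepts the split.  */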
6102
6103 /* Return true if val is an immediate that can be loaded into a
6104 register in a single instruction. */
6105 bool
6106 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6107 {
6108 scalar_int_mode int_mode;
6109 if (!is_a <scalar_int_mode> (mode, &int_mode))
6110 return false;
6111
6112 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6113 return 1;
6114 return aarch64_bitmask_imm (val, int_mode);
6115 }
6116
6117 static bool
6118 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6119 {
6120 rtx base, offset;
6121
6122 if (GET_CODE (x) == HIGH)
6123 return true;
6124
6125 /* There's no way to calculate VL-based values using relocations. */
6126 subrtx_iterator::array_type array;
6127 FOR_EACH_SUBRTX (iter, array, x, ALL)
6128 if (GET_CODE (*iter) == CONST_POLY_INT)
6129 return true;
6130
6131 split_const (x, &base, &offset);
6132 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6133 {
6134 if (aarch64_classify_symbol (base, INTVAL (offset))
6135 != SYMBOL_FORCE_TO_MEM)
6136 return true;
6137 else
6138 /* Avoid generating a 64-bit relocation in ILP32; leave
6139 to aarch64_expand_mov_immediate to handle it properly. */
6140 return mode != ptr_mode;
6141 }
6142
6143 return aarch64_tls_referenced_p (x);
6144 }
6145
6146 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6147 The expansion for a table switch is quite expensive due to the number
6148 of instructions, the table lookup and the hard-to-predict indirect jump.
6149 When optimizing for speed at -O3 or higher, use the per-core tuning if
6150 set, otherwise use tables for more than 16 cases as a tradeoff between
6151 size and performance. When optimizing for size, use the default setting. */
6152
6153 static unsigned int
6154 aarch64_case_values_threshold (void)
6155 {
6156 /* Use the specified limit for the number of cases before using jump
6157 tables at higher optimization levels. */
6158 if (optimize > 2
6159 && selected_cpu->tune->max_case_values != 0)
6160 return selected_cpu->tune->max_case_values;
6161 else
6162 return optimize_size ? default_case_values_threshold () : 17;
6163 }
6164
6165 /* Return true if register REGNO is a valid index register.
6166 STRICT_P is true if REG_OK_STRICT is in effect. */
6167
6168 bool
6169 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6170 {
6171 if (!HARD_REGISTER_NUM_P (regno))
6172 {
6173 if (!strict_p)
6174 return true;
6175
6176 if (!reg_renumber)
6177 return false;
6178
6179 regno = reg_renumber[regno];
6180 }
6181 return GP_REGNUM_P (regno);
6182 }
6183
6184 /* Return true if register REGNO is a valid base register.
6185 STRICT_P is true if REG_OK_STRICT is in effect. */
6186
6187 bool
6188 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6189 {
6190 if (!HARD_REGISTER_NUM_P (regno))
6191 {
6192 if (!strict_p)
6193 return true;
6194
6195 if (!reg_renumber)
6196 return false;
6197
6198 regno = reg_renumber[regno];
6199 }
6200
6201 /* The fake registers will be eliminated to either the stack or
6202 hard frame pointer, both of which are usually valid base registers.
6203 Reload deals with the cases where the eliminated form isn't valid. */
6204 return (GP_REGNUM_P (regno)
6205 || regno == SP_REGNUM
6206 || regno == FRAME_POINTER_REGNUM
6207 || regno == ARG_POINTER_REGNUM);
6208 }
6209
6210 /* Return true if X is a valid base register.
6211 STRICT_P is true if REG_OK_STRICT is in effect. */
6212
6213 static bool
6214 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6215 {
6216 if (!strict_p
6217 && GET_CODE (x) == SUBREG
6218 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6219 x = SUBREG_REG (x);
6220
6221 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6222 }
6223
6224 /* Return true if address offset is a valid index. If it is, fill in INFO
6225 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6226
6227 static bool
6228 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6229 machine_mode mode, bool strict_p)
6230 {
6231 enum aarch64_address_type type;
6232 rtx index;
6233 int shift;
6234
6235 /* (reg:P) */
6236 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6237 && GET_MODE (x) == Pmode)
6238 {
6239 type = ADDRESS_REG_REG;
6240 index = x;
6241 shift = 0;
6242 }
6243 /* (sign_extend:DI (reg:SI)) */
6244 else if ((GET_CODE (x) == SIGN_EXTEND
6245 || GET_CODE (x) == ZERO_EXTEND)
6246 && GET_MODE (x) == DImode
6247 && GET_MODE (XEXP (x, 0)) == SImode)
6248 {
6249 type = (GET_CODE (x) == SIGN_EXTEND)
6250 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6251 index = XEXP (x, 0);
6252 shift = 0;
6253 }
6254 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6255 else if (GET_CODE (x) == MULT
6256 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6257 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6258 && GET_MODE (XEXP (x, 0)) == DImode
6259 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6260 && CONST_INT_P (XEXP (x, 1)))
6261 {
6262 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6263 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6264 index = XEXP (XEXP (x, 0), 0);
6265 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6266 }
6267 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6268 else if (GET_CODE (x) == ASHIFT
6269 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6270 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6271 && GET_MODE (XEXP (x, 0)) == DImode
6272 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6273 && CONST_INT_P (XEXP (x, 1)))
6274 {
6275 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6276 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6277 index = XEXP (XEXP (x, 0), 0);
6278 shift = INTVAL (XEXP (x, 1));
6279 }
6280 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6281 else if ((GET_CODE (x) == SIGN_EXTRACT
6282 || GET_CODE (x) == ZERO_EXTRACT)
6283 && GET_MODE (x) == DImode
6284 && GET_CODE (XEXP (x, 0)) == MULT
6285 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6286 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6287 {
6288 type = (GET_CODE (x) == SIGN_EXTRACT)
6289 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6290 index = XEXP (XEXP (x, 0), 0);
6291 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6292 if (INTVAL (XEXP (x, 1)) != 32 + shift
6293 || INTVAL (XEXP (x, 2)) != 0)
6294 shift = -1;
6295 }
6296 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6297 (const_int 0xffffffff<<shift)) */
6298 else if (GET_CODE (x) == AND
6299 && GET_MODE (x) == DImode
6300 && GET_CODE (XEXP (x, 0)) == MULT
6301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6302 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6303 && CONST_INT_P (XEXP (x, 1)))
6304 {
6305 type = ADDRESS_REG_UXTW;
6306 index = XEXP (XEXP (x, 0), 0);
6307 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6308 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6309 shift = -1;
6310 }
6311 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6312 else if ((GET_CODE (x) == SIGN_EXTRACT
6313 || GET_CODE (x) == ZERO_EXTRACT)
6314 && GET_MODE (x) == DImode
6315 && GET_CODE (XEXP (x, 0)) == ASHIFT
6316 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6317 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6318 {
6319 type = (GET_CODE (x) == SIGN_EXTRACT)
6320 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6321 index = XEXP (XEXP (x, 0), 0);
6322 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6323 if (INTVAL (XEXP (x, 1)) != 32 + shift
6324 || INTVAL (XEXP (x, 2)) != 0)
6325 shift = -1;
6326 }
6327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6328 (const_int 0xffffffff<<shift)) */
6329 else if (GET_CODE (x) == AND
6330 && GET_MODE (x) == DImode
6331 && GET_CODE (XEXP (x, 0)) == ASHIFT
6332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6333 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6334 && CONST_INT_P (XEXP (x, 1)))
6335 {
6336 type = ADDRESS_REG_UXTW;
6337 index = XEXP (XEXP (x, 0), 0);
6338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6339 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6340 shift = -1;
6341 }
6342 /* (mult:P (reg:P) (const_int scale)) */
6343 else if (GET_CODE (x) == MULT
6344 && GET_MODE (x) == Pmode
6345 && GET_MODE (XEXP (x, 0)) == Pmode
6346 && CONST_INT_P (XEXP (x, 1)))
6347 {
6348 type = ADDRESS_REG_REG;
6349 index = XEXP (x, 0);
6350 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6351 }
6352 /* (ashift:P (reg:P) (const_int shift)) */
6353 else if (GET_CODE (x) == ASHIFT
6354 && GET_MODE (x) == Pmode
6355 && GET_MODE (XEXP (x, 0)) == Pmode
6356 && CONST_INT_P (XEXP (x, 1)))
6357 {
6358 type = ADDRESS_REG_REG;
6359 index = XEXP (x, 0);
6360 shift = INTVAL (XEXP (x, 1));
6361 }
6362 else
6363 return false;
6364
6365 if (!strict_p
6366 && GET_CODE (index) == SUBREG
6367 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6368 index = SUBREG_REG (index);
6369
6370 if (aarch64_sve_data_mode_p (mode))
6371 {
6372 if (type != ADDRESS_REG_REG
6373 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6374 return false;
6375 }
6376 else
6377 {
6378 if (shift != 0
6379 && !(IN_RANGE (shift, 1, 3)
6380 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6381 return false;
6382 }
6383
6384 if (REG_P (index)
6385 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6386 {
6387 info->type = type;
6388 info->offset = index;
6389 info->shift = shift;
6390 return true;
6391 }
6392
6393 return false;
6394 }
6395
6396 /* Return true if MODE is one of the modes for which we
6397 support LDP/STP operations. */
6398
6399 static bool
6400 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6401 {
6402 return mode == SImode || mode == DImode
6403 || mode == SFmode || mode == DFmode
6404 || (aarch64_vector_mode_supported_p (mode)
6405 && (known_eq (GET_MODE_SIZE (mode), 8)
6406 || (known_eq (GET_MODE_SIZE (mode), 16)
6407 && (aarch64_tune_params.extra_tuning_flags
6408 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6409 }
6410
6411 /* Return true if REGNO is a virtual pointer register, or an eliminable
6412 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6413 include stack_pointer or hard_frame_pointer. */
6414 static bool
6415 virt_or_elim_regno_p (unsigned regno)
6416 {
6417 return ((regno >= FIRST_VIRTUAL_REGISTER
6418 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6419 || regno == FRAME_POINTER_REGNUM
6420 || regno == ARG_POINTER_REGNUM);
6421 }
6422
6423 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6424 If it is, fill in INFO appropriately. STRICT_P is true if
6425 REG_OK_STRICT is in effect. */
6426
6427 bool
6428 aarch64_classify_address (struct aarch64_address_info *info,
6429 rtx x, machine_mode mode, bool strict_p,
6430 aarch64_addr_query_type type)
6431 {
6432 enum rtx_code code = GET_CODE (x);
6433 rtx op0, op1;
6434 poly_int64 offset;
6435
6436 HOST_WIDE_INT const_size;
6437
6438 /* On BE, we use load/store pair for all large int mode load/stores.
6439 TI/TFmode may also use a load/store pair. */
6440 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6441 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6442 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6443 || type == ADDR_QUERY_LDP_STP_N
6444 || mode == TImode
6445 || mode == TFmode
6446 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6447
6448 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
6449 to the actual size of the memory being loaded/stored and the size used by
6450 the corresponding addressing mode is half of that. */
6451 if (type == ADDR_QUERY_LDP_STP_N
6452 && known_eq (GET_MODE_SIZE (mode), 16))
6453 mode = DFmode;
6454
6455 bool allow_reg_index_p = (!load_store_pair_p
6456 && (known_lt (GET_MODE_SIZE (mode), 16)
6457 || vec_flags == VEC_ADVSIMD
6458 || vec_flags == VEC_SVE_DATA));
6459
6460 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6461 [Rn, #offset, MUL VL]. */
6462 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6463 && (code != REG && code != PLUS))
6464 return false;
6465
6466 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6467 REG addressing. */
6468 if (advsimd_struct_p
6469 && !BYTES_BIG_ENDIAN
6470 && (code != POST_INC && code != REG))
6471 return false;
6472
6473 gcc_checking_assert (GET_MODE (x) == VOIDmode
6474 || SCALAR_INT_MODE_P (GET_MODE (x)));
6475
6476 switch (code)
6477 {
6478 case REG:
6479 case SUBREG:
6480 info->type = ADDRESS_REG_IMM;
6481 info->base = x;
6482 info->offset = const0_rtx;
6483 info->const_offset = 0;
6484 return aarch64_base_register_rtx_p (x, strict_p);
6485
6486 case PLUS:
6487 op0 = XEXP (x, 0);
6488 op1 = XEXP (x, 1);
6489
6490 if (! strict_p
6491 && REG_P (op0)
6492 && virt_or_elim_regno_p (REGNO (op0))
6493 && poly_int_rtx_p (op1, &offset))
6494 {
6495 info->type = ADDRESS_REG_IMM;
6496 info->base = op0;
6497 info->offset = op1;
6498 info->const_offset = offset;
6499
6500 return true;
6501 }
6502
6503 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6504 && aarch64_base_register_rtx_p (op0, strict_p)
6505 && poly_int_rtx_p (op1, &offset))
6506 {
6507 info->type = ADDRESS_REG_IMM;
6508 info->base = op0;
6509 info->offset = op1;
6510 info->const_offset = offset;
6511
6512 /* TImode and TFmode values are allowed in both pairs of X
6513 registers and individual Q registers. The available
6514 address modes are:
6515 X,X: 7-bit signed scaled offset
6516 Q: 9-bit signed offset
6517 We conservatively require an offset representable in either mode.
6518 When performing the check for pairs of X registers i.e. LDP/STP
6519 pass down DImode since that is the natural size of the LDP/STP
6520 instruction memory accesses. */
6521 if (mode == TImode || mode == TFmode)
6522 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6523 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6524 || offset_12bit_unsigned_scaled_p (mode, offset)));
6525
6526 /* A 7-bit offset check because OImode will emit an ldp/stp
6527 instruction (only big endian will get here).
6528 For ldp/stp instructions, the offset is scaled for the size of a
6529 single element of the pair. */
6530 if (mode == OImode)
6531 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6532
6533 /* Three 9/12-bit offset checks because CImode will emit three
6534 ldr/str instructions (only big endian will get here). */
6535 if (mode == CImode)
6536 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6537 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6538 offset + 32)
6539 || offset_12bit_unsigned_scaled_p (V16QImode,
6540 offset + 32)));
6541
6542 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6543 instructions (only big endian will get here). */
6544 if (mode == XImode)
6545 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6546 && aarch64_offset_7bit_signed_scaled_p (TImode,
6547 offset + 32));
6548
6549 /* Make "m" use the LD1 offset range for SVE data modes, so
6550 that pre-RTL optimizers like ivopts will work to that
6551 instead of the wider LDR/STR range. */
6552 if (vec_flags == VEC_SVE_DATA)
6553 return (type == ADDR_QUERY_M
6554 ? offset_4bit_signed_scaled_p (mode, offset)
6555 : offset_9bit_signed_scaled_p (mode, offset));
6556
6557 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6558 {
6559 poly_int64 end_offset = (offset
6560 + GET_MODE_SIZE (mode)
6561 - BYTES_PER_SVE_VECTOR);
6562 return (type == ADDR_QUERY_M
6563 ? offset_4bit_signed_scaled_p (mode, offset)
6564 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6565 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6566 end_offset)));
6567 }
6568
6569 if (vec_flags == VEC_SVE_PRED)
6570 return offset_9bit_signed_scaled_p (mode, offset);
6571
6572 if (load_store_pair_p)
6573 return ((known_eq (GET_MODE_SIZE (mode), 4)
6574 || known_eq (GET_MODE_SIZE (mode), 8)
6575 || known_eq (GET_MODE_SIZE (mode), 16))
6576 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6577 else
6578 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6579 || offset_12bit_unsigned_scaled_p (mode, offset));
6580 }
6581
6582 if (allow_reg_index_p)
6583 {
6584 /* Look for base + (scaled/extended) index register. */
6585 if (aarch64_base_register_rtx_p (op0, strict_p)
6586 && aarch64_classify_index (info, op1, mode, strict_p))
6587 {
6588 info->base = op0;
6589 return true;
6590 }
6591 if (aarch64_base_register_rtx_p (op1, strict_p)
6592 && aarch64_classify_index (info, op0, mode, strict_p))
6593 {
6594 info->base = op1;
6595 return true;
6596 }
6597 }
6598
6599 return false;
6600
6601 case POST_INC:
6602 case POST_DEC:
6603 case PRE_INC:
6604 case PRE_DEC:
6605 info->type = ADDRESS_REG_WB;
6606 info->base = XEXP (x, 0);
6607 info->offset = NULL_RTX;
6608 return aarch64_base_register_rtx_p (info->base, strict_p);
6609
6610 case POST_MODIFY:
6611 case PRE_MODIFY:
6612 info->type = ADDRESS_REG_WB;
6613 info->base = XEXP (x, 0);
6614 if (GET_CODE (XEXP (x, 1)) == PLUS
6615 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6616 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6617 && aarch64_base_register_rtx_p (info->base, strict_p))
6618 {
6619 info->offset = XEXP (XEXP (x, 1), 1);
6620 info->const_offset = offset;
6621
6622 /* TImode and TFmode values are allowed in both pairs of X
6623 registers and individual Q registers. The available
6624 address modes are:
6625 X,X: 7-bit signed scaled offset
6626 Q: 9-bit signed offset
6627 We conservatively require an offset representable in either mode.
6628 */
6629 if (mode == TImode || mode == TFmode)
6630 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6631 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6632
6633 if (load_store_pair_p)
6634 return ((known_eq (GET_MODE_SIZE (mode), 4)
6635 || known_eq (GET_MODE_SIZE (mode), 8)
6636 || known_eq (GET_MODE_SIZE (mode), 16))
6637 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6638 else
6639 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6640 }
6641 return false;
6642
6643 case CONST:
6644 case SYMBOL_REF:
6645 case LABEL_REF:
6646 /* load literal: pc-relative constant pool entry. Only supported
6647 for SI mode or larger. */
6648 info->type = ADDRESS_SYMBOLIC;
6649
6650 if (!load_store_pair_p
6651 && GET_MODE_SIZE (mode).is_constant (&const_size)
6652 && const_size >= 4)
6653 {
6654 rtx sym, addend;
6655
6656 split_const (x, &sym, &addend);
6657 return ((GET_CODE (sym) == LABEL_REF
6658 || (GET_CODE (sym) == SYMBOL_REF
6659 && CONSTANT_POOL_ADDRESS_P (sym)
6660 && aarch64_pcrelative_literal_loads)));
6661 }
6662 return false;
6663
6664 case LO_SUM:
6665 info->type = ADDRESS_LO_SUM;
6666 info->base = XEXP (x, 0);
6667 info->offset = XEXP (x, 1);
6668 if (allow_reg_index_p
6669 && aarch64_base_register_rtx_p (info->base, strict_p))
6670 {
6671 rtx sym, offs;
6672 split_const (info->offset, &sym, &offs);
6673 if (GET_CODE (sym) == SYMBOL_REF
6674 && (aarch64_classify_symbol (sym, INTVAL (offs))
6675 == SYMBOL_SMALL_ABSOLUTE))
6676 {
6677 /* The symbol and offset must be aligned to the access size. */
6678 unsigned int align;
6679
6680 if (CONSTANT_POOL_ADDRESS_P (sym))
6681 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6682 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6683 {
6684 tree exp = SYMBOL_REF_DECL (sym);
6685 align = TYPE_ALIGN (TREE_TYPE (exp));
6686 align = aarch64_constant_alignment (exp, align);
6687 }
6688 else if (SYMBOL_REF_DECL (sym))
6689 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6690 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6691 && SYMBOL_REF_BLOCK (sym) != NULL)
6692 align = SYMBOL_REF_BLOCK (sym)->alignment;
6693 else
6694 align = BITS_PER_UNIT;
6695
6696 poly_int64 ref_size = GET_MODE_SIZE (mode);
6697 if (known_eq (ref_size, 0))
6698 ref_size = GET_MODE_SIZE (DImode);
6699
6700 return (multiple_p (INTVAL (offs), ref_size)
6701 && multiple_p (align / BITS_PER_UNIT, ref_size));
6702 }
6703 }
6704 return false;
6705
6706 default:
6707 return false;
6708 }
6709 }
6710
6711 /* Return true if the address X is valid for a PRFM instruction.
6712 STRICT_P is true if we should do strict checking with
6713 aarch64_classify_address. */
6714
6715 bool
6716 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6717 {
6718 struct aarch64_address_info addr;
6719
6720 /* PRFM accepts the same addresses as DImode... */
6721 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6722 if (!res)
6723 return false;
6724
6725 /* ... except writeback forms. */
6726 return addr.type != ADDRESS_REG_WB;
6727 }
6728
6729 bool
6730 aarch64_symbolic_address_p (rtx x)
6731 {
6732 rtx offset;
6733
6734 split_const (x, &x, &offset);
6735 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6736 }
6737
6738 /* Classify the base of symbolic expression X. */
6739
6740 enum aarch64_symbol_type
6741 aarch64_classify_symbolic_expression (rtx x)
6742 {
6743 rtx offset;
6744
6745 split_const (x, &x, &offset);
6746 return aarch64_classify_symbol (x, INTVAL (offset));
6747 }
6748
6749
6750 /* Return TRUE if X is a legitimate address for accessing memory in
6751 mode MODE. */
6752 static bool
6753 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6754 {
6755 struct aarch64_address_info addr;
6756
6757 return aarch64_classify_address (&addr, x, mode, strict_p);
6758 }
6759
6760 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6761 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6762 bool
6763 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6764 aarch64_addr_query_type type)
6765 {
6766 struct aarch64_address_info addr;
6767
6768 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6769 }
6770
6771 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6772
6773 static bool
6774 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6775 poly_int64 orig_offset,
6776 machine_mode mode)
6777 {
6778 HOST_WIDE_INT size;
6779 if (GET_MODE_SIZE (mode).is_constant (&size))
6780 {
6781 HOST_WIDE_INT const_offset, second_offset;
6782
6783 /* A general SVE offset is A * VQ + B. Remove the A component from
6784 coefficient 0 in order to get the constant B. */
6785 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6786
6787 /* Split an out-of-range address displacement into a base and
6788 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6789 range otherwise to increase opportunities for sharing the base
6790 address of different sizes. Unaligned accesses use the signed
6791 9-bit range, TImode/TFmode use the intersection of signed
6792 scaled 7-bit and signed 9-bit offset. */
6793 if (mode == TImode || mode == TFmode)
6794 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6795 else if ((const_offset & (size - 1)) != 0)
6796 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6797 else
6798 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6799
6800 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6801 return false;
6802
6803 /* Split the offset into second_offset and the rest. */
6804 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6805 *offset2 = gen_int_mode (second_offset, Pmode);
6806 return true;
6807 }
6808 else
6809 {
6810 /* Get the mode we should use as the basis of the range. For structure
6811 modes this is the mode of one vector. */
6812 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6813 machine_mode step_mode
6814 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6815
6816 /* Get the "mul vl" multiplier we'd like to use. */
6817 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6818 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6819 if (vec_flags & VEC_SVE_DATA)
6820 /* LDR supports a 9-bit range, but the move patterns for
6821 structure modes require all vectors to be in range of the
6822 same base. The simplest way of accommodating that while still
6823 promoting reuse of anchor points between different modes is
6824 to use an 8-bit range unconditionally. */
6825 vnum = ((vnum + 128) & 255) - 128;
6826 else
6827 /* Predicates are only handled singly, so we might as well use
6828 the full range. */
6829 vnum = ((vnum + 256) & 511) - 256;
6830 if (vnum == 0)
6831 return false;
6832
6833 /* Convert the "mul vl" multiplier into a byte offset. */
6834 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6835 if (known_eq (second_offset, orig_offset))
6836 return false;
6837
6838 /* Split the offset into second_offset and the rest. */
6839 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6840 *offset2 = gen_int_mode (second_offset, Pmode);
6841 return true;
6842 }
6843 }
6844
6845 /* Return the binary representation of floating point constant VALUE in INTVAL.
6846 If the value cannot be converted, return false without setting INTVAL.
6847 The conversion is done in the mode of VALUE. */
6848 bool
6849 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6850 {
6851
6852 /* We make a general exception for 0. */
6853 if (aarch64_float_const_zero_rtx_p (value))
6854 {
6855 *intval = 0;
6856 return true;
6857 }
6858
6859 scalar_float_mode mode;
6860 if (GET_CODE (value) != CONST_DOUBLE
6861 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6862 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6863 /* Only support up to DF mode. */
6864 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6865 return false;
6866
6867 unsigned HOST_WIDE_INT ival = 0;
6868
6869 long res[2];
6870 real_to_target (res,
6871 CONST_DOUBLE_REAL_VALUE (value),
6872 REAL_MODE_FORMAT (mode));
6873
6874 if (mode == DFmode)
6875 {
6876 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6877 ival = zext_hwi (res[order], 32);
6878 ival |= (zext_hwi (res[1 - order], 32) << 32);
6879 }
6880 else
6881 ival = zext_hwi (res[0], 32);
6882
6883 *intval = ival;
6884 return true;
6885 }
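/* As a quick illustration: for the DFmode constant 1.0 the function
   above stores its IEEE double-precision bit pattern
   0x3ff0000000000000 in *INTVAL and returns true.  */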
6886
6887 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6888 single MOV(+MOVK) followed by an FMOV. */
6889 bool
6890 aarch64_float_const_rtx_p (rtx x)
6891 {
6892 machine_mode mode = GET_MODE (x);
6893 if (mode == VOIDmode)
6894 return false;
6895
6896 /* Determine whether it's cheaper to write float constants as
6897 mov/movk pairs rather than as ldr/adrp pairs. */
6898 unsigned HOST_WIDE_INT ival;
6899
6900 if (GET_CODE (x) == CONST_DOUBLE
6901 && SCALAR_FLOAT_MODE_P (mode)
6902 && aarch64_reinterpret_float_as_int (x, &ival))
6903 {
6904 scalar_int_mode imode = (mode == HFmode
6905 ? SImode
6906 : int_mode_for_mode (mode).require ());
6907 int num_instr = aarch64_internal_mov_immediate
6908 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6909 return num_instr < 3;
6910 }
6911
6912 return false;
6913 }
6914
6915 /* Return TRUE if rtx X is the immediate constant 0.0. */
6916 bool
6917 aarch64_float_const_zero_rtx_p (rtx x)
6918 {
6919 if (GET_MODE (x) == VOIDmode)
6920 return false;
6921
6922 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6923 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6924 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6925 }
6926
6927 /* Return TRUE if rtx X is an immediate constant that fits in a single
6928 MOVI immediate operation. */
6929 bool
6930 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6931 {
6932 if (!TARGET_SIMD)
6933 return false;
6934
6935 machine_mode vmode;
6936 scalar_int_mode imode;
6937 unsigned HOST_WIDE_INT ival;
6938
6939 if (GET_CODE (x) == CONST_DOUBLE
6940 && SCALAR_FLOAT_MODE_P (mode))
6941 {
6942 if (!aarch64_reinterpret_float_as_int (x, &ival))
6943 return false;
6944
6945 /* We make a general exception for 0. */
6946 if (aarch64_float_const_zero_rtx_p (x))
6947 return true;
6948
6949 imode = int_mode_for_mode (mode).require ();
6950 }
6951 else if (GET_CODE (x) == CONST_INT
6952 && is_a <scalar_int_mode> (mode, &imode))
6953 ival = INTVAL (x);
6954 else
6955 return false;
6956
6957 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
6958 a 128-bit vector mode. */
6959 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6960
6961 vmode = aarch64_simd_container_mode (imode, width);
6962 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6963
6964 return aarch64_simd_valid_immediate (v_op, NULL);
6965 }
6966
6967
6968 /* Return the fixed registers used for condition codes. */
6969
6970 static bool
6971 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6972 {
6973 *p1 = CC_REGNUM;
6974 *p2 = INVALID_REGNUM;
6975 return true;
6976 }
6977
6978 /* This function is used by the call expanders of the machine description.
6979 RESULT is the register in which the result is returned. It's NULL for
6980 "call" and "sibcall".
6981 MEM is the location of the function call.
6982 SIBCALL indicates whether this function call is a normal call or a sibling
6983 call; a different pattern is generated accordingly. */
6984
6985 void
6986 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6987 {
6988 rtx call, callee, tmp;
6989 rtvec vec;
6990 machine_mode mode;
6991
6992 gcc_assert (MEM_P (mem));
6993 callee = XEXP (mem, 0);
6994 mode = GET_MODE (callee);
6995 gcc_assert (mode == Pmode);
6996
6997 /* Decide if we should generate indirect calls by loading the
6998 address of the callee into a register before performing
6999 the branch-and-link. */
7000 if (SYMBOL_REF_P (callee)
7001 ? (aarch64_is_long_call_p (callee)
7002 || aarch64_is_noplt_call_p (callee))
7003 : !REG_P (callee))
7004 XEXP (mem, 0) = force_reg (mode, callee);
7005
7006 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7007
7008 if (result != NULL_RTX)
7009 call = gen_rtx_SET (result, call);
7010
7011 if (sibcall)
7012 tmp = ret_rtx;
7013 else
7014 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7015
7016 vec = gen_rtvec (2, call, tmp);
7017 call = gen_rtx_PARALLEL (VOIDmode, vec);
7018
7019 aarch64_emit_call_insn (call);
7020 }
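/* In RTL terms the function above builds, roughly,
     (parallel [(set RESULT (call MEM (const_int 0)))
                (clobber (reg LR))])
   for a normal call (dropping the SET when RESULT is null), with the
   clobber replaced by (return) for a sibling call.  */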
7021
7022 /* Emit call insn with PAT and do aarch64-specific handling. */
7023
7024 void
7025 aarch64_emit_call_insn (rtx pat)
7026 {
7027 rtx insn = emit_call_insn (pat);
7028
7029 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7030 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7031 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7032 }
7033
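/* Return the CC mode that should be used to compare X and Y using
   relational operator CODE; this is the implementation behind the
   SELECT_CC_MODE macro used elsewhere in this file.  */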
7034 machine_mode
7035 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7036 {
7037 /* All floating point compares return CCFP if it is an equality
7038 comparison, and CCFPE otherwise. */
7039 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
7040 {
7041 switch (code)
7042 {
7043 case EQ:
7044 case NE:
7045 case UNORDERED:
7046 case ORDERED:
7047 case UNLT:
7048 case UNLE:
7049 case UNGT:
7050 case UNGE:
7051 case UNEQ:
7052 return CCFPmode;
7053
7054 case LT:
7055 case LE:
7056 case GT:
7057 case GE:
7058 case LTGT:
7059 return CCFPEmode;
7060
7061 default:
7062 gcc_unreachable ();
7063 }
7064 }
7065
7066 /* Equality comparisons of short modes against zero can be performed
7067 using the TST instruction with the appropriate bitmask. */
7068 if (y == const0_rtx && REG_P (x)
7069 && (code == EQ || code == NE)
7070 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
7071 return CC_NZmode;
7072
7073 /* Similarly, comparisons of zero_extends from shorter modes can
7074 be performed using an ANDS with an immediate mask. */
7075 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
7076 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7077 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7078 && (code == EQ || code == NE))
7079 return CC_NZmode;
7080
7081 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7082 && y == const0_rtx
7083 && (code == EQ || code == NE || code == LT || code == GE)
7084 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
7085 || GET_CODE (x) == NEG
7086 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7087 && CONST_INT_P (XEXP (x, 2)))))
7088 return CC_NZmode;
7089
7090 /* A compare with a shifted operand. Because of canonicalization,
7091 the comparison will have to be swapped when we emit the assembly
7092 code. */
7093 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7094 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7095 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
7096 || GET_CODE (x) == LSHIFTRT
7097 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
7098 return CC_SWPmode;
7099
7100 /* Similarly for a negated operand, but we can only do this for
7101 equalities. */
7102 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7103 && (REG_P (y) || GET_CODE (y) == SUBREG)
7104 && (code == EQ || code == NE)
7105 && GET_CODE (x) == NEG)
7106 return CC_Zmode;
7107
7108 /* A test for unsigned overflow. */
7109 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7110 && code == NE
7111 && GET_CODE (x) == PLUS
7112 && GET_CODE (y) == ZERO_EXTEND)
7113 return CC_Cmode;
7114
7115 /* A test for signed overflow. */
7116 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7117 && code == NE
7118 && GET_CODE (x) == PLUS
7119 && GET_CODE (y) == SIGN_EXTEND)
7120 return CC_Vmode;
7121
7122 /* For everything else, return CCmode. */
7123 return CCmode;
7124 }
7125
7126 static int
7127 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7128
7129 int
7130 aarch64_get_condition_code (rtx x)
7131 {
7132 machine_mode mode = GET_MODE (XEXP (x, 0));
7133 enum rtx_code comp_code = GET_CODE (x);
7134
7135 if (GET_MODE_CLASS (mode) != MODE_CC)
7136 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7137 return aarch64_get_condition_code_1 (mode, comp_code);
7138 }
7139
7140 static int
7141 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7142 {
7143 switch (mode)
7144 {
7145 case E_CCFPmode:
7146 case E_CCFPEmode:
7147 switch (comp_code)
7148 {
7149 case GE: return AARCH64_GE;
7150 case GT: return AARCH64_GT;
7151 case LE: return AARCH64_LS;
7152 case LT: return AARCH64_MI;
7153 case NE: return AARCH64_NE;
7154 case EQ: return AARCH64_EQ;
7155 case ORDERED: return AARCH64_VC;
7156 case UNORDERED: return AARCH64_VS;
7157 case UNLT: return AARCH64_LT;
7158 case UNLE: return AARCH64_LE;
7159 case UNGT: return AARCH64_HI;
7160 case UNGE: return AARCH64_PL;
7161 default: return -1;
7162 }
7163 break;
7164
7165 case E_CCmode:
7166 switch (comp_code)
7167 {
7168 case NE: return AARCH64_NE;
7169 case EQ: return AARCH64_EQ;
7170 case GE: return AARCH64_GE;
7171 case GT: return AARCH64_GT;
7172 case LE: return AARCH64_LE;
7173 case LT: return AARCH64_LT;
7174 case GEU: return AARCH64_CS;
7175 case GTU: return AARCH64_HI;
7176 case LEU: return AARCH64_LS;
7177 case LTU: return AARCH64_CC;
7178 default: return -1;
7179 }
7180 break;
7181
7182 case E_CC_SWPmode:
7183 switch (comp_code)
7184 {
7185 case NE: return AARCH64_NE;
7186 case EQ: return AARCH64_EQ;
7187 case GE: return AARCH64_LE;
7188 case GT: return AARCH64_LT;
7189 case LE: return AARCH64_GE;
7190 case LT: return AARCH64_GT;
7191 case GEU: return AARCH64_LS;
7192 case GTU: return AARCH64_CC;
7193 case LEU: return AARCH64_CS;
7194 case LTU: return AARCH64_HI;
7195 default: return -1;
7196 }
7197 break;
7198
7199 case E_CC_NZmode:
7200 switch (comp_code)
7201 {
7202 case NE: return AARCH64_NE;
7203 case EQ: return AARCH64_EQ;
7204 case GE: return AARCH64_PL;
7205 case LT: return AARCH64_MI;
7206 default: return -1;
7207 }
7208 break;
7209
7210 case E_CC_Zmode:
7211 switch (comp_code)
7212 {
7213 case NE: return AARCH64_NE;
7214 case EQ: return AARCH64_EQ;
7215 default: return -1;
7216 }
7217 break;
7218
7219 case E_CC_Cmode:
7220 switch (comp_code)
7221 {
7222 case NE: return AARCH64_CS;
7223 case EQ: return AARCH64_CC;
7224 default: return -1;
7225 }
7226 break;
7227
7228 case E_CC_Vmode:
7229 switch (comp_code)
7230 {
7231 case NE: return AARCH64_VS;
7232 case EQ: return AARCH64_VC;
7233 default: return -1;
7234 }
7235 break;
7236
7237 default:
7238 return -1;
7239 }
7240
7241 return -1;
7242 }
7243
7244 bool
7245 aarch64_const_vec_all_same_in_range_p (rtx x,
7246 HOST_WIDE_INT minval,
7247 HOST_WIDE_INT maxval)
7248 {
7249 rtx elt;
7250 return (const_vec_duplicate_p (x, &elt)
7251 && CONST_INT_P (elt)
7252 && IN_RANGE (INTVAL (elt), minval, maxval));
7253 }
7254
7255 bool
7256 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7257 {
7258 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7259 }
7260
7261 /* Return true if VEC is a constant in which every element is in the range
7262 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7263
7264 static bool
7265 aarch64_const_vec_all_in_range_p (rtx vec,
7266 HOST_WIDE_INT minval,
7267 HOST_WIDE_INT maxval)
7268 {
7269 if (GET_CODE (vec) != CONST_VECTOR
7270 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7271 return false;
7272
7273 int nunits;
7274 if (!CONST_VECTOR_STEPPED_P (vec))
7275 nunits = const_vector_encoded_nelts (vec);
7276 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7277 return false;
7278
7279 for (int i = 0; i < nunits; i++)
7280 {
7281 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7282 if (!CONST_INT_P (vec_elem)
7283 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7284 return false;
7285 }
7286 return true;
7287 }
7288
7289 /* N Z C V. */
7290 #define AARCH64_CC_V 1
7291 #define AARCH64_CC_C (1 << 1)
7292 #define AARCH64_CC_Z (1 << 2)
7293 #define AARCH64_CC_N (1 << 3)
7294
7295 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
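/* Each entry gives flag values under which the indexed condition is
   false (AL and NV are always true, so their entries are don't-cares).
   This is the immediate that a conditional compare sets the flags to
   when its governing condition fails; it is printed by the '%k'
   operand modifier below.  */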
7296 static const int aarch64_nzcv_codes[] =
7297 {
7298 0, /* EQ, Z == 1. */
7299 AARCH64_CC_Z, /* NE, Z == 0. */
7300 0, /* CS, C == 1. */
7301 AARCH64_CC_C, /* CC, C == 0. */
7302 0, /* MI, N == 1. */
7303 AARCH64_CC_N, /* PL, N == 0. */
7304 0, /* VS, V == 1. */
7305 AARCH64_CC_V, /* VC, V == 0. */
7306 0, /* HI, C == 1 && Z == 0. */
7307 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7308 AARCH64_CC_V, /* GE, N == V. */
7309 0, /* LT, N != V. */
7310 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7311 0, /* LE, !(Z == 0 && N == V). */
7312 0, /* AL, Any. */
7313 0 /* NV, Any. */
7314 };
7315
7316 /* Print floating-point vector immediate operand X to F, negating it
7317 first if NEGATE is true. Return true on success, false if it isn't
7318 a constant we can handle. */
7319
7320 static bool
7321 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7322 {
7323 rtx elt;
7324
7325 if (!const_vec_duplicate_p (x, &elt))
7326 return false;
7327
7328 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7329 if (negate)
7330 r = real_value_negate (&r);
7331
7332 /* We only handle the SVE single-bit immediates here. */
7333 if (real_equal (&r, &dconst0))
7334 asm_fprintf (f, "0.0");
7335 else if (real_equal (&r, &dconst1))
7336 asm_fprintf (f, "1.0");
7337 else if (real_equal (&r, &dconsthalf))
7338 asm_fprintf (f, "0.5");
7339 else
7340 return false;
7341
7342 return true;
7343 }
7344
7345 /* Return the equivalent letter for size. */
7346 static char
7347 sizetochar (int size)
7348 {
7349 switch (size)
7350 {
7351 case 64: return 'd';
7352 case 32: return 's';
7353 case 16: return 'h';
7354 case 8 : return 'b';
7355 default: gcc_unreachable ();
7356 }
7357 }
7358
7359 /* Print operand X to file F in a target specific manner according to CODE.
7360 The acceptable formatting commands given by CODE are:
7361 'c': An integer or symbol address without a preceding #
7362 sign.
7363 'C': Take the duplicated element in a vector constant
7364 and print it in hex.
7365 'D': Take the duplicated element in a vector constant
7366 and print it as an unsigned integer, in decimal.
7367 'e': Print the sign/zero-extend size as a character 8->b,
7368 16->h, 32->w.
7369 'p': Prints N such that 2^N == X (X must be a power of 2 and
7370 a const int).
7371 'P': Print the number of non-zero bits in X (a const_int).
7372 'H': Print the higher numbered register of a pair (TImode)
7373 of regs.
7374 'm': Print a condition (eq, ne, etc).
7375 'M': Same as 'm', but invert condition.
7376 'N': Take the duplicated element in a vector constant
7377 and print the negative of it in decimal.
7378 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7379 'S/T/U/V': Print a FP/SIMD register name for a register list.
7380 The register printed is the FP/SIMD register name
7381 of X + 0/1/2/3 for S/T/U/V.
7382 'R': Print a scalar FP/SIMD register name + 1.
7383 'X': Print bottom 16 bits of integer constant in hex.
7384 'w/x': Print a general register name or the zero register
7385 (32-bit or 64-bit).
7386 '0': Print a normal operand, if it's a general register,
7387 then we assume DImode.
7388 'k': Print NZCV for conditional compare instructions.
7389 'A': Output address constant representing the first
7390 argument of X, specifying a relocation offset
7391 if appropriate.
7392 'L': Output constant address specified by X
7393 with a relocation offset if appropriate.
7394 'G': Prints address of X, specifying a PC relative
7395 relocation mode if appropriate.
7396 'y': Output address of LDP or STP - this is used for
7397 some LDP/STPs which don't use a PARALLEL in their
7398 pattern (so the mode needs to be adjusted).
7399 'z': Output address of a typical LDP or STP. */
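/* For instance, if operand 0 is the general register x5, "%w0" prints
   "w5" and "%x0" prints "x5"; if operand 1 is the FP/SIMD register v3,
   "%s1" prints "s3" and "%d1" prints "d3" (illustrative examples of the
   handling below).  */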
7400
7401 static void
7402 aarch64_print_operand (FILE *f, rtx x, int code)
7403 {
7404 rtx elt;
7405 switch (code)
7406 {
7407 case 'c':
7408 switch (GET_CODE (x))
7409 {
7410 case CONST_INT:
7411 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7412 break;
7413
7414 case SYMBOL_REF:
7415 output_addr_const (f, x);
7416 break;
7417
7418 case CONST:
7419 if (GET_CODE (XEXP (x, 0)) == PLUS
7420 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7421 {
7422 output_addr_const (f, x);
7423 break;
7424 }
7425 /* Fall through. */
7426
7427 default:
7428 output_operand_lossage ("unsupported operand for code '%c'", code);
7429 }
7430 break;
7431
7432 case 'e':
7433 {
7434 int n;
7435
7436 if (!CONST_INT_P (x)
7437 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7438 {
7439 output_operand_lossage ("invalid operand for '%%%c'", code);
7440 return;
7441 }
7442
7443 switch (n)
7444 {
7445 case 3:
7446 fputc ('b', f);
7447 break;
7448 case 4:
7449 fputc ('h', f);
7450 break;
7451 case 5:
7452 fputc ('w', f);
7453 break;
7454 default:
7455 output_operand_lossage ("invalid operand for '%%%c'", code);
7456 return;
7457 }
7458 }
7459 break;
7460
7461 case 'p':
7462 {
7463 int n;
7464
7465 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7466 {
7467 output_operand_lossage ("invalid operand for '%%%c'", code);
7468 return;
7469 }
7470
7471 asm_fprintf (f, "%d", n);
7472 }
7473 break;
7474
7475 case 'P':
7476 if (!CONST_INT_P (x))
7477 {
7478 output_operand_lossage ("invalid operand for '%%%c'", code);
7479 return;
7480 }
7481
7482 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7483 break;
7484
7485 case 'H':
7486 if (x == const0_rtx)
7487 {
7488 asm_fprintf (f, "xzr");
7489 break;
7490 }
7491
7492 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7493 {
7494 output_operand_lossage ("invalid operand for '%%%c'", code);
7495 return;
7496 }
7497
7498 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7499 break;
7500
7501 case 'M':
7502 case 'm':
7503 {
7504 int cond_code;
7505 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7506 if (x == const_true_rtx)
7507 {
7508 if (code == 'M')
7509 fputs ("nv", f);
7510 return;
7511 }
7512
7513 if (!COMPARISON_P (x))
7514 {
7515 output_operand_lossage ("invalid operand for '%%%c'", code);
7516 return;
7517 }
7518
7519 cond_code = aarch64_get_condition_code (x);
7520 gcc_assert (cond_code >= 0);
7521 if (code == 'M')
7522 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7523 fputs (aarch64_condition_codes[cond_code], f);
7524 }
7525 break;
7526
7527 case 'N':
7528 if (!const_vec_duplicate_p (x, &elt))
7529 {
7530 output_operand_lossage ("invalid vector constant");
7531 return;
7532 }
7533
7534 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7535 asm_fprintf (f, "%wd", -INTVAL (elt));
7536 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7537 && aarch64_print_vector_float_operand (f, x, true))
7538 ;
7539 else
7540 {
7541 output_operand_lossage ("invalid vector constant");
7542 return;
7543 }
7544 break;
7545
7546 case 'b':
7547 case 'h':
7548 case 's':
7549 case 'd':
7550 case 'q':
7551 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7552 {
7553 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7554 return;
7555 }
7556 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7557 break;
7558
7559 case 'S':
7560 case 'T':
7561 case 'U':
7562 case 'V':
7563 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7564 {
7565 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7566 return;
7567 }
7568 asm_fprintf (f, "%c%d",
7569 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7570 REGNO (x) - V0_REGNUM + (code - 'S'));
7571 break;
7572
7573 case 'R':
7574 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7575 {
7576 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7577 return;
7578 }
7579 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7580 break;
7581
7582 case 'X':
7583 if (!CONST_INT_P (x))
7584 {
7585 output_operand_lossage ("invalid operand for '%%%c'", code);
7586 return;
7587 }
7588 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7589 break;
7590
7591 case 'C':
7592 {
7593 /* Print a replicated constant in hex. */
7594 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7595 {
7596 output_operand_lossage ("invalid operand for '%%%c'", code);
7597 return;
7598 }
7599 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7600 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7601 }
7602 break;
7603
7604 case 'D':
7605 {
7606 /* Print a replicated constant in decimal, treating it as
7607 unsigned. */
7608 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7609 {
7610 output_operand_lossage ("invalid operand for '%%%c'", code);
7611 return;
7612 }
7613 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7614 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7615 }
7616 break;
7617
7618 case 'w':
7619 case 'x':
7620 if (x == const0_rtx
7621 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7622 {
7623 asm_fprintf (f, "%czr", code);
7624 break;
7625 }
7626
7627 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7628 {
7629 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7630 break;
7631 }
7632
7633 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7634 {
7635 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7636 break;
7637 }
7638
7639 /* Fall through */
7640
7641 case 0:
7642 if (x == NULL)
7643 {
7644 output_operand_lossage ("missing operand");
7645 return;
7646 }
7647
7648 switch (GET_CODE (x))
7649 {
7650 case REG:
7651 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7652 {
7653 if (REG_NREGS (x) == 1)
7654 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7655 else
7656 {
7657 char suffix
7658 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7659 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7660 REGNO (x) - V0_REGNUM, suffix,
7661 END_REGNO (x) - V0_REGNUM - 1, suffix);
7662 }
7663 }
7664 else
7665 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7666 break;
7667
7668 case MEM:
7669 output_address (GET_MODE (x), XEXP (x, 0));
7670 break;
7671
7672 case LABEL_REF:
7673 case SYMBOL_REF:
7674 output_addr_const (asm_out_file, x);
7675 break;
7676
7677 case CONST_INT:
7678 asm_fprintf (f, "%wd", INTVAL (x));
7679 break;
7680
7681 case CONST:
7682 if (!VECTOR_MODE_P (GET_MODE (x)))
7683 {
7684 output_addr_const (asm_out_file, x);
7685 break;
7686 }
7687 /* fall through */
7688
7689 case CONST_VECTOR:
7690 if (!const_vec_duplicate_p (x, &elt))
7691 {
7692 output_operand_lossage ("invalid vector constant");
7693 return;
7694 }
7695
7696 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7697 asm_fprintf (f, "%wd", INTVAL (elt));
7698 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7699 && aarch64_print_vector_float_operand (f, x, false))
7700 ;
7701 else
7702 {
7703 output_operand_lossage ("invalid vector constant");
7704 return;
7705 }
7706 break;
7707
7708 case CONST_DOUBLE:
7709 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7710 be getting CONST_DOUBLEs holding integers. */
7711 gcc_assert (GET_MODE (x) != VOIDmode);
7712 if (aarch64_float_const_zero_rtx_p (x))
7713 {
7714 fputc ('0', f);
7715 break;
7716 }
7717 else if (aarch64_float_const_representable_p (x))
7718 {
7719 #define buf_size 20
7720 char float_buf[buf_size] = {'\0'};
7721 real_to_decimal_for_mode (float_buf,
7722 CONST_DOUBLE_REAL_VALUE (x),
7723 buf_size, buf_size,
7724 1, GET_MODE (x));
7725 asm_fprintf (asm_out_file, "%s", float_buf);
7726 break;
7727 #undef buf_size
7728 }
7729 output_operand_lossage ("invalid constant");
7730 return;
7731 default:
7732 output_operand_lossage ("invalid operand");
7733 return;
7734 }
7735 break;
7736
7737 case 'A':
7738 if (GET_CODE (x) == HIGH)
7739 x = XEXP (x, 0);
7740
7741 switch (aarch64_classify_symbolic_expression (x))
7742 {
7743 case SYMBOL_SMALL_GOT_4G:
7744 asm_fprintf (asm_out_file, ":got:");
7745 break;
7746
7747 case SYMBOL_SMALL_TLSGD:
7748 asm_fprintf (asm_out_file, ":tlsgd:");
7749 break;
7750
7751 case SYMBOL_SMALL_TLSDESC:
7752 asm_fprintf (asm_out_file, ":tlsdesc:");
7753 break;
7754
7755 case SYMBOL_SMALL_TLSIE:
7756 asm_fprintf (asm_out_file, ":gottprel:");
7757 break;
7758
7759 case SYMBOL_TLSLE24:
7760 asm_fprintf (asm_out_file, ":tprel:");
7761 break;
7762
7763 case SYMBOL_TINY_GOT:
7764 gcc_unreachable ();
7765 break;
7766
7767 default:
7768 break;
7769 }
7770 output_addr_const (asm_out_file, x);
7771 break;
7772
7773 case 'L':
7774 switch (aarch64_classify_symbolic_expression (x))
7775 {
7776 case SYMBOL_SMALL_GOT_4G:
7777 asm_fprintf (asm_out_file, ":lo12:");
7778 break;
7779
7780 case SYMBOL_SMALL_TLSGD:
7781 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7782 break;
7783
7784 case SYMBOL_SMALL_TLSDESC:
7785 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7786 break;
7787
7788 case SYMBOL_SMALL_TLSIE:
7789 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7790 break;
7791
7792 case SYMBOL_TLSLE12:
7793 asm_fprintf (asm_out_file, ":tprel_lo12:");
7794 break;
7795
7796 case SYMBOL_TLSLE24:
7797 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7798 break;
7799
7800 case SYMBOL_TINY_GOT:
7801 asm_fprintf (asm_out_file, ":got:");
7802 break;
7803
7804 case SYMBOL_TINY_TLSIE:
7805 asm_fprintf (asm_out_file, ":gottprel:");
7806 break;
7807
7808 default:
7809 break;
7810 }
7811 output_addr_const (asm_out_file, x);
7812 break;
7813
7814 case 'G':
7815 switch (aarch64_classify_symbolic_expression (x))
7816 {
7817 case SYMBOL_TLSLE24:
7818 asm_fprintf (asm_out_file, ":tprel_hi12:");
7819 break;
7820 default:
7821 break;
7822 }
7823 output_addr_const (asm_out_file, x);
7824 break;
7825
7826 case 'k':
7827 {
7828 HOST_WIDE_INT cond_code;
7829
7830 if (!CONST_INT_P (x))
7831 {
7832 output_operand_lossage ("invalid operand for '%%%c'", code);
7833 return;
7834 }
7835
7836 cond_code = INTVAL (x);
7837 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7838 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7839 }
7840 break;
7841
7842 case 'y':
7843 case 'z':
7844 {
7845 machine_mode mode = GET_MODE (x);
7846
7847 if (GET_CODE (x) != MEM
7848 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7849 {
7850 output_operand_lossage ("invalid operand for '%%%c'", code);
7851 return;
7852 }
7853
7854 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7855 code == 'y'
7856 ? ADDR_QUERY_LDP_STP_N
7857 : ADDR_QUERY_LDP_STP))
7858 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7859 }
7860 break;
7861
7862 default:
7863 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7864 return;
7865 }
7866 }
7867
7868 /* Print address 'x' of a memory access with mode 'mode'.
7869 'type' is the aarch64_addr_query_type context passed on to
7870 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
7871 static bool
7872 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7873 aarch64_addr_query_type type)
7874 {
7875 struct aarch64_address_info addr;
7876 unsigned int size;
7877
7878 /* Check all addresses are Pmode - including ILP32. */
7879 if (GET_MODE (x) != Pmode
7880 && (!CONST_INT_P (x)
7881 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7882 {
7883 output_operand_lossage ("invalid address mode");
7884 return false;
7885 }
7886
7887 if (aarch64_classify_address (&addr, x, mode, true, type))
7888 switch (addr.type)
7889 {
7890 case ADDRESS_REG_IMM:
7891 if (known_eq (addr.const_offset, 0))
7892 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7893 else if (aarch64_sve_data_mode_p (mode))
7894 {
7895 HOST_WIDE_INT vnum
7896 = exact_div (addr.const_offset,
7897 BYTES_PER_SVE_VECTOR).to_constant ();
7898 asm_fprintf (f, "[%s, #%wd, mul vl]",
7899 reg_names[REGNO (addr.base)], vnum);
7900 }
7901 else if (aarch64_sve_pred_mode_p (mode))
7902 {
7903 HOST_WIDE_INT vnum
7904 = exact_div (addr.const_offset,
7905 BYTES_PER_SVE_PRED).to_constant ();
7906 asm_fprintf (f, "[%s, #%wd, mul vl]",
7907 reg_names[REGNO (addr.base)], vnum);
7908 }
7909 else
7910 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7911 INTVAL (addr.offset));
7912 return true;
7913
7914 case ADDRESS_REG_REG:
7915 if (addr.shift == 0)
7916 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7917 reg_names [REGNO (addr.offset)]);
7918 else
7919 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7920 reg_names [REGNO (addr.offset)], addr.shift);
7921 return true;
7922
7923 case ADDRESS_REG_UXTW:
7924 if (addr.shift == 0)
7925 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7926 REGNO (addr.offset) - R0_REGNUM);
7927 else
7928 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7929 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7930 return true;
7931
7932 case ADDRESS_REG_SXTW:
7933 if (addr.shift == 0)
7934 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7935 REGNO (addr.offset) - R0_REGNUM);
7936 else
7937 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7938 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7939 return true;
7940
7941 case ADDRESS_REG_WB:
7942 /* Writeback is only supported for fixed-width modes. */
7943 size = GET_MODE_SIZE (mode).to_constant ();
7944 switch (GET_CODE (x))
7945 {
7946 case PRE_INC:
7947 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7948 return true;
7949 case POST_INC:
7950 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7951 return true;
7952 case PRE_DEC:
7953 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7954 return true;
7955 case POST_DEC:
7956 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7957 return true;
7958 case PRE_MODIFY:
7959 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7960 INTVAL (addr.offset));
7961 return true;
7962 case POST_MODIFY:
7963 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7964 INTVAL (addr.offset));
7965 return true;
7966 default:
7967 break;
7968 }
7969 break;
7970
7971 case ADDRESS_LO_SUM:
7972 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7973 output_addr_const (f, addr.offset);
7974 asm_fprintf (f, "]");
7975 return true;
7976
7977 case ADDRESS_SYMBOLIC:
7978 output_addr_const (f, x);
7979 return true;
7980 }
7981
7982 return false;
7983 }
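/* Illustrative examples of the syntax produced above: a plain base
   register prints as "[x0]", base plus immediate as "[x0, 16]", a
   register index as "[x0, x1, lsl 3]", an extended index as
   "[x0, w1, sxtw 2]" and a post-increment writeback as "[x0], 16".  */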
7984
7985 /* Print address 'x' of a memory access with mode 'mode'. */
7986 static void
7987 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7988 {
7989 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7990 output_addr_const (f, x);
7991 }
7992
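/* Return true if X or any of its sub-rtxes mentions a label, ignoring
   the LABEL_REFs that UNSPEC_TLS entries carry for the referencing
   instruction.  */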
7993 bool
7994 aarch64_label_mentioned_p (rtx x)
7995 {
7996 const char *fmt;
7997 int i;
7998
7999 if (GET_CODE (x) == LABEL_REF)
8000 return true;
8001
8002 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8003 referencing instruction, but they are constant offsets, not
8004 symbols. */
8005 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8006 return false;
8007
8008 fmt = GET_RTX_FORMAT (GET_CODE (x));
8009 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8010 {
8011 if (fmt[i] == 'E')
8012 {
8013 int j;
8014
8015 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8016 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8017 return 1;
8018 }
8019 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8020 return 1;
8021 }
8022
8023 return 0;
8024 }
8025
8026 /* Implement REGNO_REG_CLASS. */
8027
8028 enum reg_class
8029 aarch64_regno_regclass (unsigned regno)
8030 {
8031 if (GP_REGNUM_P (regno))
8032 return GENERAL_REGS;
8033
8034 if (regno == SP_REGNUM)
8035 return STACK_REG;
8036
8037 if (regno == FRAME_POINTER_REGNUM
8038 || regno == ARG_POINTER_REGNUM)
8039 return POINTER_REGS;
8040
8041 if (FP_REGNUM_P (regno))
8042 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8043
8044 if (PR_REGNUM_P (regno))
8045 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8046
8047 return NO_REGS;
8048 }
8049
8050 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8051 If OFFSET is out of range, return an offset of an anchor point
8052 that is in range. Return 0 otherwise. */
8053
8054 static HOST_WIDE_INT
8055 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8056 machine_mode mode)
8057 {
8058 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8059 if (size > 16)
8060 return (offset + 0x400) & ~0x7f0;
8061
8062 /* For offsets that aren't a multiple of the access size, the limit is
8063 -256...255. */
8064 if (offset & (size - 1))
8065 {
8066 /* BLKmode typically uses LDP of X-registers. */
8067 if (mode == BLKmode)
8068 return (offset + 512) & ~0x3ff;
8069 return (offset + 0x100) & ~0x1ff;
8070 }
8071
8072 /* Small negative offsets are supported. */
8073 if (IN_RANGE (offset, -256, 0))
8074 return 0;
8075
8076 if (mode == TImode || mode == TFmode)
8077 return (offset + 0x100) & ~0x1ff;
8078
8079 /* Otherwise use a 12-bit offset, scaled by the access size. */
8080 return offset & (~0xfff * size);
8081 }
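/* Worked example for the function above: a misaligned 4-byte access at
   offset 0x12343 returns the anchor (0x12343 + 0x100) & ~0x1ff == 0x12400,
   leaving a residual offset of -0xbd, which fits the signed 9-bit range.  */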
8082
8083 static rtx
8084 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8085 {
8086 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8087 where mask is selected by alignment and size of the offset.
8088 We try to pick as large a range for the offset as possible to
8089 maximize the chance of a CSE. However, for aligned addresses
8090 we limit the range to 4k so that structures with different sized
8091 elements are likely to use the same base. We need to be careful
8092 not to split a CONST for some forms of address expression, otherwise
8093 it will generate sub-optimal code. */
8094
8095 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8096 {
8097 rtx base = XEXP (x, 0);
8098 rtx offset_rtx = XEXP (x, 1);
8099 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8100
8101 if (GET_CODE (base) == PLUS)
8102 {
8103 rtx op0 = XEXP (base, 0);
8104 rtx op1 = XEXP (base, 1);
8105
8106 /* Force any scaling into a temp for CSE. */
8107 op0 = force_reg (Pmode, op0);
8108 op1 = force_reg (Pmode, op1);
8109
8110 /* Let the pointer register be in op0. */
8111 if (REG_POINTER (op1))
8112 std::swap (op0, op1);
8113
8114 /* If the pointer is virtual or frame related, then we know that
8115 virtual register instantiation or register elimination is going
8116 to apply a second constant. We want the two constants folded
8117 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8118 if (virt_or_elim_regno_p (REGNO (op0)))
8119 {
8120 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8121 NULL_RTX, true, OPTAB_DIRECT);
8122 return gen_rtx_PLUS (Pmode, base, op1);
8123 }
8124
8125 /* Otherwise, in order to encourage CSE (and thence loop strength
8126 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8127 base = expand_binop (Pmode, add_optab, op0, op1,
8128 NULL_RTX, true, OPTAB_DIRECT);
8129 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8130 }
8131
8132 HOST_WIDE_INT size;
8133 if (GET_MODE_SIZE (mode).is_constant (&size))
8134 {
8135 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8136 mode);
8137 if (base_offset != 0)
8138 {
8139 base = plus_constant (Pmode, base, base_offset);
8140 base = force_operand (base, NULL_RTX);
8141 return plus_constant (Pmode, base, offset - base_offset);
8142 }
8143 }
8144 }
8145
8146 return x;
8147 }
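/* As an illustrative example: SImode accesses at X + 0x12340 and
   X + 0x12344 are both legitimized as (X + 0x10000) plus a small
   scaled offset, so the X + 0x10000 computation can be CSEd and the
   base shared between the two accesses.  */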
8148
8149 static reg_class_t
8150 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8151 reg_class_t rclass,
8152 machine_mode mode,
8153 secondary_reload_info *sri)
8154 {
8155 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8156 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8157 comment at the head of aarch64-sve.md for more details about the
8158 big-endian handling. */
8159 if (BYTES_BIG_ENDIAN
8160 && reg_class_subset_p (rclass, FP_REGS)
8161 && !((REG_P (x) && HARD_REGISTER_P (x))
8162 || aarch64_simd_valid_immediate (x, NULL))
8163 && aarch64_sve_data_mode_p (mode))
8164 {
8165 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8166 return NO_REGS;
8167 }
8168
8169 /* If we have to disable direct literal pool loads and stores because the
8170 function is too big, then we need a scratch register. */
8171 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8172 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8173 || targetm.vector_mode_supported_p (GET_MODE (x)))
8174 && !aarch64_pcrelative_literal_loads)
8175 {
8176 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8177 return NO_REGS;
8178 }
8179
8180 /* Without the TARGET_SIMD instructions we cannot move a Q register
8181 to a Q register directly. We need a scratch. */
8182 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8183 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8184 && reg_class_subset_p (rclass, FP_REGS))
8185 {
8186 sri->icode = code_for_aarch64_reload_mov (mode);
8187 return NO_REGS;
8188 }
8189
8190 /* A TFmode or TImode memory access should be handled via FP_REGS
8191 because AArch64 has richer addressing modes for LDR/STR instructions
8192 than for LDP/STP instructions. */
8193 if (TARGET_FLOAT && rclass == GENERAL_REGS
8194 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8195 return FP_REGS;
8196
8197 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8198 return GENERAL_REGS;
8199
8200 return NO_REGS;
8201 }
8202
8203 static bool
8204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8205 {
8206 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8207
8208 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8209 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8210 if (frame_pointer_needed)
8211 return to == HARD_FRAME_POINTER_REGNUM;
8212 return true;
8213 }
8214
8215 poly_int64
8216 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8217 {
8218 if (to == HARD_FRAME_POINTER_REGNUM)
8219 {
8220 if (from == ARG_POINTER_REGNUM)
8221 return cfun->machine->frame.hard_fp_offset;
8222
8223 if (from == FRAME_POINTER_REGNUM)
8224 return cfun->machine->frame.hard_fp_offset
8225 - cfun->machine->frame.locals_offset;
8226 }
8227
8228 if (to == STACK_POINTER_REGNUM)
8229 {
8230 if (from == FRAME_POINTER_REGNUM)
8231 return cfun->machine->frame.frame_size
8232 - cfun->machine->frame.locals_offset;
8233 }
8234
8235 return cfun->machine->frame.frame_size;
8236 }
8237
8238 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8239 previous frame. */
8240
8241 rtx
8242 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8243 {
8244 if (count != 0)
8245 return const0_rtx;
8246 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8247 }
8248
8249
8250 static void
8251 aarch64_asm_trampoline_template (FILE *f)
8252 {
8253 if (TARGET_ILP32)
8254 {
8255 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
8256 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
8257 }
8258 else
8259 {
8260 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
8261 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
8262 }
8263 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8264 assemble_aligned_integer (4, const0_rtx);
8265 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8266 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8267 }
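/* The resulting LP64 trampoline layout is therefore:
     offset  0: ldr x17, .+16      load the target address
     offset  4: ldr x18, .+20      load the static chain value
     offset  8: br  x17
     offset 12: 4 bytes of padding
     offset 16: target function address (filled in by the init hook below)
     offset 24: static chain value (likewise filled in below).  */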
8268
8269 static void
8270 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8271 {
8272 rtx fnaddr, mem, a_tramp;
8273 const int tramp_code_sz = 16;
8274
8275 /* Don't need to copy the trailing D-words; we fill those in below. */
8276 emit_block_move (m_tramp, assemble_trampoline_template (),
8277 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8278 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8279 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8280 if (GET_MODE (fnaddr) != ptr_mode)
8281 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8282 emit_move_insn (mem, fnaddr);
8283
8284 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8285 emit_move_insn (mem, chain_value);
8286
8287 /* XXX We should really define a "clear_cache" pattern and use
8288 gen_clear_cache(). */
8289 a_tramp = XEXP (m_tramp, 0);
8290 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8291 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8292 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8293 ptr_mode);
8294 }
8295
8296 static unsigned char
8297 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8298 {
8299 /* ??? Logically we should only need to provide a value when
8300 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8301 can hold MODE, but at the moment we need to handle all modes.
8302 Just ignore any runtime parts for registers that can't store them. */
8303 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8304 unsigned int nregs;
8305 switch (regclass)
8306 {
8307 case TAILCALL_ADDR_REGS:
8308 case POINTER_REGS:
8309 case GENERAL_REGS:
8310 case ALL_REGS:
8311 case POINTER_AND_FP_REGS:
8312 case FP_REGS:
8313 case FP_LO_REGS:
8314 if (aarch64_sve_data_mode_p (mode)
8315 && constant_multiple_p (GET_MODE_SIZE (mode),
8316 BYTES_PER_SVE_VECTOR, &nregs))
8317 return nregs;
8318 return (aarch64_vector_data_mode_p (mode)
8319 ? CEIL (lowest_size, UNITS_PER_VREG)
8320 : CEIL (lowest_size, UNITS_PER_WORD));
8321 case STACK_REG:
8322 case PR_REGS:
8323 case PR_LO_REGS:
8324 case PR_HI_REGS:
8325 return 1;
8326
8327 case NO_REGS:
8328 return 0;
8329
8330 default:
8331 break;
8332 }
8333 gcc_unreachable ();
8334 }
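/* For example: V4SImode (a 16-byte Advanced SIMD vector) needs a single
   register here, CEIL (16, UNITS_PER_VREG), whereas TImode in GENERAL_REGS
   needs two, CEIL (16, UNITS_PER_WORD); an SVE structure mode of N vectors
   needs N registers via the constant_multiple_p test above.  */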
8335
8336 static reg_class_t
8337 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8338 {
8339 if (regclass == POINTER_REGS)
8340 return GENERAL_REGS;
8341
8342 if (regclass == STACK_REG)
8343 {
8344 if (REG_P(x)
8345 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8346 return regclass;
8347
8348 return NO_REGS;
8349 }
8350
8351 /* Register elimination can result in a request for
8352 SP+constant->FP_REGS. We cannot support such operations, which
8353 use SP as source and an FP_REG as destination, so reject them
8354 right now. */
8355 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8356 {
8357 rtx lhs = XEXP (x, 0);
8358
8359 /* Look through a possible SUBREG introduced by ILP32. */
8360 if (GET_CODE (lhs) == SUBREG)
8361 lhs = SUBREG_REG (lhs);
8362
8363 gcc_assert (REG_P (lhs));
8364 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8365 POINTER_REGS));
8366 return NO_REGS;
8367 }
8368
8369 return regclass;
8370 }
8371
8372 void
8373 aarch64_asm_output_labelref (FILE* f, const char *name)
8374 {
8375 asm_fprintf (f, "%U%s", name);
8376 }
8377
8378 static void
8379 aarch64_elf_asm_constructor (rtx symbol, int priority)
8380 {
8381 if (priority == DEFAULT_INIT_PRIORITY)
8382 default_ctor_section_asm_out_constructor (symbol, priority);
8383 else
8384 {
8385 section *s;
8386 /* Priority is known to be in the range [0, 65535], so 18 bytes
8387 would be enough, but the compiler might not know that. To avoid
8388 a -Wformat-truncation false positive, use a larger size. */
8389 char buf[23];
8390 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8391 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8392 switch_to_section (s);
8393 assemble_align (POINTER_SIZE);
8394 assemble_aligned_integer (POINTER_BYTES, symbol);
8395 }
8396 }
8397
8398 static void
8399 aarch64_elf_asm_destructor (rtx symbol, int priority)
8400 {
8401 if (priority == DEFAULT_INIT_PRIORITY)
8402 default_dtor_section_asm_out_destructor (symbol, priority);
8403 else
8404 {
8405 section *s;
8406 /* Priority is known to be in the range [0, 65535], so 18 bytes
8407 would be enough, but the compiler might not know that. To avoid
8408 a -Wformat-truncation false positive, use a larger size. */
8409 char buf[23];
8410 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8411 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8412 switch_to_section (s);
8413 assemble_align (POINTER_SIZE);
8414 assemble_aligned_integer (POINTER_BYTES, symbol);
8415 }
8416 }
8417
8418 const char*
8419 aarch64_output_casesi (rtx *operands)
8420 {
8421 char buf[100];
8422 char label[100];
8423 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8424 int index;
8425 static const char *const patterns[4][2] =
8426 {
8427 {
8428 "ldrb\t%w3, [%0,%w1,uxtw]",
8429 "add\t%3, %4, %w3, sxtb #2"
8430 },
8431 {
8432 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8433 "add\t%3, %4, %w3, sxth #2"
8434 },
8435 {
8436 "ldr\t%w3, [%0,%w1,uxtw #2]",
8437 "add\t%3, %4, %w3, sxtw #2"
8438 },
8439 /* We assume that DImode is only generated when not optimizing and
8440 that we don't really need 64-bit address offsets. That would
8441 imply an object file with 8GB of code in a single function! */
8442 {
8443 "ldr\t%w3, [%0,%w1,uxtw #2]",
8444 "add\t%3, %4, %w3, sxtw #2"
8445 }
8446 };
8447
8448 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8449
8450 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8451 index = exact_log2 (GET_MODE_SIZE (mode));
8452
8453 gcc_assert (index >= 0 && index <= 3);
8454
8455 /* Need to implement table size reduction, by changing the code below. */
8456 output_asm_insn (patterns[index][0], operands);
8457 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8458 snprintf (buf, sizeof (buf),
8459 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8460 output_asm_insn (buf, operands);
8461 output_asm_insn (patterns[index][1], operands);
8462 output_asm_insn ("br\t%3", operands);
8463 assemble_label (asm_out_file, label);
8464 return "";
8465 }
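/* For a HImode dispatch table (index 1 above) the emitted sequence is,
   schematically:
	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3
   .Lrtx<N>:
   i.e. load the scaled table entry, add it to the address of the label
   emitted after the branch, and jump there.  */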
8466
8467
8468 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8469 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8470 operator. */
8471
8472 int
8473 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8474 {
8475 if (shift >= 0 && shift <= 3)
8476 {
8477 int size;
8478 for (size = 8; size <= 32; size *= 2)
8479 {
8480 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8481 if (mask == bits << shift)
8482 return size;
8483 }
8484 }
8485 return 0;
8486 }
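/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1, i.e. the mask selects a byte value shifted left
   by one; masks that are not an 8/16/32-bit field shifted by 0..3
   yield 0.  */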
8487
8488 /* Constant pools are per-function only when PC-relative
8489 literal loads are enabled or we are using the large memory
8490 model. */
8491
8492 static inline bool
8493 aarch64_can_use_per_function_literal_pools_p (void)
8494 {
8495 return (aarch64_pcrelative_literal_loads
8496 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8497 }
8498
8499 static bool
8500 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8501 {
8502 /* We can't use blocks for constants when we're using a per-function
8503 constant pool. */
8504 return !aarch64_can_use_per_function_literal_pools_p ();
8505 }
8506
8507 /* Select appropriate section for constants depending
8508 on where we place literal pools. */
8509
8510 static section *
8511 aarch64_select_rtx_section (machine_mode mode,
8512 rtx x,
8513 unsigned HOST_WIDE_INT align)
8514 {
8515 if (aarch64_can_use_per_function_literal_pools_p ())
8516 return function_section (current_function_decl);
8517
8518 return default_elf_select_rtx_section (mode, x, align);
8519 }
8520
8521 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8522 void
8523 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8524 HOST_WIDE_INT offset)
8525 {
8526 /* When using per-function literal pools, we must ensure that any code
8527 section is aligned to the minimal instruction length, lest we get
8528 errors from the assembler re "unaligned instructions". */
8529 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8530 ASM_OUTPUT_ALIGN (f, 2);
8531 }
8532
8533 /* Costs. */
8534
8535 /* Helper function for rtx cost calculation. Strip a shift expression
8536 from X. Returns the inner operand if successful, or the original
8537 expression on failure. */
8538 static rtx
8539 aarch64_strip_shift (rtx x)
8540 {
8541 rtx op = x;
8542
8543 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8544 we can convert both to ROR during final output. */
8545 if ((GET_CODE (op) == ASHIFT
8546 || GET_CODE (op) == ASHIFTRT
8547 || GET_CODE (op) == LSHIFTRT
8548 || GET_CODE (op) == ROTATERT
8549 || GET_CODE (op) == ROTATE)
8550 && CONST_INT_P (XEXP (op, 1)))
8551 return XEXP (op, 0);
8552
8553 if (GET_CODE (op) == MULT
8554 && CONST_INT_P (XEXP (op, 1))
8555 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8556 return XEXP (op, 0);
8557
8558 return x;
8559 }
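/* E.g. both (ashift (reg x1) (const_int 3)) and its multiply form
   (mult (reg x1) (const_int 8)) strip down to (reg x1) here.  */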
8560
8561 /* Helper function for rtx cost calculation. Strip an extend
8562 expression from X. Returns the inner operand if successful, or the
8563 original expression on failure. We deal with a number of possible
8564 canonicalization variations here. If STRIP_SHIFT is true, then
8565 we can strip off a shift also. */
8566 static rtx
8567 aarch64_strip_extend (rtx x, bool strip_shift)
8568 {
8569 scalar_int_mode mode;
8570 rtx op = x;
8571
8572 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8573 return op;
8574
8575 /* Zero and sign extraction of a widened value. */
8576 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8577 && XEXP (op, 2) == const0_rtx
8578 && GET_CODE (XEXP (op, 0)) == MULT
8579 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8580 XEXP (op, 1)))
8581 return XEXP (XEXP (op, 0), 0);
8582
8583 /* It can also be represented (for zero-extend) as an AND with an
8584 immediate. */
8585 if (GET_CODE (op) == AND
8586 && GET_CODE (XEXP (op, 0)) == MULT
8587 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8588 && CONST_INT_P (XEXP (op, 1))
8589 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8590 INTVAL (XEXP (op, 1))) != 0)
8591 return XEXP (XEXP (op, 0), 0);
8592
8593 /* Now handle extended register, as this may also have an optional
8594 left shift by 1..4. */
8595 if (strip_shift
8596 && GET_CODE (op) == ASHIFT
8597 && CONST_INT_P (XEXP (op, 1))
8598 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8599 op = XEXP (op, 0);
8600
8601 if (GET_CODE (op) == ZERO_EXTEND
8602 || GET_CODE (op) == SIGN_EXTEND)
8603 op = XEXP (op, 0);
8604
8605 if (op != x)
8606 return op;
8607
8608 return x;
8609 }
8610
8611 /* Return true iff CODE is a shift supported in combination
8612 with arithmetic instructions. */
8613
8614 static bool
8615 aarch64_shift_p (enum rtx_code code)
8616 {
8617 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8618 }
8619
8620
8621 /* Return true iff X is a cheap shift without a sign extend. */
8622
8623 static bool
8624 aarch64_cheap_mult_shift_p (rtx x)
8625 {
8626 rtx op0, op1;
8627
8628 op0 = XEXP (x, 0);
8629 op1 = XEXP (x, 1);
8630
8631 if (!(aarch64_tune_params.extra_tuning_flags
8632 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8633 return false;
8634
8635 if (GET_CODE (op0) == SIGN_EXTEND)
8636 return false;
8637
8638 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8639 && UINTVAL (op1) <= 4)
8640 return true;
8641
8642 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8643 return false;
8644
8645 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8646
8647 if (l2 > 0 && l2 <= 4)
8648 return true;
8649
8650 return false;
8651 }
8652
8653 /* Helper function for rtx cost calculation. Calculate the cost of
8654 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8655 Return the calculated cost of the expression, recursing manually in to
8656 operands where needed. */
8657
8658 static int
8659 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8660 {
8661 rtx op0, op1;
8662 const struct cpu_cost_table *extra_cost
8663 = aarch64_tune_params.insn_extra_cost;
8664 int cost = 0;
8665 bool compound_p = (outer == PLUS || outer == MINUS);
8666 machine_mode mode = GET_MODE (x);
8667
8668 gcc_checking_assert (code == MULT);
8669
8670 op0 = XEXP (x, 0);
8671 op1 = XEXP (x, 1);
8672
8673 if (VECTOR_MODE_P (mode))
8674 mode = GET_MODE_INNER (mode);
8675
8676 /* Integer multiply/fma. */
8677 if (GET_MODE_CLASS (mode) == MODE_INT)
8678 {
8679 /* The multiply will be canonicalized as a shift, cost it as such. */
8680 if (aarch64_shift_p (GET_CODE (x))
8681 || (CONST_INT_P (op1)
8682 && exact_log2 (INTVAL (op1)) > 0))
8683 {
8684 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8685 || GET_CODE (op0) == SIGN_EXTEND;
8686 if (speed)
8687 {
8688 if (compound_p)
8689 {
8690 /* If the shift is considered cheap,
8691 then don't add any cost. */
8692 if (aarch64_cheap_mult_shift_p (x))
8693 ;
8694 else if (REG_P (op1))
8695 /* ARITH + shift-by-register. */
8696 cost += extra_cost->alu.arith_shift_reg;
8697 else if (is_extend)
8698 /* ARITH + extended register. We don't have a cost field
8699 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8700 cost += extra_cost->alu.extend_arith;
8701 else
8702 /* ARITH + shift-by-immediate. */
8703 cost += extra_cost->alu.arith_shift;
8704 }
8705 else
8706 /* LSL (immediate). */
8707 cost += extra_cost->alu.shift;
8708
8709 }
8710 /* Strip extends as we will have costed them in the case above. */
8711 if (is_extend)
8712 op0 = aarch64_strip_extend (op0, true);
8713
8714 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8715
8716 return cost;
8717 }
8718
8719 /* MNEG or [US]MNEGL. Extract the NEG operand, indicate that it's a
8720 compound operation and let the cases below handle it. After all,
8721 MNEG is a special-case alias of MSUB. */
8722 if (GET_CODE (op0) == NEG)
8723 {
8724 op0 = XEXP (op0, 0);
8725 compound_p = true;
8726 }
8727
8728 /* Integer multiplies or FMAs have zero/sign extending variants. */
8729 if ((GET_CODE (op0) == ZERO_EXTEND
8730 && GET_CODE (op1) == ZERO_EXTEND)
8731 || (GET_CODE (op0) == SIGN_EXTEND
8732 && GET_CODE (op1) == SIGN_EXTEND))
8733 {
8734 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8735 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8736
8737 if (speed)
8738 {
8739 if (compound_p)
8740 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8741 cost += extra_cost->mult[0].extend_add;
8742 else
8743 /* MUL/SMULL/UMULL. */
8744 cost += extra_cost->mult[0].extend;
8745 }
8746
8747 return cost;
8748 }
8749
8750 /* This is either an integer multiply or a MADD. In both cases
8751 we want to recurse and cost the operands. */
8752 cost += rtx_cost (op0, mode, MULT, 0, speed);
8753 cost += rtx_cost (op1, mode, MULT, 1, speed);
8754
8755 if (speed)
8756 {
8757 if (compound_p)
8758 /* MADD/MSUB. */
8759 cost += extra_cost->mult[mode == DImode].add;
8760 else
8761 /* MUL. */
8762 cost += extra_cost->mult[mode == DImode].simple;
8763 }
8764
8765 return cost;
8766 }
8767 else
8768 {
8769 if (speed)
8770 {
8771 /* Floating-point FMA/FMUL can also support negations of the
8772 operands, unless the rounding mode is upward or downward, in
8773 which case FNMUL differs from FMUL with operand negation. */
8774 bool neg0 = GET_CODE (op0) == NEG;
8775 bool neg1 = GET_CODE (op1) == NEG;
8776 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8777 {
8778 if (neg0)
8779 op0 = XEXP (op0, 0);
8780 if (neg1)
8781 op1 = XEXP (op1, 0);
8782 }
8783
8784 if (compound_p)
8785 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8786 cost += extra_cost->fp[mode == DFmode].fma;
8787 else
8788 /* FMUL/FNMUL. */
8789 cost += extra_cost->fp[mode == DFmode].mult;
8790 }
8791
8792 cost += rtx_cost (op0, mode, MULT, 0, speed);
8793 cost += rtx_cost (op1, mode, MULT, 1, speed);
8794 return cost;
8795 }
8796 }
8797
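/* Compute the cost of memory address X for an access of mode MODE.
   SPEED is true when optimizing for speed rather than size.  */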
8798 static int
8799 aarch64_address_cost (rtx x,
8800 machine_mode mode,
8801 addr_space_t as ATTRIBUTE_UNUSED,
8802 bool speed)
8803 {
8804 enum rtx_code c = GET_CODE (x);
8805 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8806 struct aarch64_address_info info;
8807 int cost = 0;
8808 info.shift = 0;
8809
8810 if (!aarch64_classify_address (&info, x, mode, false))
8811 {
8812 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8813 {
8814 /* This is a CONST or SYMBOL ref which will be split
8815 in a different way depending on the code model in use.
8816 Cost it through the generic infrastructure. */
8817 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8818 /* Divide through by the cost of one instruction to
8819 bring it to the same units as the address costs. */
8820 cost_symbol_ref /= COSTS_N_INSNS (1);
8821 /* The cost is then the cost of preparing the address,
8822 followed by an immediate (possibly 0) offset. */
8823 return cost_symbol_ref + addr_cost->imm_offset;
8824 }
8825 else
8826 {
8827 /* This is most likely a jump table from a case
8828 statement. */
8829 return addr_cost->register_offset;
8830 }
8831 }
8832
8833 switch (info.type)
8834 {
8835 case ADDRESS_LO_SUM:
8836 case ADDRESS_SYMBOLIC:
8837 case ADDRESS_REG_IMM:
8838 cost += addr_cost->imm_offset;
8839 break;
8840
8841 case ADDRESS_REG_WB:
8842 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8843 cost += addr_cost->pre_modify;
8844 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8845 cost += addr_cost->post_modify;
8846 else
8847 gcc_unreachable ();
8848
8849 break;
8850
8851 case ADDRESS_REG_REG:
8852 cost += addr_cost->register_offset;
8853 break;
8854
8855 case ADDRESS_REG_SXTW:
8856 cost += addr_cost->register_sextend;
8857 break;
8858
8859 case ADDRESS_REG_UXTW:
8860 cost += addr_cost->register_zextend;
8861 break;
8862
8863 default:
8864 gcc_unreachable ();
8865 }
8866
8867
8868 if (info.shift > 0)
8869 {
8870 /* For the sake of calculating the cost of the shifted register
8871 component, we can treat same sized modes in the same way. */
8872 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8873 cost += addr_cost->addr_scale_costs.hi;
8874 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8875 cost += addr_cost->addr_scale_costs.si;
8876 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8877 cost += addr_cost->addr_scale_costs.di;
8878 else
8879 /* We can't tell, or this is a 128-bit vector. */
8880 cost += addr_cost->addr_scale_costs.ti;
8881 }
8882
8883 return cost;
8884 }
8885
8886 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8887 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8888 to be taken. */
8889
8890 int
8891 aarch64_branch_cost (bool speed_p, bool predictable_p)
8892 {
8893 /* When optimizing for speed, use the cost of unpredictable branches. */
8894 const struct cpu_branch_cost *branch_costs =
8895 aarch64_tune_params.branch_costs;
8896
8897 if (!speed_p || predictable_p)
8898 return branch_costs->predictable;
8899 else
8900 return branch_costs->unpredictable;
8901 }
8902
8903 /* Return true if the RTX X in mode MODE is a zero or sign extract
8904 usable in an ADD or SUB (extended register) instruction. */
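/* Example added for clarity: a DImode (plus (zero_extend:DI (reg:SI)) (reg:DI))
   can use the extended-register form ADD Xd, Xn, Wm, UXTW, so the
   SIGN_EXTEND/ZERO_EXTEND-of-register case below returns true.  */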
8905 static bool
8906 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8907 {
8908 /* Catch add with a sign extract.
8909 This is add_<optab><mode>_multp2. */
8910 if (GET_CODE (x) == SIGN_EXTRACT
8911 || GET_CODE (x) == ZERO_EXTRACT)
8912 {
8913 rtx op0 = XEXP (x, 0);
8914 rtx op1 = XEXP (x, 1);
8915 rtx op2 = XEXP (x, 2);
8916
8917 if (GET_CODE (op0) == MULT
8918 && CONST_INT_P (op1)
8919 && op2 == const0_rtx
8920 && CONST_INT_P (XEXP (op0, 1))
8921 && aarch64_is_extend_from_extract (mode,
8922 XEXP (op0, 1),
8923 op1))
8924 {
8925 return true;
8926 }
8927 }
8928 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8929 No shift. */
8930 else if (GET_CODE (x) == SIGN_EXTEND
8931 || GET_CODE (x) == ZERO_EXTEND)
8932 return REG_P (XEXP (x, 0));
8933
8934 return false;
8935 }
8936
8937 static bool
8938 aarch64_frint_unspec_p (unsigned int u)
8939 {
8940 switch (u)
8941 {
8942 case UNSPEC_FRINTZ:
8943 case UNSPEC_FRINTP:
8944 case UNSPEC_FRINTM:
8945 case UNSPEC_FRINTA:
8946 case UNSPEC_FRINTN:
8947 case UNSPEC_FRINTX:
8948 case UNSPEC_FRINTI:
8949 return true;
8950
8951 default:
8952 return false;
8953 }
8954 }
8955
8956 /* Return true iff X is an rtx that will match an extr instruction
8957 i.e. as described in the *extr<mode>5_insn family of patterns.
8958 OP0 and OP1 will be set to the operands of the shifts involved
8959 on success and will be NULL_RTX otherwise. */
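/* Worked example (added, illustrative only): in SImode,
     (ior (ashift X (const_int 24)) (lshiftrt Y (const_int 8)))
   has 24 + 8 == 32 and roughly corresponds to EXTR Wd, X, Y, #8.  */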
8960
8961 static bool
8962 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8963 {
8964 rtx op0, op1;
8965 scalar_int_mode mode;
8966 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8967 return false;
8968
8969 *res_op0 = NULL_RTX;
8970 *res_op1 = NULL_RTX;
8971
8972 if (GET_CODE (x) != IOR)
8973 return false;
8974
8975 op0 = XEXP (x, 0);
8976 op1 = XEXP (x, 1);
8977
8978 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8979 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8980 {
8981 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8982 if (GET_CODE (op1) == ASHIFT)
8983 std::swap (op0, op1);
8984
8985 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8986 return false;
8987
8988 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8989 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8990
8991 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8992 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8993 {
8994 *res_op0 = XEXP (op0, 0);
8995 *res_op1 = XEXP (op1, 0);
8996 return true;
8997 }
8998 }
8999
9000 return false;
9001 }
9002
9003 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9004 storing it in *COST. Result is true if the total cost of the operation
9005 has now been calculated. */
9006 static bool
9007 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9008 {
9009 rtx inner;
9010 rtx comparator;
9011 enum rtx_code cmpcode;
9012
9013 if (COMPARISON_P (op0))
9014 {
9015 inner = XEXP (op0, 0);
9016 comparator = XEXP (op0, 1);
9017 cmpcode = GET_CODE (op0);
9018 }
9019 else
9020 {
9021 inner = op0;
9022 comparator = const0_rtx;
9023 cmpcode = NE;
9024 }
9025
9026 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9027 {
9028 /* Conditional branch. */
9029 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9030 return true;
9031 else
9032 {
9033 if (cmpcode == NE || cmpcode == EQ)
9034 {
9035 if (comparator == const0_rtx)
9036 {
9037 /* TBZ/TBNZ/CBZ/CBNZ. */
9038 if (GET_CODE (inner) == ZERO_EXTRACT)
9039 /* TBZ/TBNZ. */
9040 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9041 ZERO_EXTRACT, 0, speed);
9042 else
9043 /* CBZ/CBNZ. */
9044 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9045
9046 return true;
9047 }
9048 }
9049 else if (cmpcode == LT || cmpcode == GE)
9050 {
9051 /* TBZ/TBNZ. */
9052 if (comparator == const0_rtx)
9053 return true;
9054 }
9055 }
9056 }
9057 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9058 {
9059 /* CCMP. */
9060 if (GET_CODE (op1) == COMPARE)
9061 {
9062 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9063 if (XEXP (op1, 1) == const0_rtx)
9064 *cost += 1;
9065 if (speed)
9066 {
9067 machine_mode mode = GET_MODE (XEXP (op1, 0));
9068 const struct cpu_cost_table *extra_cost
9069 = aarch64_tune_params.insn_extra_cost;
9070
9071 if (GET_MODE_CLASS (mode) == MODE_INT)
9072 *cost += extra_cost->alu.arith;
9073 else
9074 *cost += extra_cost->fp[mode == DFmode].compare;
9075 }
9076 return true;
9077 }
9078
9079 /* It's a conditional operation based on the status flags,
9080 so it must be some flavor of CSEL. */
9081
9082 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9083 if (GET_CODE (op1) == NEG
9084 || GET_CODE (op1) == NOT
9085 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9086 op1 = XEXP (op1, 0);
9087 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9088 {
9089 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9090 op1 = XEXP (op1, 0);
9091 op2 = XEXP (op2, 0);
9092 }
9093
9094 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9095 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9096 return true;
9097 }
9098
9099 /* We don't know what this is, so cost all operands. */
9100 return false;
9101 }
9102
9103 /* Check whether X is a bitfield operation of the form shift + extend that
9104 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9105 operand to which the bitfield operation is applied. Otherwise return
9106 NULL_RTX. */
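/* For example (illustrative only),
     (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int 3)))
   can be implemented as a single UBFX, so R is returned here.  */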
9107
9108 static rtx
9109 aarch64_extend_bitfield_pattern_p (rtx x)
9110 {
9111 rtx_code outer_code = GET_CODE (x);
9112 machine_mode outer_mode = GET_MODE (x);
9113
9114 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9115 && outer_mode != SImode && outer_mode != DImode)
9116 return NULL_RTX;
9117
9118 rtx inner = XEXP (x, 0);
9119 rtx_code inner_code = GET_CODE (inner);
9120 machine_mode inner_mode = GET_MODE (inner);
9121 rtx op = NULL_RTX;
9122
9123 switch (inner_code)
9124 {
9125 case ASHIFT:
9126 if (CONST_INT_P (XEXP (inner, 1))
9127 && (inner_mode == QImode || inner_mode == HImode))
9128 op = XEXP (inner, 0);
9129 break;
9130 case LSHIFTRT:
9131 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9132 && (inner_mode == QImode || inner_mode == HImode))
9133 op = XEXP (inner, 0);
9134 break;
9135 case ASHIFTRT:
9136 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9137 && (inner_mode == QImode || inner_mode == HImode))
9138 op = XEXP (inner, 0);
9139 break;
9140 default:
9141 break;
9142 }
9143
9144 return op;
9145 }
9146
9147 /* Return true if the mask and a shift amount from an RTX of the form
9148 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9149 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
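/* Worked example (added): in SImode, MASK = 0xff0 and SHFT_AMNT = 4 pass the
   checks below: 0xff0 >> 4 == 0xff, 0xff + 1 is a power of two, and the low
   4 bits of the mask are clear, so (X << 4) & 0xff0 can become
   UBFIZ Wd, Wn, #4, #8.  */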
9150
9151 bool
9152 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9153 rtx shft_amnt)
9154 {
9155 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9156 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9157 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9158 && (INTVAL (mask)
9159 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9160 }
9161
9162 /* Calculate the cost of calculating X, storing it in *COST. Result
9163 is true if the total cost of the operation has now been calculated. */
9164 static bool
9165 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9166 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9167 {
9168 rtx op0, op1, op2;
9169 const struct cpu_cost_table *extra_cost
9170 = aarch64_tune_params.insn_extra_cost;
9171 int code = GET_CODE (x);
9172 scalar_int_mode int_mode;
9173
9174 /* By default, assume that everything has equivalent cost to the
9175 cheapest instruction. Any additional costs are applied as a delta
9176 above this default. */
9177 *cost = COSTS_N_INSNS (1);
9178
9179 switch (code)
9180 {
9181 case SET:
9182 /* The cost depends entirely on the operands to SET. */
9183 *cost = 0;
9184 op0 = SET_DEST (x);
9185 op1 = SET_SRC (x);
9186
9187 switch (GET_CODE (op0))
9188 {
9189 case MEM:
9190 if (speed)
9191 {
9192 rtx address = XEXP (op0, 0);
9193 if (VECTOR_MODE_P (mode))
9194 *cost += extra_cost->ldst.storev;
9195 else if (GET_MODE_CLASS (mode) == MODE_INT)
9196 *cost += extra_cost->ldst.store;
9197 else if (mode == SFmode)
9198 *cost += extra_cost->ldst.storef;
9199 else if (mode == DFmode)
9200 *cost += extra_cost->ldst.stored;
9201
9202 *cost +=
9203 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9204 0, speed));
9205 }
9206
9207 *cost += rtx_cost (op1, mode, SET, 1, speed);
9208 return true;
9209
9210 case SUBREG:
9211 if (! REG_P (SUBREG_REG (op0)))
9212 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9213
9214 /* Fall through. */
9215 case REG:
9216 /* The cost is one per vector-register copied. */
9217 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9218 {
9219 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9220 *cost = COSTS_N_INSNS (nregs);
9221 }
9222 /* const0_rtx is in general free, but we will use an
9223 instruction to set a register to 0. */
9224 else if (REG_P (op1) || op1 == const0_rtx)
9225 {
9226 /* The cost is 1 per register copied. */
9227 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9228 *cost = COSTS_N_INSNS (nregs);
9229 }
9230 else
9231 /* Cost is just the cost of the RHS of the set. */
9232 *cost += rtx_cost (op1, mode, SET, 1, speed);
9233 return true;
9234
9235 case ZERO_EXTRACT:
9236 case SIGN_EXTRACT:
9237 /* Bit-field insertion. Strip any redundant widening of
9238 the RHS to meet the width of the target. */
9239 if (GET_CODE (op1) == SUBREG)
9240 op1 = SUBREG_REG (op1);
9241 if ((GET_CODE (op1) == ZERO_EXTEND
9242 || GET_CODE (op1) == SIGN_EXTEND)
9243 && CONST_INT_P (XEXP (op0, 1))
9244 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9245 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9246 op1 = XEXP (op1, 0);
9247
9248 if (CONST_INT_P (op1))
9249 {
9250 /* MOV immediate is assumed to always be cheap. */
9251 *cost = COSTS_N_INSNS (1);
9252 }
9253 else
9254 {
9255 /* BFM. */
9256 if (speed)
9257 *cost += extra_cost->alu.bfi;
9258 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9259 }
9260
9261 return true;
9262
9263 default:
9264 /* We can't make sense of this, assume default cost. */
9265 *cost = COSTS_N_INSNS (1);
9266 return false;
9267 }
9268 return false;
9269
9270 case CONST_INT:
9271 /* If an instruction can incorporate a constant within the
9272 instruction, the instruction's expression avoids calling
9273 rtx_cost() on the constant. If rtx_cost() is called on a
9274 constant, then it is usually because the constant must be
9275 moved into a register by one or more instructions.
9276
9277 The exception is constant 0, which can be expressed
9278 as XZR/WZR and is therefore free. The exception to this is
9279 if we have (set (reg) (const0_rtx)) in which case we must cost
9280 the move. However, we can catch that when we cost the SET, so
9281 we don't need to consider that here. */
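/* For example (illustrative), 0x12345678 needs a MOV plus a MOVK, so
   aarch64_internal_mov_immediate returns 2 and the cost below becomes
   COSTS_N_INSNS (2).  */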
9282 if (x == const0_rtx)
9283 *cost = 0;
9284 else
9285 {
9286 /* To an approximation, the cost of building any other constant is
9287 proportional to the number of instructions required to build
9288 that constant. This is true whether we
9289 are compiling for SPEED or otherwise. */
9290 if (!is_a <scalar_int_mode> (mode, &int_mode))
9291 int_mode = word_mode;
9292 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9293 (NULL_RTX, x, false, int_mode));
9294 }
9295 return true;
9296
9297 case CONST_DOUBLE:
9298
9299 /* First determine number of instructions to do the move
9300 as an integer constant. */
9301 if (!aarch64_float_const_representable_p (x)
9302 && !aarch64_can_const_movi_rtx_p (x, mode)
9303 && aarch64_float_const_rtx_p (x))
9304 {
9305 unsigned HOST_WIDE_INT ival;
9306 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9307 gcc_assert (succeed);
9308
9309 scalar_int_mode imode = (mode == HFmode
9310 ? SImode
9311 : int_mode_for_mode (mode).require ());
9312 int ncost = aarch64_internal_mov_immediate
9313 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9314 *cost += COSTS_N_INSNS (ncost);
9315 return true;
9316 }
9317
9318 if (speed)
9319 {
9320 /* mov[df,sf]_aarch64. */
9321 if (aarch64_float_const_representable_p (x))
9322 /* FMOV (scalar immediate). */
9323 *cost += extra_cost->fp[mode == DFmode].fpconst;
9324 else if (!aarch64_float_const_zero_rtx_p (x))
9325 {
9326 /* This will be a load from memory. */
9327 if (mode == DFmode)
9328 *cost += extra_cost->ldst.loadd;
9329 else
9330 *cost += extra_cost->ldst.loadf;
9331 }
9332 else
9333 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9334 or MOV v0.s[0], wzr - neither of which is modeled by the
9335 cost tables. Just use the default cost. */
9336 {
9337 }
9338 }
9339
9340 return true;
9341
9342 case MEM:
9343 if (speed)
9344 {
9345 /* For loads we want the base cost of a load, plus an
9346 approximation for the additional cost of the addressing
9347 mode. */
9348 rtx address = XEXP (x, 0);
9349 if (VECTOR_MODE_P (mode))
9350 *cost += extra_cost->ldst.loadv;
9351 else if (GET_MODE_CLASS (mode) == MODE_INT)
9352 *cost += extra_cost->ldst.load;
9353 else if (mode == SFmode)
9354 *cost += extra_cost->ldst.loadf;
9355 else if (mode == DFmode)
9356 *cost += extra_cost->ldst.loadd;
9357
9358 *cost +=
9359 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9360 0, speed));
9361 }
9362
9363 return true;
9364
9365 case NEG:
9366 op0 = XEXP (x, 0);
9367
9368 if (VECTOR_MODE_P (mode))
9369 {
9370 if (speed)
9371 {
9372 /* FNEG. */
9373 *cost += extra_cost->vect.alu;
9374 }
9375 return false;
9376 }
9377
9378 if (GET_MODE_CLASS (mode) == MODE_INT)
9379 {
9380 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9381 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9382 {
9383 /* CSETM. */
9384 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9385 return true;
9386 }
9387
9388 /* Cost this as SUB wzr, X. */
9389 op0 = CONST0_RTX (mode);
9390 op1 = XEXP (x, 0);
9391 goto cost_minus;
9392 }
9393
9394 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9395 {
9396 /* Support (neg(fma...)) as a single instruction only if
9397 sign of zeros is unimportant. This matches the decision
9398 making in aarch64.md. */
9399 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9400 {
9401 /* FNMADD. */
9402 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9403 return true;
9404 }
9405 if (GET_CODE (op0) == MULT)
9406 {
9407 /* FNMUL. */
9408 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9409 return true;
9410 }
9411 if (speed)
9412 /* FNEG. */
9413 *cost += extra_cost->fp[mode == DFmode].neg;
9414 return false;
9415 }
9416
9417 return false;
9418
9419 case CLRSB:
9420 case CLZ:
9421 if (speed)
9422 {
9423 if (VECTOR_MODE_P (mode))
9424 *cost += extra_cost->vect.alu;
9425 else
9426 *cost += extra_cost->alu.clz;
9427 }
9428
9429 return false;
9430
9431 case COMPARE:
9432 op0 = XEXP (x, 0);
9433 op1 = XEXP (x, 1);
9434
9435 if (op1 == const0_rtx
9436 && GET_CODE (op0) == AND)
9437 {
9438 x = op0;
9439 mode = GET_MODE (op0);
9440 goto cost_logic;
9441 }
9442
9443 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9444 {
9445 /* TODO: A write to the CC flags possibly costs extra; this
9446 needs encoding in the cost tables. */
9447
9448 mode = GET_MODE (op0);
9449 /* ANDS. */
9450 if (GET_CODE (op0) == AND)
9451 {
9452 x = op0;
9453 goto cost_logic;
9454 }
9455
9456 if (GET_CODE (op0) == PLUS)
9457 {
9458 /* ADDS (and CMN alias). */
9459 x = op0;
9460 goto cost_plus;
9461 }
9462
9463 if (GET_CODE (op0) == MINUS)
9464 {
9465 /* SUBS. */
9466 x = op0;
9467 goto cost_minus;
9468 }
9469
9470 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9471 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9472 && CONST_INT_P (XEXP (op0, 2)))
9473 {
9474 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9475 Handle it here directly rather than going to cost_logic
9476 since we know the immediate generated for the TST is valid
9477 so we can avoid creating an intermediate rtx for it only
9478 for costing purposes. */
9479 if (speed)
9480 *cost += extra_cost->alu.logical;
9481
9482 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9483 ZERO_EXTRACT, 0, speed);
9484 return true;
9485 }
9486
9487 if (GET_CODE (op1) == NEG)
9488 {
9489 /* CMN. */
9490 if (speed)
9491 *cost += extra_cost->alu.arith;
9492
9493 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9494 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9495 return true;
9496 }
9497
9498 /* CMP.
9499
9500 Compare can freely swap the order of operands, and
9501 canonicalization puts the more complex operation first.
9502 But the integer MINUS logic expects the shift/extend
9503 operation in op1. */
9504 if (! (REG_P (op0)
9505 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9506 {
9507 op0 = XEXP (x, 1);
9508 op1 = XEXP (x, 0);
9509 }
9510 goto cost_minus;
9511 }
9512
9513 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9514 {
9515 /* FCMP. */
9516 if (speed)
9517 *cost += extra_cost->fp[mode == DFmode].compare;
9518
9519 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9520 {
9521 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9522 /* FCMP supports constant 0.0 for no extra cost. */
9523 return true;
9524 }
9525 return false;
9526 }
9527
9528 if (VECTOR_MODE_P (mode))
9529 {
9530 /* Vector compare. */
9531 if (speed)
9532 *cost += extra_cost->vect.alu;
9533
9534 if (aarch64_float_const_zero_rtx_p (op1))
9535 {
9536 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9537 cost. */
9538 return true;
9539 }
9540 return false;
9541 }
9542 return false;
9543
9544 case MINUS:
9545 {
9546 op0 = XEXP (x, 0);
9547 op1 = XEXP (x, 1);
9548
9549 cost_minus:
9550 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9551
9552 /* Detect valid immediates. */
9553 if ((GET_MODE_CLASS (mode) == MODE_INT
9554 || (GET_MODE_CLASS (mode) == MODE_CC
9555 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9556 && CONST_INT_P (op1)
9557 && aarch64_uimm12_shift (INTVAL (op1)))
9558 {
9559 if (speed)
9560 /* SUB(S) (immediate). */
9561 *cost += extra_cost->alu.arith;
9562 return true;
9563 }
9564
9565 /* Look for SUB (extended register). */
9566 if (is_a <scalar_int_mode> (mode, &int_mode)
9567 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9568 {
9569 if (speed)
9570 *cost += extra_cost->alu.extend_arith;
9571
9572 op1 = aarch64_strip_extend (op1, true);
9573 *cost += rtx_cost (op1, VOIDmode,
9574 (enum rtx_code) GET_CODE (op1), 0, speed);
9575 return true;
9576 }
9577
9578 rtx new_op1 = aarch64_strip_extend (op1, false);
9579
9580 /* Cost this as an FMA-alike operation. */
9581 if ((GET_CODE (new_op1) == MULT
9582 || aarch64_shift_p (GET_CODE (new_op1)))
9583 && code != COMPARE)
9584 {
9585 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9586 (enum rtx_code) code,
9587 speed);
9588 return true;
9589 }
9590
9591 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9592
9593 if (speed)
9594 {
9595 if (VECTOR_MODE_P (mode))
9596 {
9597 /* Vector SUB. */
9598 *cost += extra_cost->vect.alu;
9599 }
9600 else if (GET_MODE_CLASS (mode) == MODE_INT)
9601 {
9602 /* SUB(S). */
9603 *cost += extra_cost->alu.arith;
9604 }
9605 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9606 {
9607 /* FSUB. */
9608 *cost += extra_cost->fp[mode == DFmode].addsub;
9609 }
9610 }
9611 return true;
9612 }
9613
9614 case PLUS:
9615 {
9616 rtx new_op0;
9617
9618 op0 = XEXP (x, 0);
9619 op1 = XEXP (x, 1);
9620
9621 cost_plus:
9622 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9623 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9624 {
9625 /* CSINC. */
9626 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9627 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9628 return true;
9629 }
9630
9631 if (GET_MODE_CLASS (mode) == MODE_INT
9632 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9633 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9634 {
9635 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9636
9637 if (speed)
9638 /* ADD (immediate). */
9639 *cost += extra_cost->alu.arith;
9640 return true;
9641 }
9642
9643 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9644
9645 /* Look for ADD (extended register). */
9646 if (is_a <scalar_int_mode> (mode, &int_mode)
9647 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9648 {
9649 if (speed)
9650 *cost += extra_cost->alu.extend_arith;
9651
9652 op0 = aarch64_strip_extend (op0, true);
9653 *cost += rtx_cost (op0, VOIDmode,
9654 (enum rtx_code) GET_CODE (op0), 0, speed);
9655 return true;
9656 }
9657
9658 /* Strip any extend, leave shifts behind as we will
9659 cost them through mult_cost. */
9660 new_op0 = aarch64_strip_extend (op0, false);
9661
9662 if (GET_CODE (new_op0) == MULT
9663 || aarch64_shift_p (GET_CODE (new_op0)))
9664 {
9665 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9666 speed);
9667 return true;
9668 }
9669
9670 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9671
9672 if (speed)
9673 {
9674 if (VECTOR_MODE_P (mode))
9675 {
9676 /* Vector ADD. */
9677 *cost += extra_cost->vect.alu;
9678 }
9679 else if (GET_MODE_CLASS (mode) == MODE_INT)
9680 {
9681 /* ADD. */
9682 *cost += extra_cost->alu.arith;
9683 }
9684 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9685 {
9686 /* FADD. */
9687 *cost += extra_cost->fp[mode == DFmode].addsub;
9688 }
9689 }
9690 return true;
9691 }
9692
9693 case BSWAP:
9694 *cost = COSTS_N_INSNS (1);
9695
9696 if (speed)
9697 {
9698 if (VECTOR_MODE_P (mode))
9699 *cost += extra_cost->vect.alu;
9700 else
9701 *cost += extra_cost->alu.rev;
9702 }
9703 return false;
9704
9705 case IOR:
9706 if (aarch_rev16_p (x))
9707 {
9708 *cost = COSTS_N_INSNS (1);
9709
9710 if (speed)
9711 {
9712 if (VECTOR_MODE_P (mode))
9713 *cost += extra_cost->vect.alu;
9714 else
9715 *cost += extra_cost->alu.rev;
9716 }
9717 return true;
9718 }
9719
9720 if (aarch64_extr_rtx_p (x, &op0, &op1))
9721 {
9722 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9723 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9724 if (speed)
9725 *cost += extra_cost->alu.shift;
9726
9727 return true;
9728 }
9729 /* Fall through. */
9730 case XOR:
9731 case AND:
9732 cost_logic:
9733 op0 = XEXP (x, 0);
9734 op1 = XEXP (x, 1);
9735
9736 if (VECTOR_MODE_P (mode))
9737 {
9738 if (speed)
9739 *cost += extra_cost->vect.alu;
9740 return true;
9741 }
9742
9743 if (code == AND
9744 && GET_CODE (op0) == MULT
9745 && CONST_INT_P (XEXP (op0, 1))
9746 && CONST_INT_P (op1)
9747 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9748 INTVAL (op1)) != 0)
9749 {
9750 /* This is a UBFM/SBFM. */
9751 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9752 if (speed)
9753 *cost += extra_cost->alu.bfx;
9754 return true;
9755 }
9756
9757 if (is_int_mode (mode, &int_mode))
9758 {
9759 if (CONST_INT_P (op1))
9760 {
9761 /* We have a mask + shift version of a UBFIZ
9762 i.e. the *andim_ashift<mode>_bfiz pattern. */
9763 if (GET_CODE (op0) == ASHIFT
9764 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9765 XEXP (op0, 1)))
9766 {
9767 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9768 (enum rtx_code) code, 0, speed);
9769 if (speed)
9770 *cost += extra_cost->alu.bfx;
9771
9772 return true;
9773 }
9774 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9775 {
9776 /* We possibly get the immediate for free; this is not
9777 modelled. */
9778 *cost += rtx_cost (op0, int_mode,
9779 (enum rtx_code) code, 0, speed);
9780 if (speed)
9781 *cost += extra_cost->alu.logical;
9782
9783 return true;
9784 }
9785 }
9786 else
9787 {
9788 rtx new_op0 = op0;
9789
9790 /* Handle ORN, EON, or BIC. */
9791 if (GET_CODE (op0) == NOT)
9792 op0 = XEXP (op0, 0);
9793
9794 new_op0 = aarch64_strip_shift (op0);
9795
9796 /* If we had a shift on op0 then this is a logical-shift-
9797 by-register/immediate operation. Otherwise, this is just
9798 a logical operation. */
9799 if (speed)
9800 {
9801 if (new_op0 != op0)
9802 {
9803 /* Shift by immediate. */
9804 if (CONST_INT_P (XEXP (op0, 1)))
9805 *cost += extra_cost->alu.log_shift;
9806 else
9807 *cost += extra_cost->alu.log_shift_reg;
9808 }
9809 else
9810 *cost += extra_cost->alu.logical;
9811 }
9812
9813 /* In both cases we want to cost both operands. */
9814 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9815 0, speed);
9816 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9817 1, speed);
9818
9819 return true;
9820 }
9821 }
9822 return false;
9823
9824 case NOT:
9825 x = XEXP (x, 0);
9826 op0 = aarch64_strip_shift (x);
9827
9828 if (VECTOR_MODE_P (mode))
9829 {
9830 /* Vector NOT. */
9831 *cost += extra_cost->vect.alu;
9832 return false;
9833 }
9834
9835 /* MVN-shifted-reg. */
9836 if (op0 != x)
9837 {
9838 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9839
9840 if (speed)
9841 *cost += extra_cost->alu.log_shift;
9842
9843 return true;
9844 }
9845 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9846 Handle the second form here taking care that 'a' in the above can
9847 be a shift. */
9848 else if (GET_CODE (op0) == XOR)
9849 {
9850 rtx newop0 = XEXP (op0, 0);
9851 rtx newop1 = XEXP (op0, 1);
9852 rtx op0_stripped = aarch64_strip_shift (newop0);
9853
9854 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9855 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9856
9857 if (speed)
9858 {
9859 if (op0_stripped != newop0)
9860 *cost += extra_cost->alu.log_shift;
9861 else
9862 *cost += extra_cost->alu.logical;
9863 }
9864
9865 return true;
9866 }
9867 /* MVN. */
9868 if (speed)
9869 *cost += extra_cost->alu.logical;
9870
9871 return false;
9872
9873 case ZERO_EXTEND:
9874
9875 op0 = XEXP (x, 0);
9876 /* If a value is written in SI mode, then zero extended to DI
9877 mode, the operation will in general be free as a write to
9878 a 'w' register implicitly zeroes the upper bits of an 'x'
9879 register. However, if this is
9880
9881 (set (reg) (zero_extend (reg)))
9882
9883 we must cost the explicit register move. */
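/* Background note (added): e.g. "ADD W0, W1, W2" already zeroes bits 63:32
   of X0, so a subsequent zero_extend from SImode to DImode needs no extra
   instruction unless it is a bare register-to-register move.  */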
9884 if (mode == DImode
9885 && GET_MODE (op0) == SImode
9886 && outer == SET)
9887 {
9888 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9889
9890 /* If OP_COST is non-zero, then the cost of the zero extend
9891 is effectively the cost of the inner operation. Otherwise
9892 we have a MOV instruction and we take the cost from the MOV
9893 itself. This is true independently of whether we are
9894 optimizing for space or time. */
9895 if (op_cost)
9896 *cost = op_cost;
9897
9898 return true;
9899 }
9900 else if (MEM_P (op0))
9901 {
9902 /* All loads can zero extend to any size for free. */
9903 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9904 return true;
9905 }
9906
9907 op0 = aarch64_extend_bitfield_pattern_p (x);
9908 if (op0)
9909 {
9910 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9911 if (speed)
9912 *cost += extra_cost->alu.bfx;
9913 return true;
9914 }
9915
9916 if (speed)
9917 {
9918 if (VECTOR_MODE_P (mode))
9919 {
9920 /* UMOV. */
9921 *cost += extra_cost->vect.alu;
9922 }
9923 else
9924 {
9925 /* We generate an AND instead of UXTB/UXTH. */
9926 *cost += extra_cost->alu.logical;
9927 }
9928 }
9929 return false;
9930
9931 case SIGN_EXTEND:
9932 if (MEM_P (XEXP (x, 0)))
9933 {
9934 /* LDRSH. */
9935 if (speed)
9936 {
9937 rtx address = XEXP (XEXP (x, 0), 0);
9938 *cost += extra_cost->ldst.load_sign_extend;
9939
9940 *cost +=
9941 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9942 0, speed));
9943 }
9944 return true;
9945 }
9946
9947 op0 = aarch64_extend_bitfield_pattern_p (x);
9948 if (op0)
9949 {
9950 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9951 if (speed)
9952 *cost += extra_cost->alu.bfx;
9953 return true;
9954 }
9955
9956 if (speed)
9957 {
9958 if (VECTOR_MODE_P (mode))
9959 *cost += extra_cost->vect.alu;
9960 else
9961 *cost += extra_cost->alu.extend;
9962 }
9963 return false;
9964
9965 case ASHIFT:
9966 op0 = XEXP (x, 0);
9967 op1 = XEXP (x, 1);
9968
9969 if (CONST_INT_P (op1))
9970 {
9971 if (speed)
9972 {
9973 if (VECTOR_MODE_P (mode))
9974 {
9975 /* Vector shift (immediate). */
9976 *cost += extra_cost->vect.alu;
9977 }
9978 else
9979 {
9980 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9981 aliases. */
9982 *cost += extra_cost->alu.shift;
9983 }
9984 }
9985
9986 /* We can incorporate zero/sign extend for free. */
9987 if (GET_CODE (op0) == ZERO_EXTEND
9988 || GET_CODE (op0) == SIGN_EXTEND)
9989 op0 = XEXP (op0, 0);
9990
9991 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9992 return true;
9993 }
9994 else
9995 {
9996 if (VECTOR_MODE_P (mode))
9997 {
9998 if (speed)
9999 /* Vector shift (register). */
10000 *cost += extra_cost->vect.alu;
10001 }
10002 else
10003 {
10004 if (speed)
10005 /* LSLV. */
10006 *cost += extra_cost->alu.shift_reg;
10007
10008 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10009 && CONST_INT_P (XEXP (op1, 1))
10010 && known_eq (INTVAL (XEXP (op1, 1)),
10011 GET_MODE_BITSIZE (mode) - 1))
10012 {
10013 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10014 /* We already demanded XEXP (op1, 0) to be REG_P, so
10015 don't recurse into it. */
10016 return true;
10017 }
10018 }
10019 return false; /* All arguments need to be in registers. */
10020 }
10021
10022 case ROTATE:
10023 case ROTATERT:
10024 case LSHIFTRT:
10025 case ASHIFTRT:
10026 op0 = XEXP (x, 0);
10027 op1 = XEXP (x, 1);
10028
10029 if (CONST_INT_P (op1))
10030 {
10031 /* ASR (immediate) and friends. */
10032 if (speed)
10033 {
10034 if (VECTOR_MODE_P (mode))
10035 *cost += extra_cost->vect.alu;
10036 else
10037 *cost += extra_cost->alu.shift;
10038 }
10039
10040 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10041 return true;
10042 }
10043 else
10044 {
10045 if (VECTOR_MODE_P (mode))
10046 {
10047 if (speed)
10048 /* Vector shift (register). */
10049 *cost += extra_cost->vect.alu;
10050 }
10051 else
10052 {
10053 if (speed)
10054 /* ASR (register) and friends. */
10055 *cost += extra_cost->alu.shift_reg;
10056
10057 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10058 && CONST_INT_P (XEXP (op1, 1))
10059 && known_eq (INTVAL (XEXP (op1, 1)),
10060 GET_MODE_BITSIZE (mode) - 1))
10061 {
10062 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10063 /* We already demanded XEXP (op1, 0) to be REG_P, so
10064 don't recurse into it. */
10065 return true;
10066 }
10067 }
10068 return false; /* All arguments need to be in registers. */
10069 }
10070
10071 case SYMBOL_REF:
10072
10073 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10074 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10075 {
10076 /* LDR. */
10077 if (speed)
10078 *cost += extra_cost->ldst.load;
10079 }
10080 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10081 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10082 {
10083 /* ADRP, followed by ADD. */
10084 *cost += COSTS_N_INSNS (1);
10085 if (speed)
10086 *cost += 2 * extra_cost->alu.arith;
10087 }
10088 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10089 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10090 {
10091 /* ADR. */
10092 if (speed)
10093 *cost += extra_cost->alu.arith;
10094 }
10095
10096 if (flag_pic)
10097 {
10098 /* One extra load instruction, after accessing the GOT. */
10099 *cost += COSTS_N_INSNS (1);
10100 if (speed)
10101 *cost += extra_cost->ldst.load;
10102 }
10103 return true;
10104
10105 case HIGH:
10106 case LO_SUM:
10107 /* ADRP/ADD (immediate). */
10108 if (speed)
10109 *cost += extra_cost->alu.arith;
10110 return true;
10111
10112 case ZERO_EXTRACT:
10113 case SIGN_EXTRACT:
10114 /* UBFX/SBFX. */
10115 if (speed)
10116 {
10117 if (VECTOR_MODE_P (mode))
10118 *cost += extra_cost->vect.alu;
10119 else
10120 *cost += extra_cost->alu.bfx;
10121 }
10122
10123 /* We can trust that the immediates used will be correct (there
10124 are no by-register forms), so we need only cost op0. */
10125 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10126 return true;
10127
10128 case MULT:
10129 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10130 /* aarch64_rtx_mult_cost always handles recursion to its
10131 operands. */
10132 return true;
10133
10134 case MOD:
10135 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10136 ANDs and a CSNEG. Assume here that a CSNEG costs the same as an
10137 unconditional negate. This case should only ever be reached through
10138 the set_smod_pow2_cheap check in expmed.c. */
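/* Sketch of the expansion (added, illustrative only; the real sequence comes
   from the expander): x % 4 in SImode becomes something like
     negs  w1, w0
     and   w0, w0, #3
     and   w1, w1, #3
     csneg w0, w0, w1, mi
   hence the four-instruction baseline below.  */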
10139 if (CONST_INT_P (XEXP (x, 1))
10140 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10141 && (mode == SImode || mode == DImode))
10142 {
10143 /* We expand to 4 instructions. Reset the baseline. */
10144 *cost = COSTS_N_INSNS (4);
10145
10146 if (speed)
10147 *cost += 2 * extra_cost->alu.logical
10148 + 2 * extra_cost->alu.arith;
10149
10150 return true;
10151 }
10152
10153 /* Fall-through. */
10154 case UMOD:
10155 if (speed)
10156 {
10157 /* Slightly prefer UMOD over SMOD. */
10158 if (VECTOR_MODE_P (mode))
10159 *cost += extra_cost->vect.alu;
10160 else if (GET_MODE_CLASS (mode) == MODE_INT)
10161 *cost += (extra_cost->mult[mode == DImode].add
10162 + extra_cost->mult[mode == DImode].idiv
10163 + (code == MOD ? 1 : 0));
10164 }
10165 return false; /* All arguments need to be in registers. */
10166
10167 case DIV:
10168 case UDIV:
10169 case SQRT:
10170 if (speed)
10171 {
10172 if (VECTOR_MODE_P (mode))
10173 *cost += extra_cost->vect.alu;
10174 else if (GET_MODE_CLASS (mode) == MODE_INT)
10175 /* There is no integer SQRT, so only DIV and UDIV can get
10176 here. */
10177 *cost += (extra_cost->mult[mode == DImode].idiv
10178 /* Slightly prefer UDIV over SDIV. */
10179 + (code == DIV ? 1 : 0));
10180 else
10181 *cost += extra_cost->fp[mode == DFmode].div;
10182 }
10183 return false; /* All arguments need to be in registers. */
10184
10185 case IF_THEN_ELSE:
10186 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10187 XEXP (x, 2), cost, speed);
10188
10189 case EQ:
10190 case NE:
10191 case GT:
10192 case GTU:
10193 case LT:
10194 case LTU:
10195 case GE:
10196 case GEU:
10197 case LE:
10198 case LEU:
10199
10200 return false; /* All arguments must be in registers. */
10201
10202 case FMA:
10203 op0 = XEXP (x, 0);
10204 op1 = XEXP (x, 1);
10205 op2 = XEXP (x, 2);
10206
10207 if (speed)
10208 {
10209 if (VECTOR_MODE_P (mode))
10210 *cost += extra_cost->vect.alu;
10211 else
10212 *cost += extra_cost->fp[mode == DFmode].fma;
10213 }
10214
10215 /* FMSUB, FNMADD, and FNMSUB are free. */
10216 if (GET_CODE (op0) == NEG)
10217 op0 = XEXP (op0, 0);
10218
10219 if (GET_CODE (op2) == NEG)
10220 op2 = XEXP (op2, 0);
10221
10222 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10223 and the by-element operand as operand 0. */
10224 if (GET_CODE (op1) == NEG)
10225 op1 = XEXP (op1, 0);
10226
10227 /* Catch vector-by-element operations. The by-element operand can
10228 either be (vec_duplicate (vec_select (x))) or just
10229 (vec_select (x)), depending on whether we are multiplying by
10230 a vector or a scalar.
10231
10232 Canonicalization is not very good in these cases: FMA4 will put the
10233 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10234 if (GET_CODE (op0) == VEC_DUPLICATE)
10235 op0 = XEXP (op0, 0);
10236 else if (GET_CODE (op1) == VEC_DUPLICATE)
10237 op1 = XEXP (op1, 0);
10238
10239 if (GET_CODE (op0) == VEC_SELECT)
10240 op0 = XEXP (op0, 0);
10241 else if (GET_CODE (op1) == VEC_SELECT)
10242 op1 = XEXP (op1, 0);
10243
10244 /* If the remaining parameters are not registers,
10245 get the cost to put them into registers. */
10246 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10247 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10248 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10249 return true;
10250
10251 case FLOAT:
10252 case UNSIGNED_FLOAT:
10253 if (speed)
10254 *cost += extra_cost->fp[mode == DFmode].fromint;
10255 return false;
10256
10257 case FLOAT_EXTEND:
10258 if (speed)
10259 {
10260 if (VECTOR_MODE_P (mode))
10261 {
10262 /* Vector widening conversion. */
10263 *cost += extra_cost->vect.alu;
10264 }
10265 else
10266 *cost += extra_cost->fp[mode == DFmode].widen;
10267 }
10268 return false;
10269
10270 case FLOAT_TRUNCATE:
10271 if (speed)
10272 {
10273 if (VECTOR_MODE_P (mode))
10274 {
10275 /* Vector narrowing conversion. */
10276 *cost += extra_cost->vect.alu;
10277 }
10278 else
10279 *cost += extra_cost->fp[mode == DFmode].narrow;
10280 }
10281 return false;
10282
10283 case FIX:
10284 case UNSIGNED_FIX:
10285 x = XEXP (x, 0);
10286 /* Strip the rounding part. They will all be implemented
10287 by the fcvt* family of instructions anyway. */
10288 if (GET_CODE (x) == UNSPEC)
10289 {
10290 unsigned int uns_code = XINT (x, 1);
10291
10292 if (uns_code == UNSPEC_FRINTA
10293 || uns_code == UNSPEC_FRINTM
10294 || uns_code == UNSPEC_FRINTN
10295 || uns_code == UNSPEC_FRINTP
10296 || uns_code == UNSPEC_FRINTZ)
10297 x = XVECEXP (x, 0, 0);
10298 }
10299
10300 if (speed)
10301 {
10302 if (VECTOR_MODE_P (mode))
10303 *cost += extra_cost->vect.alu;
10304 else
10305 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10306 }
10307
10308 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10309 fixed-point fcvt. */
10310 if (GET_CODE (x) == MULT
10311 && ((VECTOR_MODE_P (mode)
10312 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10313 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10314 {
10315 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10316 0, speed);
10317 return true;
10318 }
10319
10320 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10321 return true;
10322
10323 case ABS:
10324 if (VECTOR_MODE_P (mode))
10325 {
10326 /* ABS (vector). */
10327 if (speed)
10328 *cost += extra_cost->vect.alu;
10329 }
10330 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10331 {
10332 op0 = XEXP (x, 0);
10333
10334 /* FABD, which is analogous to FADD. */
10335 if (GET_CODE (op0) == MINUS)
10336 {
10337 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10338 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10339 if (speed)
10340 *cost += extra_cost->fp[mode == DFmode].addsub;
10341
10342 return true;
10343 }
10344 /* Simple FABS is analogous to FNEG. */
10345 if (speed)
10346 *cost += extra_cost->fp[mode == DFmode].neg;
10347 }
10348 else
10349 {
10350 /* Integer ABS will either be split to
10351 two arithmetic instructions, or will be an ABS
10352 (scalar), which we don't model. */
10353 *cost = COSTS_N_INSNS (2);
10354 if (speed)
10355 *cost += 2 * extra_cost->alu.arith;
10356 }
10357 return false;
10358
10359 case SMAX:
10360 case SMIN:
10361 if (speed)
10362 {
10363 if (VECTOR_MODE_P (mode))
10364 *cost += extra_cost->vect.alu;
10365 else
10366 {
10367 /* FMAXNM/FMINNM/FMAX/FMIN.
10368 TODO: This may not be accurate for all implementations, but
10369 we do not model this in the cost tables. */
10370 *cost += extra_cost->fp[mode == DFmode].addsub;
10371 }
10372 }
10373 return false;
10374
10375 case UNSPEC:
10376 /* The floating point round to integer frint* instructions. */
10377 if (aarch64_frint_unspec_p (XINT (x, 1)))
10378 {
10379 if (speed)
10380 *cost += extra_cost->fp[mode == DFmode].roundint;
10381
10382 return false;
10383 }
10384
10385 if (XINT (x, 1) == UNSPEC_RBIT)
10386 {
10387 if (speed)
10388 *cost += extra_cost->alu.rev;
10389
10390 return false;
10391 }
10392 break;
10393
10394 case TRUNCATE:
10395
10396 /* Decompose <su>muldi3_highpart. */
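/* Added note: the pattern matched below is the high 64 bits of a
   64x64->128-bit widening multiply, i.e. a single UMULH or SMULH.  */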
10397 if (/* (truncate:DI */
10398 mode == DImode
10399 /* (lshiftrt:TI */
10400 && GET_MODE (XEXP (x, 0)) == TImode
10401 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10402 /* (mult:TI */
10403 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10404 /* (ANY_EXTEND:TI (reg:DI))
10405 (ANY_EXTEND:TI (reg:DI))) */
10406 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10407 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10408 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10409 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10410 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10411 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10412 /* (const_int 64) */
10413 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10414 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10415 {
10416 /* UMULH/SMULH. */
10417 if (speed)
10418 *cost += extra_cost->mult[mode == DImode].extend;
10419 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10420 mode, MULT, 0, speed);
10421 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10422 mode, MULT, 1, speed);
10423 return true;
10424 }
10425
10426 /* Fall through. */
10427 default:
10428 break;
10429 }
10430
10431 if (dump_file
10432 && flag_aarch64_verbose_cost)
10433 fprintf (dump_file,
10434 "\nFailed to cost RTX. Assuming default cost.\n");
10435
10436 return true;
10437 }
10438
10439 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10440 calculated for X. This cost is stored in *COST. Returns true
10441 if the total cost of X was calculated. */
10442 static bool
10443 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10444 int param, int *cost, bool speed)
10445 {
10446 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10447
10448 if (dump_file
10449 && flag_aarch64_verbose_cost)
10450 {
10451 print_rtl_single (dump_file, x);
10452 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10453 speed ? "Hot" : "Cold",
10454 *cost, result ? "final" : "partial");
10455 }
10456
10457 return result;
10458 }
10459
10460 static int
10461 aarch64_register_move_cost (machine_mode mode,
10462 reg_class_t from_i, reg_class_t to_i)
10463 {
10464 enum reg_class from = (enum reg_class) from_i;
10465 enum reg_class to = (enum reg_class) to_i;
10466 const struct cpu_regmove_cost *regmove_cost
10467 = aarch64_tune_params.regmove_cost;
10468
10469 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10470 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10471 to = GENERAL_REGS;
10472
10473 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10474 from = GENERAL_REGS;
10475
10476 /* Moving between GPR and stack cost is the same as GP2GP. */
10477 if ((from == GENERAL_REGS && to == STACK_REG)
10478 || (to == GENERAL_REGS && from == STACK_REG))
10479 return regmove_cost->GP2GP;
10480
10481 /* To/From the stack register, we move via the gprs. */
10482 if (to == STACK_REG || from == STACK_REG)
10483 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10484 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10485
10486 if (known_eq (GET_MODE_SIZE (mode), 16))
10487 {
10488 /* 128-bit operations on general registers require 2 instructions. */
10489 if (from == GENERAL_REGS && to == GENERAL_REGS)
10490 return regmove_cost->GP2GP * 2;
10491 else if (from == GENERAL_REGS)
10492 return regmove_cost->GP2FP * 2;
10493 else if (to == GENERAL_REGS)
10494 return regmove_cost->FP2GP * 2;
10495
10496 /* When AdvSIMD instructions are disabled it is not possible to move
10497 a 128-bit value directly between Q registers. This is handled in
10498 secondary reload. A general register is used as a scratch to move
10499 the upper DI value and the lower DI value is moved directly,
10500 hence the cost is the sum of three moves. */
10501 if (! TARGET_SIMD)
10502 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10503
10504 return regmove_cost->FP2FP;
10505 }
10506
10507 if (from == GENERAL_REGS && to == GENERAL_REGS)
10508 return regmove_cost->GP2GP;
10509 else if (from == GENERAL_REGS)
10510 return regmove_cost->GP2FP;
10511 else if (to == GENERAL_REGS)
10512 return regmove_cost->FP2GP;
10513
10514 return regmove_cost->FP2FP;
10515 }
10516
10517 static int
10518 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10519 reg_class_t rclass ATTRIBUTE_UNUSED,
10520 bool in ATTRIBUTE_UNUSED)
10521 {
10522 return aarch64_tune_params.memmov_cost;
10523 }
10524
10525 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10526 to optimize 1.0/sqrt. */
10527
10528 static bool
10529 use_rsqrt_p (machine_mode mode)
10530 {
10531 return (!flag_trapping_math
10532 && flag_unsafe_math_optimizations
10533 && ((aarch64_tune_params.approx_modes->recip_sqrt
10534 & AARCH64_APPROX_MODE (mode))
10535 || flag_mrecip_low_precision_sqrt));
10536 }
10537
10538 /* Function to decide when to use the approximate reciprocal square root
10539 builtin. */
10540
10541 static tree
10542 aarch64_builtin_reciprocal (tree fndecl)
10543 {
10544 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10545
10546 if (!use_rsqrt_p (mode))
10547 return NULL_TREE;
10548 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10549 }
10550
10551 /* Emit instruction sequence to compute either the approximate square root
10552 or its approximate reciprocal, depending on the flag RECP, and return
10553 whether the sequence was emitted or not. */
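/* Added note on the math: the loop below performs Newton-Raphson steps
   x' = x * (3 - S * x * x) / 2 for 1/sqrt(S); FRSQRTE provides the initial
   estimate and FRSQRTS computes (3 - a * b) / 2.  */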
10554
10555 bool
10556 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10557 {
10558 machine_mode mode = GET_MODE (dst);
10559
10560 if (GET_MODE_INNER (mode) == HFmode)
10561 {
10562 gcc_assert (!recp);
10563 return false;
10564 }
10565
10566 if (!recp)
10567 {
10568 if (!(flag_mlow_precision_sqrt
10569 || (aarch64_tune_params.approx_modes->sqrt
10570 & AARCH64_APPROX_MODE (mode))))
10571 return false;
10572
10573 if (flag_finite_math_only
10574 || flag_trapping_math
10575 || !flag_unsafe_math_optimizations
10576 || optimize_function_for_size_p (cfun))
10577 return false;
10578 }
10579 else
10580 /* Caller assumes we cannot fail. */
10581 gcc_assert (use_rsqrt_p (mode));
10582
10583 machine_mode mmsk = mode_for_int_vector (mode).require ();
10584 rtx xmsk = gen_reg_rtx (mmsk);
10585 if (!recp)
10586 /* When calculating the approximate square root, compare the
10587 argument with 0.0 and create a mask. */
10588 emit_insn (gen_rtx_SET (xmsk,
10589 gen_rtx_NEG (mmsk,
10590 gen_rtx_EQ (mmsk, src,
10591 CONST0_RTX (mode)))));
10592
10593 /* Estimate the approximate reciprocal square root. */
10594 rtx xdst = gen_reg_rtx (mode);
10595 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10596
10597 /* Iterate over the series twice for SF and thrice for DF. */
10598 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10599
10600 /* Optionally iterate over the series once less for faster performance
10601 while sacrificing some accuracy. */
10602 if ((recp && flag_mrecip_low_precision_sqrt)
10603 || (!recp && flag_mlow_precision_sqrt))
10604 iterations--;
10605
10606 /* Iterate over the series to calculate the approximate reciprocal square
10607 root. */
10608 rtx x1 = gen_reg_rtx (mode);
10609 while (iterations--)
10610 {
10611 rtx x2 = gen_reg_rtx (mode);
10612 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10613
10614 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10615
10616 if (iterations > 0)
10617 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10618 }
10619
10620 if (!recp)
10621 {
10622 /* Qualify the approximate reciprocal square root when the argument is
10623 0.0 by squashing the intermediary result to 0.0. */
10624 rtx xtmp = gen_reg_rtx (mmsk);
10625 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10626 gen_rtx_SUBREG (mmsk, xdst, 0)));
10627 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10628
10629 /* Calculate the approximate square root. */
10630 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10631 }
10632
10633 /* Finalize the approximation. */
10634 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10635
10636 return true;
10637 }
10638
10639 /* Emit the instruction sequence to compute the approximation for the division
10640 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
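/* Added note on the math: the loop below refines an estimate x of 1/DEN
   with x' = x * (2 - DEN * x); FRECPE provides the initial estimate and
   FRECPS computes (2 - a * b).  */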
10641
10642 bool
10643 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10644 {
10645 machine_mode mode = GET_MODE (quo);
10646
10647 if (GET_MODE_INNER (mode) == HFmode)
10648 return false;
10649
10650 bool use_approx_division_p = (flag_mlow_precision_div
10651 || (aarch64_tune_params.approx_modes->division
10652 & AARCH64_APPROX_MODE (mode)));
10653
10654 if (!flag_finite_math_only
10655 || flag_trapping_math
10656 || !flag_unsafe_math_optimizations
10657 || optimize_function_for_size_p (cfun)
10658 || !use_approx_division_p)
10659 return false;
10660
10661 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10662 return false;
10663
10664 /* Estimate the approximate reciprocal. */
10665 rtx xrcp = gen_reg_rtx (mode);
10666 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10667
10668 /* Iterate over the series twice for SF and thrice for DF. */
10669 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10670
10671 /* Optionally iterate over the series once less for faster performance,
10672 while sacrificing some accuracy. */
10673 if (flag_mlow_precision_div)
10674 iterations--;
10675
10676 /* Iterate over the series to calculate the approximate reciprocal. */
10677 rtx xtmp = gen_reg_rtx (mode);
10678 while (iterations--)
10679 {
10680 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10681
10682 if (iterations > 0)
10683 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10684 }
10685
10686 if (num != CONST1_RTX (mode))
10687 {
10688 /* As the approximate reciprocal of DEN is already calculated, only
10689 calculate the approximate division when NUM is not 1.0. */
10690 rtx xnum = force_reg (mode, num);
10691 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10692 }
10693
10694 /* Finalize the approximation. */
10695 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10696 return true;
10697 }
10698
10699 /* Return the number of instructions that can be issued per cycle. */
10700 static int
10701 aarch64_sched_issue_rate (void)
10702 {
10703 return aarch64_tune_params.issue_rate;
10704 }
10705
10706 static int
10707 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10708 {
10709 int issue_rate = aarch64_sched_issue_rate ();
10710
10711 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10712 }
10713
10714
10715 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10716 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10717 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10718
10719 static int
10720 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10721 int ready_index)
10722 {
10723 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10724 }
10725
10726
10727 /* Vectorizer cost model target hooks. */
10728
10729 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10730 static int
10731 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10732 tree vectype,
10733 int misalign ATTRIBUTE_UNUSED)
10734 {
10735 unsigned elements;
10736 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10737 bool fp = false;
10738
10739 if (vectype != NULL)
10740 fp = FLOAT_TYPE_P (vectype);
10741
10742 switch (type_of_cost)
10743 {
10744 case scalar_stmt:
10745 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10746
10747 case scalar_load:
10748 return costs->scalar_load_cost;
10749
10750 case scalar_store:
10751 return costs->scalar_store_cost;
10752
10753 case vector_stmt:
10754 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10755
10756 case vector_load:
10757 return costs->vec_align_load_cost;
10758
10759 case vector_store:
10760 return costs->vec_store_cost;
10761
10762 case vec_to_scalar:
10763 return costs->vec_to_scalar_cost;
10764
10765 case scalar_to_vec:
10766 return costs->scalar_to_vec_cost;
10767
10768 case unaligned_load:
10769 case vector_gather_load:
10770 return costs->vec_unalign_load_cost;
10771
10772 case unaligned_store:
10773 case vector_scatter_store:
10774 return costs->vec_unalign_store_cost;
10775
10776 case cond_branch_taken:
10777 return costs->cond_taken_branch_cost;
10778
10779 case cond_branch_not_taken:
10780 return costs->cond_not_taken_branch_cost;
10781
10782 case vec_perm:
10783 return costs->vec_permute_cost;
10784
10785 case vec_promote_demote:
10786 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10787
10788 case vec_construct:
10789 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10790 return elements / 2 + 1;
10791
10792 default:
10793 gcc_unreachable ();
10794 }
10795 }
10796
10797 /* Implement targetm.vectorize.add_stmt_cost. */
10798 static unsigned
10799 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10800 struct _stmt_vec_info *stmt_info, int misalign,
10801 enum vect_cost_model_location where)
10802 {
10803 unsigned *cost = (unsigned *) data;
10804 unsigned retval = 0;
10805
10806 if (flag_vect_cost_model)
10807 {
10808 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10809 int stmt_cost =
10810 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10811
10812 /* Statements in an inner loop relative to the loop being
10813 vectorized are weighted more heavily. The value here is
10814 arbitrary and could potentially be improved with analysis. */
10815 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10816 count *= 50; /* FIXME */
10817
10818 retval = (unsigned) (count * stmt_cost);
10819 cost[where] += retval;
10820 }
10821
10822 return retval;
10823 }
10824
10825 static void initialize_aarch64_code_model (struct gcc_options *);
10826
10827 /* Parse the TO_PARSE string and put the architecture struct that it
10828 selects into RES and the architectural features into ISA_FLAGS.
10829 Return an aarch64_parse_opt_result describing the parse result.
10830 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10831 When the TO_PARSE string contains an invalid extension,
10832 a copy of the string is created and stored to INVALID_EXTENSION. */
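/* For example (illustrative), TO_PARSE == "armv8.2-a+crypto" matches the
   armv8.2-a entry below and then hands "+crypto" to
   aarch64_parse_extension.  */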
10833
10834 static enum aarch64_parse_opt_result
10835 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10836 unsigned long *isa_flags, std::string *invalid_extension)
10837 {
10838 const char *ext;
10839 const struct processor *arch;
10840 size_t len;
10841
10842 ext = strchr (to_parse, '+');
10843
10844 if (ext != NULL)
10845 len = ext - to_parse;
10846 else
10847 len = strlen (to_parse);
10848
10849 if (len == 0)
10850 return AARCH64_PARSE_MISSING_ARG;
10851
10852
10853 /* Loop through the list of supported ARCHes to find a match. */
10854 for (arch = all_architectures; arch->name != NULL; arch++)
10855 {
10856 if (strlen (arch->name) == len
10857 && strncmp (arch->name, to_parse, len) == 0)
10858 {
10859 unsigned long isa_temp = arch->flags;
10860
10861 if (ext != NULL)
10862 {
10863 /* TO_PARSE string contains at least one extension. */
10864 enum aarch64_parse_opt_result ext_res
10865 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10866
10867 if (ext_res != AARCH64_PARSE_OK)
10868 return ext_res;
10869 }
10870 /* Extension parsing was successful. Confirm the result
10871 arch and ISA flags. */
10872 *res = arch;
10873 *isa_flags = isa_temp;
10874 return AARCH64_PARSE_OK;
10875 }
10876 }
10877
10878 /* ARCH name not found in list. */
10879 return AARCH64_PARSE_INVALID_ARG;
10880 }
10881
10882 /* Parse the TO_PARSE string and put the result tuning in RES and the
10883 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10884 describing the parse result. If there is an error parsing, RES and
10885 ISA_FLAGS are left unchanged.
10886 When the TO_PARSE string contains an invalid extension,
10887 a copy of the string is created and stored to INVALID_EXTENSION. */
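/* For example, a TO_PARSE string of "cortex-a57+crypto" (illustrative; the
   accepted names come from all_cores and the extension tables) would select
   the cortex-a57 entry and pass "+crypto" on to aarch64_parse_extension.  */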
10888
10889 static enum aarch64_parse_opt_result
10890 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10891 unsigned long *isa_flags, std::string *invalid_extension)
10892 {
10893 const char *ext;
10894 const struct processor *cpu;
10895 size_t len;
10896
10897 ext = strchr (to_parse, '+');
10898
10899 if (ext != NULL)
10900 len = ext - to_parse;
10901 else
10902 len = strlen (to_parse);
10903
10904 if (len == 0)
10905 return AARCH64_PARSE_MISSING_ARG;
10906
10907
10908 /* Loop through the list of supported CPUs to find a match. */
10909 for (cpu = all_cores; cpu->name != NULL; cpu++)
10910 {
10911 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
10912 {
10913 unsigned long isa_temp = cpu->flags;
10914
10915
10916 if (ext != NULL)
10917 {
10918 /* TO_PARSE string contains at least one extension. */
10919 enum aarch64_parse_opt_result ext_res
10920 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10921
10922 if (ext_res != AARCH64_PARSE_OK)
10923 return ext_res;
10924 }
10925 /* Extension parsing was successful. Confirm the result
10926 cpu and ISA flags. */
10927 *res = cpu;
10928 *isa_flags = isa_temp;
10929 return AARCH64_PARSE_OK;
10930 }
10931 }
10932
10933 /* CPU name not found in list. */
10934 return AARCH64_PARSE_INVALID_ARG;
10935 }
10936
10937 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10938 Return an aarch64_parse_opt_result describing the parse result.
10939 If the parsing fails, RES is left unchanged. */
10940
10941 static enum aarch64_parse_opt_result
10942 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10943 {
10944 const struct processor *cpu;
10945
10946 /* Loop through the list of supported CPUs to find a match. */
10947 for (cpu = all_cores; cpu->name != NULL; cpu++)
10948 {
10949 if (strcmp (cpu->name, to_parse) == 0)
10950 {
10951 *res = cpu;
10952 return AARCH64_PARSE_OK;
10953 }
10954 }
10955
10956 /* CPU name not found in list. */
10957 return AARCH64_PARSE_INVALID_ARG;
10958 }
10959
10960 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10961 described in FLAG. If it is, return the index bit for that fusion type.
10962 If not, report an error (printing OPTION_NAME) and return zero. */
10963
10964 static unsigned int
10965 aarch64_parse_one_option_token (const char *token,
10966 size_t length,
10967 const struct aarch64_flag_desc *flag,
10968 const char *option_name)
10969 {
10970 for (; flag->name != NULL; flag++)
10971 {
10972 if (length == strlen (flag->name)
10973 && !strncmp (flag->name, token, length))
10974 return flag->flag;
10975 }
10976
10977 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10978 return 0;
10979 }
10980
10981 /* Parse OPTION which is a comma-separated list of flags to enable.
10982 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10983 default state we inherit from the CPU tuning structures. OPTION_NAME
10984 gives the top-level option we are parsing in the -moverride string,
10985 for use in error messages. */
10986
10987 static unsigned int
10988 aarch64_parse_boolean_options (const char *option,
10989 const struct aarch64_flag_desc *flags,
10990 unsigned int initial_state,
10991 const char *option_name)
10992 {
10993 const char separator = '.';
10994 const char* specs = option;
10995 const char* ntoken = option;
10996 unsigned int found_flags = initial_state;
10997
10998 while ((ntoken = strchr (specs, separator)))
10999 {
11000 size_t token_length = ntoken - specs;
11001 unsigned token_ops = aarch64_parse_one_option_token (specs,
11002 token_length,
11003 flags,
11004 option_name);
11005 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11006 in the token stream, reset the supported operations. So:
11007
11008 adrp+add.cmp+branch.none.adrp+add
11009
11010 would have the result of turning on only adrp+add fusion. */
11011 if (!token_ops)
11012 found_flags = 0;
11013
11014 found_flags |= token_ops;
11015 specs = ++ntoken;
11016 }
11017
11018 /* The string ended with a separator; report it as ill-formed. */
11019 if (!(*specs))
11020 {
11021 error ("%s string ill-formed\n", option_name);
11022 return 0;
11023 }
11024
11025 /* We still have one more token to parse. */
11026 size_t token_length = strlen (specs);
11027 unsigned token_ops = aarch64_parse_one_option_token (specs,
11028 token_length,
11029 flags,
11030 option_name);
11031 if (!token_ops)
11032 found_flags = 0;
11033
11034 found_flags |= token_ops;
11035 return found_flags;
11036 }
11037
11038 /* Support for overriding instruction fusion. */
11039
11040 static void
11041 aarch64_parse_fuse_string (const char *fuse_string,
11042 struct tune_params *tune)
11043 {
11044 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11045 aarch64_fusible_pairs,
11046 tune->fusible_ops,
11047 "fuse=");
11048 }
11049
11050 /* Support for overriding other tuning flags. */
11051
11052 static void
11053 aarch64_parse_tune_string (const char *tune_string,
11054 struct tune_params *tune)
11055 {
11056 tune->extra_tuning_flags
11057 = aarch64_parse_boolean_options (tune_string,
11058 aarch64_tuning_flags,
11059 tune->extra_tuning_flags,
11060 "tune=");
11061 }
11062
11063 /* Parse the sve_width -moverride string in TUNE_STRING.
11064 Accept the valid SVE vector widths allowed by
11065 aarch64_sve_vector_bits_enum and use the value to override sve_width
11066 in TUNE. */
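/* For example, "sve_width=256" sets sve_width to 256 (SVE_256); any value
   other than 128, 256, 512, 1024 or 2048 is diagnosed as invalid.  */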
11067
11068 static void
11069 aarch64_parse_sve_width_string (const char *tune_string,
11070 struct tune_params *tune)
11071 {
11072 int width = -1;
11073
11074 int n = sscanf (tune_string, "%d", &width);
11075 if (n == EOF)
11076 {
11077 error ("invalid format for sve_width");
11078 return;
11079 }
11080 switch (width)
11081 {
11082 case SVE_128:
11083 case SVE_256:
11084 case SVE_512:
11085 case SVE_1024:
11086 case SVE_2048:
11087 break;
11088 default:
11089 error ("invalid sve_width value: %d", width);
11090 }
11091 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11092 }
11093
11094 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11095 we understand. If it is, extract the option string and hand it off to
11096 the appropriate function. */
11097
11098 void
11099 aarch64_parse_one_override_token (const char* token,
11100 size_t length,
11101 struct tune_params *tune)
11102 {
11103 const struct aarch64_tuning_override_function *fn
11104 = aarch64_tuning_override_functions;
11105
11106 const char *option_part = strchr (token, '=');
11107 if (!option_part)
11108 {
11109 error ("tuning string missing in option (%s)", token);
11110 return;
11111 }
11112
11113 /* Get the length of the option name. */
11114 length = option_part - token;
11115 /* Skip the '=' to get to the option string. */
11116 option_part++;
11117
11118 for (; fn->name != NULL; fn++)
11119 {
11120 if (!strncmp (fn->name, token, length))
11121 {
11122 fn->parse_override (option_part, tune);
11123 return;
11124 }
11125 }
11126
11127 error ("unknown tuning option (%s)",token);
11128 return;
11129 }
11130
11131 /* Validate and clamp aarch64_tls_size for the code model selected in OPTS. */
11132
11133 static void
11134 initialize_aarch64_tls_size (struct gcc_options *opts)
11135 {
11136 if (aarch64_tls_size == 0)
11137 aarch64_tls_size = 24;
11138
11139 switch (opts->x_aarch64_cmodel_var)
11140 {
11141 case AARCH64_CMODEL_TINY:
11142 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11143 needs two instructions to address, so we clamp the size to 24. */
11144 if (aarch64_tls_size > 24)
11145 aarch64_tls_size = 24;
11146 break;
11147 case AARCH64_CMODEL_SMALL:
11148 /* The maximum TLS size allowed under small is 4G. */
11149 if (aarch64_tls_size > 32)
11150 aarch64_tls_size = 32;
11151 break;
11152 case AARCH64_CMODEL_LARGE:
11153 /* The maximum TLS size allowed under large is 16E.
11154 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
11155 if (aarch64_tls_size > 48)
11156 aarch64_tls_size = 48;
11157 break;
11158 default:
11159 gcc_unreachable ();
11160 }
11161
11162 return;
11163 }
11164
11165 /* Parse STRING looking for options in the format:
11166 string :: option:string
11167 option :: name=substring
11168 name :: {a-z}
11169 substring :: defined by option. */
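/* An illustrative -moverride string, using option names handled by
   aarch64_tuning_override_functions and the fusion pair names shown in
   aarch64_parse_boolean_options above:
     -moverride=fuse=adrp+add.cmp+branch:sve_width=256  */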
11170
11171 static void
11172 aarch64_parse_override_string (const char* input_string,
11173 struct tune_params* tune)
11174 {
11175 const char separator = ':';
11176 size_t string_length = strlen (input_string) + 1;
11177 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11178 char *string = string_root;
11179 strncpy (string, input_string, string_length);
11180 string[string_length - 1] = '\0';
11181
11182 char* ntoken = string;
11183
11184 while ((ntoken = strchr (string, separator)))
11185 {
11186 size_t token_length = ntoken - string;
11187 /* Make this substring look like a string. */
11188 *ntoken = '\0';
11189 aarch64_parse_one_override_token (string, token_length, tune);
11190 string = ++ntoken;
11191 }
11192
11193 /* One last option to parse. */
11194 aarch64_parse_one_override_token (string, strlen (string), tune);
11195 free (string_root);
11196 }
11197
11198
11199 static void
11200 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11201 {
11202 if (accepted_branch_protection_string)
11203 {
11204 opts->x_aarch64_branch_protection_string
11205 = xstrdup (accepted_branch_protection_string);
11206 }
11207
11208 /* PR 70044: We have to be careful about being called multiple times for the
11209 same function. This means all changes should be repeatable. */
11210
11211 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11212 Disable the frame pointer flag so the mid-end will not use a frame
11213 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11214 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11215 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11216 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11217 if (opts->x_flag_omit_frame_pointer == 0)
11218 opts->x_flag_omit_frame_pointer = 2;
11219
11220 /* If not optimizing for size, set the default
11221 alignment to what the target wants. */
11222 if (!opts->x_optimize_size)
11223 {
11224 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11225 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11226 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11227 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11228 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11229 opts->x_str_align_functions = aarch64_tune_params.function_align;
11230 }
11231
11232 /* We default to no pc-relative literal loads. */
11233
11234 aarch64_pcrelative_literal_loads = false;
11235
11236 /* If -mpc-relative-literal-loads is set on the command line, this
11237 implies that the user asked for PC relative literal loads. */
11238 if (opts->x_pcrelative_literal_loads == 1)
11239 aarch64_pcrelative_literal_loads = true;
11240
11241 /* In the tiny memory model it makes no sense to disallow PC relative
11242 literal pool loads. */
11243 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11244 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11245 aarch64_pcrelative_literal_loads = true;
11246
11247 /* When enabling the lower precision Newton series for the square root, also
11248 enable it for the reciprocal square root, since the latter is an
11249 intermediary step for the former. */
11250 if (flag_mlow_precision_sqrt)
11251 flag_mrecip_low_precision_sqrt = true;
11252 }
11253
11254 /* 'Unpack' the internal tuning structs and update the options
11255 in OPTS. The caller must have set up selected_tune and selected_arch
11256 as all the other target-specific codegen decisions are
11257 derived from them. */
11258
11259 void
11260 aarch64_override_options_internal (struct gcc_options *opts)
11261 {
11262 aarch64_tune_flags = selected_tune->flags;
11263 aarch64_tune = selected_tune->sched_core;
11264 /* Make a copy of the tuning parameters attached to the core, which
11265 we may later overwrite. */
11266 aarch64_tune_params = *(selected_tune->tune);
11267 aarch64_architecture_version = selected_arch->architecture_version;
11268
11269 if (opts->x_aarch64_override_tune_string)
11270 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11271 &aarch64_tune_params);
11272
11273 /* This target defaults to strict volatile bitfields. */
11274 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11275 opts->x_flag_strict_volatile_bitfields = 1;
11276
11277 initialize_aarch64_code_model (opts);
11278 initialize_aarch64_tls_size (opts);
11279
11280 int queue_depth = 0;
11281 switch (aarch64_tune_params.autoprefetcher_model)
11282 {
11283 case tune_params::AUTOPREFETCHER_OFF:
11284 queue_depth = -1;
11285 break;
11286 case tune_params::AUTOPREFETCHER_WEAK:
11287 queue_depth = 0;
11288 break;
11289 case tune_params::AUTOPREFETCHER_STRONG:
11290 queue_depth = max_insn_queue_index + 1;
11291 break;
11292 default:
11293 gcc_unreachable ();
11294 }
11295
11296 /* We don't mind passing in global_options_set here as we don't use
11297 the *options_set structs anyway. */
11298 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11299 queue_depth,
11300 opts->x_param_values,
11301 global_options_set.x_param_values);
11302
11303 /* Set up parameters to be used in prefetching algorithm. Do not
11304 override the defaults unless we are tuning for a core we have
11305 researched values for. */
11306 if (aarch64_tune_params.prefetch->num_slots > 0)
11307 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11308 aarch64_tune_params.prefetch->num_slots,
11309 opts->x_param_values,
11310 global_options_set.x_param_values);
11311 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11312 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11313 aarch64_tune_params.prefetch->l1_cache_size,
11314 opts->x_param_values,
11315 global_options_set.x_param_values);
11316 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11317 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11318 aarch64_tune_params.prefetch->l1_cache_line_size,
11319 opts->x_param_values,
11320 global_options_set.x_param_values);
11321 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11322 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11323 aarch64_tune_params.prefetch->l2_cache_size,
11324 opts->x_param_values,
11325 global_options_set.x_param_values);
11326 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11327 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11328 0,
11329 opts->x_param_values,
11330 global_options_set.x_param_values);
11331 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11332 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11333 aarch64_tune_params.prefetch->minimum_stride,
11334 opts->x_param_values,
11335 global_options_set.x_param_values);
11336
11337 /* Use the alternative scheduling-pressure algorithm by default. */
11338 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11339 opts->x_param_values,
11340 global_options_set.x_param_values);
11341
11342 /* If the user hasn't changed it via configure then set the default to 64 KB
11343 for the backend; the parameter is a power-of-two exponent, so 16 means 2^16 bytes = 64 KB. */
11344 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11345 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11346 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11347 opts->x_param_values,
11348 global_options_set.x_param_values);
11349
11350 /* Validate the guard size. */
11351 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11352
11353 /* Enforce that the probe interval is the same as the guard size so the
11354 mid-end does the right thing. */
11355 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11356 guard_size,
11357 opts->x_param_values,
11358 global_options_set.x_param_values);
11359
11360 /* The maybe_set calls won't update the value if the user has explicitly set
11361 one. Which means we need to validate that probing interval and guard size
11362 are equal. */
11363 int probe_interval
11364 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11365 if (guard_size != probe_interval)
11366 error ("stack clash guard size '%d' must be equal to probing interval "
11367 "'%d'", guard_size, probe_interval);
11368
11369 /* Enable software prefetching at the specified optimization level for
11370 CPUs that have prefetch. Lower the optimization level threshold by 1
11371 when profiling is enabled. */
11372 if (opts->x_flag_prefetch_loop_arrays < 0
11373 && !opts->x_optimize_size
11374 && aarch64_tune_params.prefetch->default_opt_level >= 0
11375 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11376 opts->x_flag_prefetch_loop_arrays = 1;
11377
11378 if (opts->x_aarch64_arch_string == NULL)
11379 opts->x_aarch64_arch_string = selected_arch->name;
11380 if (opts->x_aarch64_cpu_string == NULL)
11381 opts->x_aarch64_cpu_string = selected_cpu->name;
11382 if (opts->x_aarch64_tune_string == NULL)
11383 opts->x_aarch64_tune_string = selected_tune->name;
11384
11385 aarch64_override_options_after_change_1 (opts);
11386 }
11387
11388 /* Print a hint with a suggestion for a core or architecture name that
11389 most closely resembles what the user passed in STR. ARCH is true if
11390 the user is asking for an architecture name. ARCH is false if the user
11391 is asking for a core name. */
11392
11393 static void
11394 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11395 {
11396 auto_vec<const char *> candidates;
11397 const struct processor *entry = arch ? all_architectures : all_cores;
11398 for (; entry->name != NULL; entry++)
11399 candidates.safe_push (entry->name);
11400
11401 #ifdef HAVE_LOCAL_CPU_DETECT
11402 /* Also add "native" as a possible value. */
11403 if (arch)
11404 candidates.safe_push ("native");
11405 #endif
11406
11407 char *s;
11408 const char *hint = candidates_list_and_hint (str, s, candidates);
11409 if (hint)
11410 inform (input_location, "valid arguments are: %s;"
11411 " did you mean %qs?", s, hint);
11412 else
11413 inform (input_location, "valid arguments are: %s", s);
11414
11415 XDELETEVEC (s);
11416 }
11417
11418 /* Print a hint with a suggestion for a core name that most closely resembles
11419 what the user passed in STR. */
11420
11421 inline static void
11422 aarch64_print_hint_for_core (const char *str)
11423 {
11424 aarch64_print_hint_for_core_or_arch (str, false);
11425 }
11426
11427 /* Print a hint with a suggestion for an architecture name that most closely
11428 resembles what the user passed in STR. */
11429
11430 inline static void
11431 aarch64_print_hint_for_arch (const char *str)
11432 {
11433 aarch64_print_hint_for_core_or_arch (str, true);
11434 }
11435
11436
11437 /* Print a hint with a suggestion for an extension name
11438 that most closely resembles what the user passed in STR. */
11439
11440 void
11441 aarch64_print_hint_for_extensions (const std::string &str)
11442 {
11443 auto_vec<const char *> candidates;
11444 aarch64_get_all_extension_candidates (&candidates);
11445 char *s;
11446 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11447 if (hint)
11448 inform (input_location, "valid arguments are: %s;"
11449 " did you mean %qs?", s, hint);
11450 else
11451 inform (input_location, "valid arguments are: %s", s);
11452
11453 XDELETEVEC (s);
11454 }
11455
11456 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11457 specified in STR and throw errors if appropriate. Put the results if
11458 they are valid in RES and ISA_FLAGS. Return whether the option is
11459 valid. */
11460
11461 static bool
11462 aarch64_validate_mcpu (const char *str, const struct processor **res,
11463 unsigned long *isa_flags)
11464 {
11465 std::string invalid_extension;
11466 enum aarch64_parse_opt_result parse_res
11467 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11468
11469 if (parse_res == AARCH64_PARSE_OK)
11470 return true;
11471
11472 switch (parse_res)
11473 {
11474 case AARCH64_PARSE_MISSING_ARG:
11475 error ("missing cpu name in %<-mcpu=%s%>", str);
11476 break;
11477 case AARCH64_PARSE_INVALID_ARG:
11478 error ("unknown value %qs for -mcpu", str);
11479 aarch64_print_hint_for_core (str);
11480 break;
11481 case AARCH64_PARSE_INVALID_FEATURE:
11482 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11483 invalid_extension.c_str (), str);
11484 aarch64_print_hint_for_extensions (invalid_extension);
11485 break;
11486 default:
11487 gcc_unreachable ();
11488 }
11489
11490 return false;
11491 }
11492
11493 /* Parse CONST_STR for branch protection features specified in
11494 aarch64_branch_protect_types, and set any global variables required.
11495 Return the parsing result and copy the last processed token from
11496 CONST_STR into LAST_STR so that it can be used for error reporting. */
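/* For example, a CONST_STR of "pac-ret+leaf+bti" (illustrative; the
   accepted names come from aarch64_branch_protect_types) enables
   return-address signing with its "leaf" subtype plus BTI, while "none"
   turns all protections off.  */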
11497
11498 static enum aarch64_parse_opt_result
11499 aarch64_parse_branch_protection (const char *const_str,
11500 char **last_str)
11501 {
11502 char *str_root = xstrdup (const_str);
11503 char* token_save = NULL;
11504 char *str = strtok_r (str_root, "+", &token_save);
11505 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11506 if (!str)
11507 res = AARCH64_PARSE_MISSING_ARG;
11508 else
11509 {
11510 char *next_str = strtok_r (NULL, "+", &token_save);
11511 /* Reset the branch protection features to their defaults. */
11512 aarch64_handle_no_branch_protection (NULL, NULL);
11513
11514 while (str && res == AARCH64_PARSE_OK)
11515 {
11516 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11517 bool found = false;
11518 /* Search for this type. */
11519 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11520 {
11521 if (strcmp (str, type->name) == 0)
11522 {
11523 found = true;
11524 res = type->handler (str, next_str);
11525 str = next_str;
11526 next_str = strtok_r (NULL, "+", &token_save);
11527 }
11528 else
11529 type++;
11530 }
11531 if (found && res == AARCH64_PARSE_OK)
11532 {
11533 bool found_subtype = true;
11534 /* Loop through each token until we find one that isn't a
11535 subtype. */
11536 while (found_subtype)
11537 {
11538 found_subtype = false;
11539 const aarch64_branch_protect_type *subtype = type->subtypes;
11540 /* Search for the subtype. */
11541 while (str && subtype && subtype->name && !found_subtype
11542 && res == AARCH64_PARSE_OK)
11543 {
11544 if (strcmp (str, subtype->name) == 0)
11545 {
11546 found_subtype = true;
11547 res = subtype->handler (str, next_str);
11548 str = next_str;
11549 next_str = strtok_r (NULL, "+", &token_save);
11550 }
11551 else
11552 subtype++;
11553 }
11554 }
11555 }
11556 else if (!found)
11557 res = AARCH64_PARSE_INVALID_ARG;
11558 }
11559 }
11560 /* Copy the last processed token into the argument to pass it back.
11561 Used by option and attribute validation to print the offending token. */
11562 if (last_str)
11563 {
11564 if (str) strcpy (*last_str, str);
11565 else *last_str = NULL;
11566 }
11567 if (res == AARCH64_PARSE_OK)
11568 {
11569 /* If needed, alloc the accepted string then copy in const_str.
11570 Used by override_option_after_change_1. */
11571 if (!accepted_branch_protection_string)
11572 accepted_branch_protection_string = (char *) xmalloc (
11573 BRANCH_PROTECT_STR_MAX
11574 + 1);
11575 strncpy (accepted_branch_protection_string, const_str,
11576 BRANCH_PROTECT_STR_MAX + 1);
11577 /* Forcibly null-terminate. */
11578 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11579 }
11580 return res;
11581 }
11582
11583 static bool
11584 aarch64_validate_mbranch_protection (const char *const_str)
11585 {
11586 char *str = (char *) xmalloc (strlen (const_str) + 1);
11587 enum aarch64_parse_opt_result res =
11588 aarch64_parse_branch_protection (const_str, &str);
11589 if (res == AARCH64_PARSE_INVALID_ARG)
11590 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11591 else if (res == AARCH64_PARSE_MISSING_ARG)
11592 error ("missing arg for %<-mbranch-protection=%>");
11593 free (str);
11594 return res == AARCH64_PARSE_OK;
11595 }
11596
11597 /* Validate a command-line -march option. Parse the arch and extensions
11598 (if any) specified in STR and throw errors if appropriate. Put the
11599 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11600 option is valid. */
11601
11602 static bool
11603 aarch64_validate_march (const char *str, const struct processor **res,
11604 unsigned long *isa_flags)
11605 {
11606 std::string invalid_extension;
11607 enum aarch64_parse_opt_result parse_res
11608 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11609
11610 if (parse_res == AARCH64_PARSE_OK)
11611 return true;
11612
11613 switch (parse_res)
11614 {
11615 case AARCH64_PARSE_MISSING_ARG:
11616 error ("missing arch name in %<-march=%s%>", str);
11617 break;
11618 case AARCH64_PARSE_INVALID_ARG:
11619 error ("unknown value %qs for -march", str);
11620 aarch64_print_hint_for_arch (str);
11621 break;
11622 case AARCH64_PARSE_INVALID_FEATURE:
11623 error ("invalid feature modifier %qs in %<-march=%s%>",
11624 invalid_extension.c_str (), str);
11625 aarch64_print_hint_for_extensions (invalid_extension);
11626 break;
11627 default:
11628 gcc_unreachable ();
11629 }
11630
11631 return false;
11632 }
11633
11634 /* Validate a command-line -mtune option. Parse the cpu
11635 specified in STR and throw errors if appropriate. Put the
11636 result, if it is valid, in RES. Return whether the option is
11637 valid. */
11638
11639 static bool
11640 aarch64_validate_mtune (const char *str, const struct processor **res)
11641 {
11642 enum aarch64_parse_opt_result parse_res
11643 = aarch64_parse_tune (str, res);
11644
11645 if (parse_res == AARCH64_PARSE_OK)
11646 return true;
11647
11648 switch (parse_res)
11649 {
11650 case AARCH64_PARSE_MISSING_ARG:
11651 error ("missing cpu name in %<-mtune=%s%>", str);
11652 break;
11653 case AARCH64_PARSE_INVALID_ARG:
11654 error ("unknown value %qs for -mtune", str);
11655 aarch64_print_hint_for_core (str);
11656 break;
11657 default:
11658 gcc_unreachable ();
11659 }
11660 return false;
11661 }
11662
11663 /* Return the CPU corresponding to the enum CPU.
11664 If it doesn't specify a cpu, return the default. */
11665
11666 static const struct processor *
11667 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11668 {
11669 if (cpu != aarch64_none)
11670 return &all_cores[cpu];
11671
11672 /* The & 0x3f is to extract the bottom 6 bits that encode the
11673 default cpu as selected by the --with-cpu GCC configure option
11674 in config.gcc.
11675 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11676 flags mechanism should be reworked to make it more sane. */
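/* The remaining upper bits of TARGET_CPU_DEFAULT carry the default ISA
   flags; see the ">> 6" in aarch64_override_options below.  */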
11677 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11678 }
11679
11680 /* Return the architecture corresponding to the enum ARCH.
11681 If it doesn't specify a valid architecture, return the default. */
11682
11683 static const struct processor *
11684 aarch64_get_arch (enum aarch64_arch arch)
11685 {
11686 if (arch != aarch64_no_arch)
11687 return &all_architectures[arch];
11688
11689 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11690
11691 return &all_architectures[cpu->arch];
11692 }
11693
11694 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11695
11696 static poly_uint16
11697 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11698 {
11699 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11700 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11701 deciding which .md file patterns to use and when deciding whether
11702 something is a legitimate address or constant. */
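/* E.g. SVE_SCALABLE and SVE_128 both give the poly value (2, 2), whereas
   -msve-vector-bits=256 gives a constant VG of 256 / 64 = 4.  */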
11703 if (value == SVE_SCALABLE || value == SVE_128)
11704 return poly_uint16 (2, 2);
11705 else
11706 return (int) value / 64;
11707 }
11708
11709 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11710 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11711 tuning structs. In particular it must set selected_tune and
11712 aarch64_isa_flags that define the available ISA features and tuning
11713 decisions. It must also set selected_arch as this will be used to
11714 output the .arch asm tags for each function. */
11715
11716 static void
11717 aarch64_override_options (void)
11718 {
11719 unsigned long cpu_isa = 0;
11720 unsigned long arch_isa = 0;
11721 aarch64_isa_flags = 0;
11722
11723 bool valid_cpu = true;
11724 bool valid_tune = true;
11725 bool valid_arch = true;
11726
11727 selected_cpu = NULL;
11728 selected_arch = NULL;
11729 selected_tune = NULL;
11730
11731 if (aarch64_branch_protection_string)
11732 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11733
11734 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11735 If either of -march or -mtune is given, they override their
11736 respective component of -mcpu. */
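/* Illustrative example (the exact flags depend on the core tables): with
   -mcpu=cortex-a57 alone, the architecture and ISA flags come from the
   cortex-a57 entry and tuning is for cortex-a57; an explicit -march or
   -mtune overrides the corresponding component.  */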
11737 if (aarch64_cpu_string)
11738 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11739 &cpu_isa);
11740
11741 if (aarch64_arch_string)
11742 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11743 &arch_isa);
11744
11745 if (aarch64_tune_string)
11746 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11747
11748 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11749 SUBTARGET_OVERRIDE_OPTIONS;
11750 #endif
11751
11752 /* If the user did not specify a processor, choose the default
11753 one for them. This will be the CPU set during configuration using
11754 --with-cpu, otherwise it is "generic". */
11755 if (!selected_cpu)
11756 {
11757 if (selected_arch)
11758 {
11759 selected_cpu = &all_cores[selected_arch->ident];
11760 aarch64_isa_flags = arch_isa;
11761 explicit_arch = selected_arch->arch;
11762 }
11763 else
11764 {
11765 /* Get default configure-time CPU. */
11766 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11767 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11768 }
11769
11770 if (selected_tune)
11771 explicit_tune_core = selected_tune->ident;
11772 }
11773 /* If both -mcpu and -march are specified check that they are architecturally
11774 compatible, warn if they're not and prefer the -march ISA flags. */
11775 else if (selected_arch)
11776 {
11777 if (selected_arch->arch != selected_cpu->arch)
11778 {
11779 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11780 all_architectures[selected_cpu->arch].name,
11781 selected_arch->name);
11782 }
11783 aarch64_isa_flags = arch_isa;
11784 explicit_arch = selected_arch->arch;
11785 explicit_tune_core = selected_tune ? selected_tune->ident
11786 : selected_cpu->ident;
11787 }
11788 else
11789 {
11790 /* -mcpu but no -march. */
11791 aarch64_isa_flags = cpu_isa;
11792 explicit_tune_core = selected_tune ? selected_tune->ident
11793 : selected_cpu->ident;
11794 gcc_assert (selected_cpu);
11795 selected_arch = &all_architectures[selected_cpu->arch];
11796 explicit_arch = selected_arch->arch;
11797 }
11798
11799 /* Set the arch as well, since we will need it when outputting
11800 the .arch directive in assembly. */
11801 if (!selected_arch)
11802 {
11803 gcc_assert (selected_cpu);
11804 selected_arch = &all_architectures[selected_cpu->arch];
11805 }
11806
11807 if (!selected_tune)
11808 selected_tune = selected_cpu;
11809
11810 #ifndef HAVE_AS_MABI_OPTION
11811 /* The compiler may have been configured with 2.23.* binutils, which does
11812 not have support for ILP32. */
11813 if (TARGET_ILP32)
11814 error ("assembler does not support -mabi=ilp32");
11815 #endif
11816
11817 /* Convert -msve-vector-bits to a VG count. */
11818 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11819
11820 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11821 sorry ("return address signing is only supported for -mabi=lp64");
11822
11823 /* Make sure we properly set up the explicit options. */
11824 if ((aarch64_cpu_string && valid_cpu)
11825 || (aarch64_tune_string && valid_tune))
11826 gcc_assert (explicit_tune_core != aarch64_none);
11827
11828 if ((aarch64_cpu_string && valid_cpu)
11829 || (aarch64_arch_string && valid_arch))
11830 gcc_assert (explicit_arch != aarch64_no_arch);
11831
11832 /* The pass to insert speculation tracking runs before
11833 shrink-wrapping and the latter does not know how to update the
11834 tracking status. So disable it in this case. */
11835 if (aarch64_track_speculation)
11836 flag_shrink_wrap = 0;
11837
11838 aarch64_override_options_internal (&global_options);
11839
11840 /* Save these options as the default ones in case we push and pop them later
11841 while processing functions with potential target attributes. */
11842 target_option_default_node = target_option_current_node
11843 = build_target_option_node (&global_options);
11844 }
11845
11846 /* Implement targetm.override_options_after_change. */
11847
11848 static void
11849 aarch64_override_options_after_change (void)
11850 {
11851 aarch64_override_options_after_change_1 (&global_options);
11852 }
11853
11854 static struct machine_function *
11855 aarch64_init_machine_status (void)
11856 {
11857 struct machine_function *machine;
11858 machine = ggc_cleared_alloc<machine_function> ();
11859 return machine;
11860 }
11861
11862 void
11863 aarch64_init_expanders (void)
11864 {
11865 init_machine_status = aarch64_init_machine_status;
11866 }
11867
11868 /* Resolve the code model in OPTS, taking PIC into account, and set aarch64_cmodel. */
11869 static void
11870 initialize_aarch64_code_model (struct gcc_options *opts)
11871 {
11872 if (opts->x_flag_pic)
11873 {
11874 switch (opts->x_aarch64_cmodel_var)
11875 {
11876 case AARCH64_CMODEL_TINY:
11877 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11878 break;
11879 case AARCH64_CMODEL_SMALL:
11880 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11881 aarch64_cmodel = (flag_pic == 2
11882 ? AARCH64_CMODEL_SMALL_PIC
11883 : AARCH64_CMODEL_SMALL_SPIC);
11884 #else
11885 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11886 #endif
11887 break;
11888 case AARCH64_CMODEL_LARGE:
11889 sorry ("code model %qs with -f%s", "large",
11890 opts->x_flag_pic > 1 ? "PIC" : "pic");
11891 break;
11892 default:
11893 gcc_unreachable ();
11894 }
11895 }
11896 else
11897 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11898 }
11899
11900 /* Implement TARGET_OPTION_SAVE. */
11901
11902 static void
11903 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11904 {
11905 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11906 ptr->x_aarch64_branch_protection_string
11907 = opts->x_aarch64_branch_protection_string;
11908 }
11909
11910 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11911 using the information saved in PTR. */
11912
11913 static void
11914 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11915 {
11916 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11917 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11918 opts->x_explicit_arch = ptr->x_explicit_arch;
11919 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11920 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11921 opts->x_aarch64_branch_protection_string
11922 = ptr->x_aarch64_branch_protection_string;
11923 if (opts->x_aarch64_branch_protection_string)
11924 {
11925 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
11926 NULL);
11927 }
11928
11929 aarch64_override_options_internal (opts);
11930 }
11931
11932 /* Implement TARGET_OPTION_PRINT. */
11933
11934 static void
11935 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11936 {
11937 const struct processor *cpu
11938 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11939 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11940 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11941 std::string extension
11942 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11943
11944 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11945 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11946 arch->name, extension.c_str ());
11947 }
11948
11949 static GTY(()) tree aarch64_previous_fndecl;
11950
11951 void
11952 aarch64_reset_previous_fndecl (void)
11953 {
11954 aarch64_previous_fndecl = NULL;
11955 }
11956
11957 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11958 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11959 make sure optab availability predicates are recomputed when necessary. */
11960
11961 void
11962 aarch64_save_restore_target_globals (tree new_tree)
11963 {
11964 if (TREE_TARGET_GLOBALS (new_tree))
11965 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11966 else if (new_tree == target_option_default_node)
11967 restore_target_globals (&default_target_globals);
11968 else
11969 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11970 }
11971
11972 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11973 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11974 of the function, if such exists. This function may be called multiple
11975 times on a single function so use aarch64_previous_fndecl to avoid
11976 setting up identical state. */
11977
11978 static void
11979 aarch64_set_current_function (tree fndecl)
11980 {
11981 if (!fndecl || fndecl == aarch64_previous_fndecl)
11982 return;
11983
11984 tree old_tree = (aarch64_previous_fndecl
11985 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11986 : NULL_TREE);
11987
11988 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11989
11990 /* If current function has no attributes but the previous one did,
11991 use the default node. */
11992 if (!new_tree && old_tree)
11993 new_tree = target_option_default_node;
11994
11995 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11996 the default have been handled by aarch64_save_restore_target_globals from
11997 aarch64_pragma_target_parse. */
11998 if (old_tree == new_tree)
11999 return;
12000
12001 aarch64_previous_fndecl = fndecl;
12002
12003 /* First set the target options. */
12004 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12005
12006 aarch64_save_restore_target_globals (new_tree);
12007 }
12008
12009 /* Enum describing the various ways we can handle attributes.
12010 In many cases we can reuse the generic option handling machinery. */
12011
12012 enum aarch64_attr_opt_type
12013 {
12014 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12015 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12016 aarch64_attr_enum, /* Attribute sets an enum variable. */
12017 aarch64_attr_custom /* Attribute requires a custom handling function. */
12018 };
12019
12020 /* All the information needed to handle a target attribute.
12021 NAME is the name of the attribute.
12022 ATTR_TYPE specifies the type of behavior of the attribute as described
12023 in the definition of enum aarch64_attr_opt_type.
12024 ALLOW_NEG is true if the attribute supports a "no-" form.
12025 HANDLER is the function that takes the attribute string as an argument.
12026 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12027 OPT_NUM is the enum specifying the option that the attribute modifies.
12028 This is needed for attributes that mirror the behavior of a command-line
12029 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12030 aarch64_attr_enum. */
12031
12032 struct aarch64_attribute_info
12033 {
12034 const char *name;
12035 enum aarch64_attr_opt_type attr_type;
12036 bool allow_neg;
12037 bool (*handler) (const char *);
12038 enum opt_code opt_num;
12039 };
12040
12041 /* Handle the ARCH_STR argument to the arch= target attribute. */
12042
12043 static bool
12044 aarch64_handle_attr_arch (const char *str)
12045 {
12046 const struct processor *tmp_arch = NULL;
12047 std::string invalid_extension;
12048 enum aarch64_parse_opt_result parse_res
12049 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12050
12051 if (parse_res == AARCH64_PARSE_OK)
12052 {
12053 gcc_assert (tmp_arch);
12054 selected_arch = tmp_arch;
12055 explicit_arch = selected_arch->arch;
12056 return true;
12057 }
12058
12059 switch (parse_res)
12060 {
12061 case AARCH64_PARSE_MISSING_ARG:
12062 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12063 break;
12064 case AARCH64_PARSE_INVALID_ARG:
12065 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12066 aarch64_print_hint_for_arch (str);
12067 break;
12068 case AARCH64_PARSE_INVALID_FEATURE:
12069 error ("invalid feature modifier %s of value (\"%s\") in "
12070 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12071 aarch64_print_hint_for_extensions (invalid_extension);
12072 break;
12073 default:
12074 gcc_unreachable ();
12075 }
12076
12077 return false;
12078 }
12079
12080 /* Handle the argument CPU_STR to the cpu= target attribute. */
12081
12082 static bool
12083 aarch64_handle_attr_cpu (const char *str)
12084 {
12085 const struct processor *tmp_cpu = NULL;
12086 std::string invalid_extension;
12087 enum aarch64_parse_opt_result parse_res
12088 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12089
12090 if (parse_res == AARCH64_PARSE_OK)
12091 {
12092 gcc_assert (tmp_cpu);
12093 selected_tune = tmp_cpu;
12094 explicit_tune_core = selected_tune->ident;
12095
12096 selected_arch = &all_architectures[tmp_cpu->arch];
12097 explicit_arch = selected_arch->arch;
12098 return true;
12099 }
12100
12101 switch (parse_res)
12102 {
12103 case AARCH64_PARSE_MISSING_ARG:
12104 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12105 break;
12106 case AARCH64_PARSE_INVALID_ARG:
12107 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12108 aarch64_print_hint_for_core (str);
12109 break;
12110 case AARCH64_PARSE_INVALID_FEATURE:
12111 error ("invalid feature modifier %s of value (\"%s\") in "
12112 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12113 aarch64_print_hint_for_extensions (invalid_extension);
12114 break;
12115 default:
12116 gcc_unreachable ();
12117 }
12118
12119 return false;
12120 }
12121
12122 /* Handle the argument STR to the branch-protection= attribute. */
12123
12124 static bool
12125 aarch64_handle_attr_branch_protection (const char* str)
12126 {
12127 char *err_str = (char *) xmalloc (strlen (str) + 1);
12128 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12129 &err_str);
12130 bool success = false;
12131 switch (res)
12132 {
12133 case AARCH64_PARSE_MISSING_ARG:
12134 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12135 " attribute");
12136 break;
12137 case AARCH64_PARSE_INVALID_ARG:
12138 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12139 "=\")%> pragma or attribute", err_str);
12140 break;
12141 case AARCH64_PARSE_OK:
12142 success = true;
12143 /* Fall through. */
12144 case AARCH64_PARSE_INVALID_FEATURE:
12145 break;
12146 default:
12147 gcc_unreachable ();
12148 }
12149 free (err_str);
12150 return success;
12151 }
12152
12153 /* Handle the argument STR to the tune= target attribute. */
12154
12155 static bool
12156 aarch64_handle_attr_tune (const char *str)
12157 {
12158 const struct processor *tmp_tune = NULL;
12159 enum aarch64_parse_opt_result parse_res
12160 = aarch64_parse_tune (str, &tmp_tune);
12161
12162 if (parse_res == AARCH64_PARSE_OK)
12163 {
12164 gcc_assert (tmp_tune);
12165 selected_tune = tmp_tune;
12166 explicit_tune_core = selected_tune->ident;
12167 return true;
12168 }
12169
12170 switch (parse_res)
12171 {
12172 case AARCH64_PARSE_INVALID_ARG:
12173 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12174 aarch64_print_hint_for_core (str);
12175 break;
12176 default:
12177 gcc_unreachable ();
12178 }
12179
12180 return false;
12181 }
12182
12183 /* Parse an architecture extensions target attribute string specified in STR.
12184 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12185 if successful. Update aarch64_isa_flags to reflect the ISA features
12186 modified. */
12187
12188 static bool
12189 aarch64_handle_attr_isa_flags (char *str)
12190 {
12191 enum aarch64_parse_opt_result parse_res;
12192 unsigned long isa_flags = aarch64_isa_flags;
12193
12194 /* We allow "+nothing" in the beginning to clear out all architectural
12195 features if the user wants to handpick specific features. */
12196 if (strncmp ("+nothing", str, 8) == 0)
12197 {
12198 isa_flags = 0;
12199 str += 8;
12200 }
12201
12202 std::string invalid_extension;
12203 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12204
12205 if (parse_res == AARCH64_PARSE_OK)
12206 {
12207 aarch64_isa_flags = isa_flags;
12208 return true;
12209 }
12210
12211 switch (parse_res)
12212 {
12213 case AARCH64_PARSE_MISSING_ARG:
12214 error ("missing value in %<target()%> pragma or attribute");
12215 break;
12216
12217 case AARCH64_PARSE_INVALID_FEATURE:
12218 error ("invalid feature modifier %s of value (\"%s\") in "
12219 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12220 break;
12221
12222 default:
12223 gcc_unreachable ();
12224 }
12225
12226 return false;
12227 }
12228
12229 /* The target attributes that we support. On top of these we also support just
12230 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12231 handled explicitly in aarch64_process_one_target_attr. */
12232
12233 static const struct aarch64_attribute_info aarch64_attributes[] =
12234 {
12235 { "general-regs-only", aarch64_attr_mask, false, NULL,
12236 OPT_mgeneral_regs_only },
12237 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12238 OPT_mfix_cortex_a53_835769 },
12239 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12240 OPT_mfix_cortex_a53_843419 },
12241 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12242 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12243 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12244 OPT_momit_leaf_frame_pointer },
12245 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12246 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12247 OPT_march_ },
12248 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12249 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12250 OPT_mtune_ },
12251 { "branch-protection", aarch64_attr_custom, false,
12252 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12253 { "sign-return-address", aarch64_attr_enum, false, NULL,
12254 OPT_msign_return_address_ },
12255 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12256 };
12257
12258 /* Parse ARG_STR which contains the definition of one target attribute.
12259 Show appropriate errors if any or return true if the attribute is valid. */
12260
12261 static bool
12262 aarch64_process_one_target_attr (char *arg_str)
12263 {
12264 bool invert = false;
12265
12266 size_t len = strlen (arg_str);
12267
12268 if (len == 0)
12269 {
12270 error ("malformed %<target()%> pragma or attribute");
12271 return false;
12272 }
12273
12274 char *str_to_check = (char *) alloca (len + 1);
12275 strcpy (str_to_check, arg_str);
12276
12277 /* Skip leading whitespace. */
12278 while (*str_to_check == ' ' || *str_to_check == '\t')
12279 str_to_check++;
12280
12281 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12282 It is easier to detect and handle it explicitly here rather than going
12283 through the machinery for the rest of the target attributes in this
12284 function. */
12285 if (*str_to_check == '+')
12286 return aarch64_handle_attr_isa_flags (str_to_check);
12287
12288 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12289 {
12290 invert = true;
12291 str_to_check += 3;
12292 }
12293 char *arg = strchr (str_to_check, '=');
12294
12295 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12296 and point ARG to "foo". */
12297 if (arg)
12298 {
12299 *arg = '\0';
12300 arg++;
12301 }
12302 const struct aarch64_attribute_info *p_attr;
12303 bool found = false;
12304 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12305 {
12306 /* If the names don't match up, or the user has given an argument
12307 to an attribute that doesn't accept one, or didn't give an argument
12308 to an attribute that expects one, fail to match. */
12309 if (strcmp (str_to_check, p_attr->name) != 0)
12310 continue;
12311
12312 found = true;
12313 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12314 || p_attr->attr_type == aarch64_attr_enum;
12315
12316 if (attr_need_arg_p ^ (arg != NULL))
12317 {
12318 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12319 return false;
12320 }
12321
12322 /* If the name matches but the attribute does not allow "no-" versions
12323 then we can't match. */
12324 if (invert && !p_attr->allow_neg)
12325 {
12326 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12327 return false;
12328 }
12329
12330 switch (p_attr->attr_type)
12331 {
12332 /* Has a custom handler registered.
12333 For example, cpu=, arch=, tune=. */
12334 case aarch64_attr_custom:
12335 gcc_assert (p_attr->handler);
12336 if (!p_attr->handler (arg))
12337 return false;
12338 break;
12339
12340 /* Either set or unset a boolean option. */
12341 case aarch64_attr_bool:
12342 {
12343 struct cl_decoded_option decoded;
12344
12345 generate_option (p_attr->opt_num, NULL, !invert,
12346 CL_TARGET, &decoded);
12347 aarch64_handle_option (&global_options, &global_options_set,
12348 &decoded, input_location);
12349 break;
12350 }
12351 /* Set or unset a bit in the target_flags. aarch64_handle_option
12352 should know what mask to apply given the option number. */
12353 case aarch64_attr_mask:
12354 {
12355 struct cl_decoded_option decoded;
12356 /* We only need to specify the option number.
12357 aarch64_handle_option will know which mask to apply. */
12358 decoded.opt_index = p_attr->opt_num;
12359 decoded.value = !invert;
12360 aarch64_handle_option (&global_options, &global_options_set,
12361 &decoded, input_location);
12362 break;
12363 }
12364 /* Use the option setting machinery to set an option to an enum. */
12365 case aarch64_attr_enum:
12366 {
12367 gcc_assert (arg);
12368 bool valid;
12369 int value;
12370 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12371 &value, CL_TARGET);
12372 if (valid)
12373 {
12374 set_option (&global_options, NULL, p_attr->opt_num, value,
12375 NULL, DK_UNSPECIFIED, input_location,
12376 global_dc);
12377 }
12378 else
12379 {
12380 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12381 }
12382 break;
12383 }
12384 default:
12385 gcc_unreachable ();
12386 }
12387 }
12388
12389 /* If we reached here we either have found an attribute and validated
12390 it or didn't match any. If we matched an attribute but its arguments
12391 were malformed we will have returned false already. */
12392 return found;
12393 }
12394
12395 /* Count how many times the character C appears in
12396 NULL-terminated string STR. */
12397
12398 static unsigned int
12399 num_occurences_in_str (char c, char *str)
12400 {
12401 unsigned int res = 0;
12402 while (*str != '\0')
12403 {
12404 if (*str == c)
12405 res++;
12406
12407 str++;
12408 }
12409
12410 return res;
12411 }
12412
12413 /* Parse the tree in ARGS that contains the target attribute information
12414 and update the global target options space. */
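/* For example, __attribute__ ((target ("arch=armv8.2-a,no-strict-align")))
   (illustrative values) arrives here as a STRING_CST; it is split on ','
   and each piece is handled by aarch64_process_one_target_attr.  */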
12415
12416 bool
12417 aarch64_process_target_attr (tree args)
12418 {
12419 if (TREE_CODE (args) == TREE_LIST)
12420 {
12421 do
12422 {
12423 tree head = TREE_VALUE (args);
12424 if (head)
12425 {
12426 if (!aarch64_process_target_attr (head))
12427 return false;
12428 }
12429 args = TREE_CHAIN (args);
12430 } while (args);
12431
12432 return true;
12433 }
12434
12435 if (TREE_CODE (args) != STRING_CST)
12436 {
12437 error ("attribute %<target%> argument not a string");
12438 return false;
12439 }
12440
12441 size_t len = strlen (TREE_STRING_POINTER (args));
12442 char *str_to_check = (char *) alloca (len + 1);
12443 strcpy (str_to_check, TREE_STRING_POINTER (args));
12444
12445 if (len == 0)
12446 {
12447 error ("malformed %<target()%> pragma or attribute");
12448 return false;
12449 }
12450
12451 /* Used to catch empty strings between commas, i.e.
12452 attribute ((target ("attr1,,attr2"))). */
12453 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12454
12455 /* Handle multiple target attributes separated by ','. */
12456 char *token = strtok_r (str_to_check, ",", &str_to_check);
12457
12458 unsigned int num_attrs = 0;
12459 while (token)
12460 {
12461 num_attrs++;
12462 if (!aarch64_process_one_target_attr (token))
12463 {
12464 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12465 return false;
12466 }
12467
12468 token = strtok_r (NULL, ",", &str_to_check);
12469 }
12470
12471 if (num_attrs != num_commas + 1)
12472 {
12473 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12474 return false;
12475 }
12476
12477 return true;
12478 }
12479
12480 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12481 process attribute ((target ("..."))). */
12482
12483 static bool
12484 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12485 {
12486 struct cl_target_option cur_target;
12487 bool ret;
12488 tree old_optimize;
12489 tree new_target, new_optimize;
12490 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12491
12492 /* If what we're processing is the current pragma string then the
12493 target option node is already stored in target_option_current_node
12494 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12495 having to re-parse the string. This is especially useful to keep
12496 arm_neon.h compile times down since that header contains a lot
12497 of intrinsics enclosed in pragmas. */
12498 if (!existing_target && args == current_target_pragma)
12499 {
12500 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12501 return true;
12502 }
12503 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12504
12505 old_optimize = build_optimization_node (&global_options);
12506 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12507
12508 /* If the function changed the optimization levels as well as setting
12509 target options, start with the optimizations specified. */
12510 if (func_optimize && func_optimize != old_optimize)
12511 cl_optimization_restore (&global_options,
12512 TREE_OPTIMIZATION (func_optimize));
12513
12514 /* Save the current target options to restore at the end. */
12515 cl_target_option_save (&cur_target, &global_options);
12516
12517 /* If fndecl already has some target attributes applied to it, unpack
12518 them so that we add this attribute on top of them, rather than
12519 overwriting them. */
12520 if (existing_target)
12521 {
12522 struct cl_target_option *existing_options
12523 = TREE_TARGET_OPTION (existing_target);
12524
12525 if (existing_options)
12526 cl_target_option_restore (&global_options, existing_options);
12527 }
12528 else
12529 cl_target_option_restore (&global_options,
12530 TREE_TARGET_OPTION (target_option_current_node));
12531
12532 ret = aarch64_process_target_attr (args);
12533
12534 /* Set up any additional state. */
12535 if (ret)
12536 {
12537 aarch64_override_options_internal (&global_options);
12538 /* Initialize SIMD builtins if we haven't already.
12539 Set current_target_pragma to NULL for the duration so that
12540 the builtin initialization code doesn't try to tag the functions
12541 being built with the attributes specified by any current pragma, thus
12542 going into an infinite recursion. */
12543 if (TARGET_SIMD)
12544 {
12545 tree saved_current_target_pragma = current_target_pragma;
12546 current_target_pragma = NULL;
12547 aarch64_init_simd_builtins ();
12548 current_target_pragma = saved_current_target_pragma;
12549 }
12550 new_target = build_target_option_node (&global_options);
12551 }
12552 else
12553 new_target = NULL;
12554
12555 new_optimize = build_optimization_node (&global_options);
12556
12557 if (fndecl && ret)
12558 {
12559 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12560
12561 if (old_optimize != new_optimize)
12562 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12563 }
12564
12565 cl_target_option_restore (&global_options, &cur_target);
12566
12567 if (old_optimize != new_optimize)
12568 cl_optimization_restore (&global_options,
12569 TREE_OPTIMIZATION (old_optimize));
12570 return ret;
12571 }
12572
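/* Illustrative sketch of the fast path above.  A header wrapped in

     #pragma GCC push_options
     #pragma GCC target ("+simd")
     ...intrinsic definitions...
     #pragma GCC pop_options

   typically makes the enclosed functions hit the args == current_target_pragma
   case, so the already-built target_option_current_node is reused instead of
   re-parsing the string for each declaration (the arm_neon.h situation
   mentioned above).  */
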
12573 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12574 tri-bool options (yes, no, don't care) and the default value is
12575 DEF, determine whether to reject inlining. */
12576
12577 static bool
12578 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12579 int dont_care, int def)
12580 {
12581 /* If the callee doesn't care, always allow inlining. */
12582 if (callee == dont_care)
12583 return true;
12584
12585 /* If the caller doesn't care, always allow inlining. */
12586 if (caller == dont_care)
12587 return true;
12588
12589 /* Otherwise, allow inlining if either the callee and caller values
12590 agree, or if the callee is using the default value. */
12591 return (callee == caller || callee == def);
12592 }
12593
12594 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12595 to inline CALLEE into CALLER based on target-specific info.
12596 Make sure that the caller and callee have compatible architectural
12597 features. Then go through the other possible target attributes
12598 and see if they can block inlining. Try not to reject always_inline
12599 callees unless they are incompatible architecturally. */
12600
12601 static bool
12602 aarch64_can_inline_p (tree caller, tree callee)
12603 {
12604 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12605 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12606
12607 struct cl_target_option *caller_opts
12608 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12609 : target_option_default_node);
12610
12611 struct cl_target_option *callee_opts
12612 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12613 : target_option_default_node);
12614
12615 /* Callee's ISA flags should be a subset of the caller's. */
12616 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12617 != callee_opts->x_aarch64_isa_flags)
12618 return false;
12619
12620 /* Allow non-strict aligned functions inlining into strict
12621 aligned ones. */
12622 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12623 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12624 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12625 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12626 return false;
12627
12628 bool always_inline = lookup_attribute ("always_inline",
12629 DECL_ATTRIBUTES (callee));
12630
12631 /* If the architectural features match up and the callee is always_inline
12632 then the other attributes don't matter. */
12633 if (always_inline)
12634 return true;
12635
12636 if (caller_opts->x_aarch64_cmodel_var
12637 != callee_opts->x_aarch64_cmodel_var)
12638 return false;
12639
12640 if (caller_opts->x_aarch64_tls_dialect
12641 != callee_opts->x_aarch64_tls_dialect)
12642 return false;
12643
12644 /* Honour explicit requests to work around errata. */
12645 if (!aarch64_tribools_ok_for_inlining_p (
12646 caller_opts->x_aarch64_fix_a53_err835769,
12647 callee_opts->x_aarch64_fix_a53_err835769,
12648 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12649 return false;
12650
12651 if (!aarch64_tribools_ok_for_inlining_p (
12652 caller_opts->x_aarch64_fix_a53_err843419,
12653 callee_opts->x_aarch64_fix_a53_err843419,
12654 2, TARGET_FIX_ERR_A53_843419))
12655 return false;
12656
12657 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12658 caller and callee and they don't match up, reject inlining. */
12659 if (!aarch64_tribools_ok_for_inlining_p (
12660 caller_opts->x_flag_omit_leaf_frame_pointer,
12661 callee_opts->x_flag_omit_leaf_frame_pointer,
12662 2, 1))
12663 return false;
12664
12665 /* If the callee has specific tuning overrides, respect them. */
12666 if (callee_opts->x_aarch64_override_tune_string != NULL
12667 && caller_opts->x_aarch64_override_tune_string == NULL)
12668 return false;
12669
12670 /* If the user specified tuning override strings for the
12671 caller and callee and they don't match up, reject inlining.
12672 We just do a string compare here, we don't analyze the meaning
12673 of the string, as it would be too costly for little gain. */
12674 if (callee_opts->x_aarch64_override_tune_string
12675 && caller_opts->x_aarch64_override_tune_string
12676 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12677 caller_opts->x_aarch64_override_tune_string) != 0))
12678 return false;
12679
12680 return true;
12681 }
12682
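/* Example of the ISA-subset rule above (illustrative; the function names
   are hypothetical):

     __attribute__ ((target ("+crypto")))
     static inline int crypto_callee (int x) { return x; }

     int plain_caller (int x) { return crypto_callee (x); }

   Unless plain_caller (or the command line) also enables +crypto, the
   callee's ISA flags are not a subset of the caller's and inlining is
   refused; with always_inline on the callee this is reported as an error
   elsewhere in the compiler.  */
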
12683 /* Return true if SYMBOL_REF X binds locally. */
12684
12685 static bool
12686 aarch64_symbol_binds_local_p (const_rtx x)
12687 {
12688 return (SYMBOL_REF_DECL (x)
12689 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12690 : SYMBOL_REF_LOCAL_P (x));
12691 }
12692
12693 /* Return true if SYMBOL_REF X is thread-local. */
12694 static bool
12695 aarch64_tls_symbol_p (rtx x)
12696 {
12697 if (! TARGET_HAVE_TLS)
12698 return false;
12699
12700 if (GET_CODE (x) != SYMBOL_REF)
12701 return false;
12702
12703 return SYMBOL_REF_TLS_MODEL (x) != 0;
12704 }
12705
12706 /* Classify a TLS symbol into one of the TLS kinds. */
12707 enum aarch64_symbol_type
12708 aarch64_classify_tls_symbol (rtx x)
12709 {
12710 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12711
12712 switch (tls_kind)
12713 {
12714 case TLS_MODEL_GLOBAL_DYNAMIC:
12715 case TLS_MODEL_LOCAL_DYNAMIC:
12716 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12717
12718 case TLS_MODEL_INITIAL_EXEC:
12719 switch (aarch64_cmodel)
12720 {
12721 case AARCH64_CMODEL_TINY:
12722 case AARCH64_CMODEL_TINY_PIC:
12723 return SYMBOL_TINY_TLSIE;
12724 default:
12725 return SYMBOL_SMALL_TLSIE;
12726 }
12727
12728 case TLS_MODEL_LOCAL_EXEC:
12729 if (aarch64_tls_size == 12)
12730 return SYMBOL_TLSLE12;
12731 else if (aarch64_tls_size == 24)
12732 return SYMBOL_TLSLE24;
12733 else if (aarch64_tls_size == 32)
12734 return SYMBOL_TLSLE32;
12735 else if (aarch64_tls_size == 48)
12736 return SYMBOL_TLSLE48;
12737 else
12738 gcc_unreachable ();
12739
12740 case TLS_MODEL_EMULATED:
12741 case TLS_MODEL_NONE:
12742 return SYMBOL_FORCE_TO_MEM;
12743
12744 default:
12745 gcc_unreachable ();
12746 }
12747 }
12748
12749 /* Return the correct method for accessing X + OFFSET, where X is either
12750 a SYMBOL_REF or LABEL_REF. */
12751
12752 enum aarch64_symbol_type
12753 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12754 {
12755 if (GET_CODE (x) == LABEL_REF)
12756 {
12757 switch (aarch64_cmodel)
12758 {
12759 case AARCH64_CMODEL_LARGE:
12760 return SYMBOL_FORCE_TO_MEM;
12761
12762 case AARCH64_CMODEL_TINY_PIC:
12763 case AARCH64_CMODEL_TINY:
12764 return SYMBOL_TINY_ABSOLUTE;
12765
12766 case AARCH64_CMODEL_SMALL_SPIC:
12767 case AARCH64_CMODEL_SMALL_PIC:
12768 case AARCH64_CMODEL_SMALL:
12769 return SYMBOL_SMALL_ABSOLUTE;
12770
12771 default:
12772 gcc_unreachable ();
12773 }
12774 }
12775
12776 if (GET_CODE (x) == SYMBOL_REF)
12777 {
12778 if (aarch64_tls_symbol_p (x))
12779 return aarch64_classify_tls_symbol (x);
12780
12781 switch (aarch64_cmodel)
12782 {
12783 case AARCH64_CMODEL_TINY:
12784 /* When we retrieve symbol + offset address, we have to make sure
12785 the offset does not cause overflow of the final address. But
12786 we have no way of knowing the address of symbol at compile time
12787 so we can't accurately say if the distance between the PC and
12788 symbol + offset is outside the addressable range of +/-1M in the
12789 TINY code model. So we rely on images not being larger than 1M,
12790 cap the offset at 1M, and require anything beyond that to be
12791 loaded by an alternative mechanism. Furthermore, if the
12792 symbol is a weak reference to something that isn't known to
12793 resolve to a symbol in this module, then force to memory. */
12794 if ((SYMBOL_REF_WEAK (x)
12795 && !aarch64_symbol_binds_local_p (x))
12796 || !IN_RANGE (offset, -1048575, 1048575))
12797 return SYMBOL_FORCE_TO_MEM;
12798 return SYMBOL_TINY_ABSOLUTE;
12799
12800 case AARCH64_CMODEL_SMALL:
12801 /* Same reasoning as the tiny code model, but the offset cap here is
12802 4G. */
12803 if ((SYMBOL_REF_WEAK (x)
12804 && !aarch64_symbol_binds_local_p (x))
12805 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12806 HOST_WIDE_INT_C (4294967264)))
12807 return SYMBOL_FORCE_TO_MEM;
12808 return SYMBOL_SMALL_ABSOLUTE;
12809
12810 case AARCH64_CMODEL_TINY_PIC:
12811 if (!aarch64_symbol_binds_local_p (x))
12812 return SYMBOL_TINY_GOT;
12813 return SYMBOL_TINY_ABSOLUTE;
12814
12815 case AARCH64_CMODEL_SMALL_SPIC:
12816 case AARCH64_CMODEL_SMALL_PIC:
12817 if (!aarch64_symbol_binds_local_p (x))
12818 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
12819 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
12820 return SYMBOL_SMALL_ABSOLUTE;
12821
12822 case AARCH64_CMODEL_LARGE:
12823 /* This is alright even in PIC code as the constant
12824 pool reference is always PC relative and within
12825 the same translation unit. */
12826 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
12827 return SYMBOL_SMALL_ABSOLUTE;
12828 else
12829 return SYMBOL_FORCE_TO_MEM;
12830
12831 default:
12832 gcc_unreachable ();
12833 }
12834 }
12835
12836 /* By default push everything into the constant pool. */
12837 return SYMBOL_FORCE_TO_MEM;
12838 }
12839
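/* Worked example of the offset capping above (illustrative): with
   -mcmodel=tiny, a load such as

     extern char buf[];
     char get (void) { return buf[4 << 20]; }

   involves a symbol + 4MiB offset, which is outside the +/-1M cap, so the
   address is classified as SYMBOL_FORCE_TO_MEM and materialised from the
   literal pool rather than formed with a single ADR against the symbol.  */
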
12840 bool
12841 aarch64_constant_address_p (rtx x)
12842 {
12843 return (CONSTANT_P (x) && memory_address_p (DImode, x));
12844 }
12845
12846 bool
12847 aarch64_legitimate_pic_operand_p (rtx x)
12848 {
12849 if (GET_CODE (x) == SYMBOL_REF
12850 || (GET_CODE (x) == CONST
12851 && GET_CODE (XEXP (x, 0)) == PLUS
12852 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
12853 return false;
12854
12855 return true;
12856 }
12857
12858 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
12859 that should be rematerialized rather than spilled. */
12860
12861 static bool
12862 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12863 {
12864 /* Support CSE and rematerialization of common constants. */
12865 if (CONST_INT_P (x)
12866 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12867 || GET_CODE (x) == CONST_VECTOR)
12868 return true;
12869
12870 /* Do not allow vector struct mode constants for Advanced SIMD.
12871 We could support 0 and -1 easily, but they need support in
12872 aarch64-simd.md. */
12873 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12874 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12875 return false;
12876
12877 /* Only accept variable-length vector constants if they can be
12878 handled directly.
12879
12880 ??? It would be possible to handle rematerialization of other
12881 constants via secondary reloads. */
12882 if (vec_flags & VEC_ANY_SVE)
12883 return aarch64_simd_valid_immediate (x, NULL);
12884
12885 if (GET_CODE (x) == HIGH)
12886 x = XEXP (x, 0);
12887
12888 /* Accept polynomial constants that can be calculated by using the
12889 destination of a move as the sole temporary. Constants that
12890 require a second temporary cannot be rematerialized (they can't be
12891 forced to memory and also aren't legitimate constants). */
12892 poly_int64 offset;
12893 if (poly_int_rtx_p (x, &offset))
12894 return aarch64_offset_temporaries (false, offset) <= 1;
12895
12896 /* If an offset is being added to something else, we need to allow the
12897 base to be moved into the destination register, meaning that there
12898 are no free temporaries for the offset. */
12899 x = strip_offset (x, &offset);
12900 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12901 return false;
12902
12903 /* Do not allow const (plus (anchor_symbol, const_int)). */
12904 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12905 return false;
12906
12907 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12908 so spilling them is better than rematerialization. */
12909 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12910 return true;
12911
12912 /* Label references are always constant. */
12913 if (GET_CODE (x) == LABEL_REF)
12914 return true;
12915
12916 return false;
12917 }
12918
12919 rtx
12920 aarch64_load_tp (rtx target)
12921 {
12922 if (!target
12923 || GET_MODE (target) != Pmode
12924 || !register_operand (target, Pmode))
12925 target = gen_reg_rtx (Pmode);
12926
12927 /* Can return in any reg. */
12928 emit_insn (gen_aarch64_load_tp_hard (target));
12929 return target;
12930 }
12931
12932 /* On AAPCS systems, this is the "struct __va_list". */
12933 static GTY(()) tree va_list_type;
12934
12935 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12936 Return the type to use as __builtin_va_list.
12937
12938 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12939
12940 struct __va_list
12941 {
12942 void *__stack;
12943 void *__gr_top;
12944 void *__vr_top;
12945 int __gr_offs;
12946 int __vr_offs;
12947 }; */
12948
12949 static tree
12950 aarch64_build_builtin_va_list (void)
12951 {
12952 tree va_list_name;
12953 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12954
12955 /* Create the type. */
12956 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12957 /* Give it the required name. */
12958 va_list_name = build_decl (BUILTINS_LOCATION,
12959 TYPE_DECL,
12960 get_identifier ("__va_list"),
12961 va_list_type);
12962 DECL_ARTIFICIAL (va_list_name) = 1;
12963 TYPE_NAME (va_list_type) = va_list_name;
12964 TYPE_STUB_DECL (va_list_type) = va_list_name;
12965
12966 /* Create the fields. */
12967 f_stack = build_decl (BUILTINS_LOCATION,
12968 FIELD_DECL, get_identifier ("__stack"),
12969 ptr_type_node);
12970 f_grtop = build_decl (BUILTINS_LOCATION,
12971 FIELD_DECL, get_identifier ("__gr_top"),
12972 ptr_type_node);
12973 f_vrtop = build_decl (BUILTINS_LOCATION,
12974 FIELD_DECL, get_identifier ("__vr_top"),
12975 ptr_type_node);
12976 f_groff = build_decl (BUILTINS_LOCATION,
12977 FIELD_DECL, get_identifier ("__gr_offs"),
12978 integer_type_node);
12979 f_vroff = build_decl (BUILTINS_LOCATION,
12980 FIELD_DECL, get_identifier ("__vr_offs"),
12981 integer_type_node);
12982
12983 /* Tell tree-stdarg pass about our internal offset fields.
12984 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12985 purposes, to identify whether the code updates the va_list internal
12986 offset fields in an irregular way. */
12987 va_list_gpr_counter_field = f_groff;
12988 va_list_fpr_counter_field = f_vroff;
12989
12990 DECL_ARTIFICIAL (f_stack) = 1;
12991 DECL_ARTIFICIAL (f_grtop) = 1;
12992 DECL_ARTIFICIAL (f_vrtop) = 1;
12993 DECL_ARTIFICIAL (f_groff) = 1;
12994 DECL_ARTIFICIAL (f_vroff) = 1;
12995
12996 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12997 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12998 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12999 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13000 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13001
13002 TYPE_FIELDS (va_list_type) = f_stack;
13003 DECL_CHAIN (f_stack) = f_grtop;
13004 DECL_CHAIN (f_grtop) = f_vrtop;
13005 DECL_CHAIN (f_vrtop) = f_groff;
13006 DECL_CHAIN (f_groff) = f_vroff;
13007
13008 /* Compute its layout. */
13009 layout_type (va_list_type);
13010
13011 return va_list_type;
13012 }
13013
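/* For reference (assuming LP64): the record built above is 32 bytes with
   8-byte alignment -- three pointers followed by two ints -- and is the
   type that a plain

     va_list ap;

   declares; its fields are filled in by aarch64_expand_builtin_va_start
   below.  */
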
13014 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13015 static void
13016 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13017 {
13018 const CUMULATIVE_ARGS *cum;
13019 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13020 tree stack, grtop, vrtop, groff, vroff;
13021 tree t;
13022 int gr_save_area_size = cfun->va_list_gpr_size;
13023 int vr_save_area_size = cfun->va_list_fpr_size;
13024 int vr_offset;
13025
13026 cum = &crtl->args.info;
13027 if (cfun->va_list_gpr_size)
13028 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13029 cfun->va_list_gpr_size);
13030 if (cfun->va_list_fpr_size)
13031 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13032 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13033
13034 if (!TARGET_FLOAT)
13035 {
13036 gcc_assert (cum->aapcs_nvrn == 0);
13037 vr_save_area_size = 0;
13038 }
13039
13040 f_stack = TYPE_FIELDS (va_list_type_node);
13041 f_grtop = DECL_CHAIN (f_stack);
13042 f_vrtop = DECL_CHAIN (f_grtop);
13043 f_groff = DECL_CHAIN (f_vrtop);
13044 f_vroff = DECL_CHAIN (f_groff);
13045
13046 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13047 NULL_TREE);
13048 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13049 NULL_TREE);
13050 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13051 NULL_TREE);
13052 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13053 NULL_TREE);
13054 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13055 NULL_TREE);
13056
13057 /* Emit code to initialize STACK, which points to the next varargs stack
13058 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13059 by named arguments. STACK is 8-byte aligned. */
13060 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13061 if (cum->aapcs_stack_size > 0)
13062 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13063 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13064 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13065
13066 /* Emit code to initialize GRTOP, the top of the GR save area.
13067 virtual_incoming_args_rtx should have been 16 byte aligned. */
13068 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13069 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13070 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13071
13072 /* Emit code to initialize VRTOP, the top of the VR save area.
13073 This address is gr_save_area_bytes below GRTOP, rounded
13074 down to the next 16-byte boundary. */
13075 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13076 vr_offset = ROUND_UP (gr_save_area_size,
13077 STACK_BOUNDARY / BITS_PER_UNIT);
13078
13079 if (vr_offset)
13080 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13081 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13082 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13083
13084 /* Emit code to initialize GROFF, the offset from GRTOP of the
13085 next GPR argument. */
13086 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13087 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13088 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13089
13090 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13091 of the next VR argument. */
13092 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13093 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13094 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13095 }
13096
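/* Layout sketch of what the code above initializes, relative to
   virtual_incoming_args_rtx (illustrative; sizes depend on how many named
   arguments already consumed registers):

       higher addresses
         | overflow (stack) arguments |  <- __stack
         +----------------------------+  <- virtual_incoming_args_rtx == __gr_top
         | x<ncrn> ... x7 save area   |     gr_save_area_size bytes
         +----------------------------+  <- __vr_top (16-byte aligned)
         | q<nvrn> ... q7 save area   |     vr_save_area_size bytes
         +----------------------------+
       lower addresses

   __gr_offs and __vr_offs start at minus the respective save-area sizes
   and grow towards zero as va_arg consumes register-passed arguments;
   the registers themselves are spilled by aarch64_setup_incoming_varargs.  */
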
13097 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13098
13099 static tree
13100 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13101 gimple_seq *post_p ATTRIBUTE_UNUSED)
13102 {
13103 tree addr;
13104 bool indirect_p;
13105 bool is_ha; /* is HFA or HVA. */
13106 bool dw_align; /* double-word align. */
13107 machine_mode ag_mode = VOIDmode;
13108 int nregs;
13109 machine_mode mode;
13110
13111 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13112 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13113 HOST_WIDE_INT size, rsize, adjust, align;
13114 tree t, u, cond1, cond2;
13115
13116 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13117 if (indirect_p)
13118 type = build_pointer_type (type);
13119
13120 mode = TYPE_MODE (type);
13121
13122 f_stack = TYPE_FIELDS (va_list_type_node);
13123 f_grtop = DECL_CHAIN (f_stack);
13124 f_vrtop = DECL_CHAIN (f_grtop);
13125 f_groff = DECL_CHAIN (f_vrtop);
13126 f_vroff = DECL_CHAIN (f_groff);
13127
13128 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13129 f_stack, NULL_TREE);
13130 size = int_size_in_bytes (type);
13131 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13132
13133 dw_align = false;
13134 adjust = 0;
13135 if (aarch64_vfp_is_call_or_return_candidate (mode,
13136 type,
13137 &ag_mode,
13138 &nregs,
13139 &is_ha))
13140 {
13141 /* No frontends can create types with variable-sized modes, so we
13142 shouldn't be asked to pass or return them. */
13143 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13144
13145 /* TYPE passed in fp/simd registers. */
13146 if (!TARGET_FLOAT)
13147 aarch64_err_no_fpadvsimd (mode);
13148
13149 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13150 unshare_expr (valist), f_vrtop, NULL_TREE);
13151 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13152 unshare_expr (valist), f_vroff, NULL_TREE);
13153
13154 rsize = nregs * UNITS_PER_VREG;
13155
13156 if (is_ha)
13157 {
13158 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13159 adjust = UNITS_PER_VREG - ag_size;
13160 }
13161 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13162 && size < UNITS_PER_VREG)
13163 {
13164 adjust = UNITS_PER_VREG - size;
13165 }
13166 }
13167 else
13168 {
13169 /* TYPE passed in general registers. */
13170 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13171 unshare_expr (valist), f_grtop, NULL_TREE);
13172 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13173 unshare_expr (valist), f_groff, NULL_TREE);
13174 rsize = ROUND_UP (size, UNITS_PER_WORD);
13175 nregs = rsize / UNITS_PER_WORD;
13176
13177 if (align > 8)
13178 dw_align = true;
13179
13180 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13181 && size < UNITS_PER_WORD)
13182 {
13183 adjust = UNITS_PER_WORD - size;
13184 }
13185 }
13186
13187 /* Get a local temporary for the field value. */
13188 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13189
13190 /* Emit code to branch if off >= 0. */
13191 t = build2 (GE_EXPR, boolean_type_node, off,
13192 build_int_cst (TREE_TYPE (off), 0));
13193 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13194
13195 if (dw_align)
13196 {
13197 /* Emit: offs = (offs + 15) & -16. */
13198 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13199 build_int_cst (TREE_TYPE (off), 15));
13200 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13201 build_int_cst (TREE_TYPE (off), -16));
13202 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13203 }
13204 else
13205 roundup = NULL;
13206
13207 /* Update ap.__[g|v]r_offs */
13208 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13209 build_int_cst (TREE_TYPE (off), rsize));
13210 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13211
13212 /* String up. */
13213 if (roundup)
13214 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13215
13216 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13217 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13218 build_int_cst (TREE_TYPE (f_off), 0));
13219 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13220
13221 /* String up: make sure the assignment happens before the use. */
13222 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13223 COND_EXPR_ELSE (cond1) = t;
13224
13225 /* Prepare the trees handling the argument that is passed on the stack;
13226 the top-level node will be stored in ON_STACK. */
13227 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13228 if (align > 8)
13229 {
13230 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13231 t = fold_build_pointer_plus_hwi (arg, 15);
13232 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13233 build_int_cst (TREE_TYPE (t), -16));
13234 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13235 }
13236 else
13237 roundup = NULL;
13238 /* Advance ap.__stack */
13239 t = fold_build_pointer_plus_hwi (arg, size + 7);
13240 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13241 build_int_cst (TREE_TYPE (t), -8));
13242 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13243 /* String up roundup and advance. */
13244 if (roundup)
13245 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13246 /* String up with arg */
13247 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13248 /* Big-endianness related address adjustment. */
13249 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13250 && size < UNITS_PER_WORD)
13251 {
13252 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13253 size_int (UNITS_PER_WORD - size));
13254 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13255 }
13256
13257 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13258 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13259
13260 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13261 t = off;
13262 if (adjust)
13263 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13264 build_int_cst (TREE_TYPE (off), adjust));
13265
13266 t = fold_convert (sizetype, t);
13267 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13268
13269 if (is_ha)
13270 {
13271 /* type ha; // treat as "struct {ftype field[n];}"
13272 ... [computing offs]
13273 for (i = 0; i <nregs; ++i, offs += 16)
13274 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13275 return ha; */
13276 int i;
13277 tree tmp_ha, field_t, field_ptr_t;
13278
13279 /* Declare a local variable. */
13280 tmp_ha = create_tmp_var_raw (type, "ha");
13281 gimple_add_tmp_var (tmp_ha);
13282
13283 /* Establish the base type. */
13284 switch (ag_mode)
13285 {
13286 case E_SFmode:
13287 field_t = float_type_node;
13288 field_ptr_t = float_ptr_type_node;
13289 break;
13290 case E_DFmode:
13291 field_t = double_type_node;
13292 field_ptr_t = double_ptr_type_node;
13293 break;
13294 case E_TFmode:
13295 field_t = long_double_type_node;
13296 field_ptr_t = long_double_ptr_type_node;
13297 break;
13298 case E_HFmode:
13299 field_t = aarch64_fp16_type_node;
13300 field_ptr_t = aarch64_fp16_ptr_type_node;
13301 break;
13302 case E_V2SImode:
13303 case E_V4SImode:
13304 {
13305 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13306 field_t = build_vector_type_for_mode (innertype, ag_mode);
13307 field_ptr_t = build_pointer_type (field_t);
13308 }
13309 break;
13310 default:
13311 gcc_assert (0);
13312 }
13313
13314 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13315 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13316 addr = t;
13317 t = fold_convert (field_ptr_t, addr);
13318 t = build2 (MODIFY_EXPR, field_t,
13319 build1 (INDIRECT_REF, field_t, tmp_ha),
13320 build1 (INDIRECT_REF, field_t, t));
13321
13322 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13323 for (i = 1; i < nregs; ++i)
13324 {
13325 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13326 u = fold_convert (field_ptr_t, addr);
13327 u = build2 (MODIFY_EXPR, field_t,
13328 build2 (MEM_REF, field_t, tmp_ha,
13329 build_int_cst (field_ptr_t,
13330 (i *
13331 int_size_in_bytes (field_t)))),
13332 build1 (INDIRECT_REF, field_t, u));
13333 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13334 }
13335
13336 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13337 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13338 }
13339
13340 COND_EXPR_ELSE (cond2) = t;
13341 addr = fold_convert (build_pointer_type (type), cond1);
13342 addr = build_va_arg_indirect_ref (addr);
13343
13344 if (indirect_p)
13345 addr = build_va_arg_indirect_ref (addr);
13346
13347 return addr;
13348 }
13349
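/* Rough shape of the tree built above for a simple `va_arg (ap, int)'
   (illustrative pseudo-C, little-endian, general-register case):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                        // register area exhausted earlier
     ap.__gr_offs = off + 8;
     if (ap.__gr_offs > 0)
       goto on_stack;                        // this argument overflowed
     result = *(int *) (ap.__gr_top + off);  // read from the GR save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = addr + 8;
     result = *(int *) addr;
   done: ;

   The homogeneous-aggregate path additionally copies each element out of
   the VR save area into a local temporary, as the is_ha code above shows.  */
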
13350 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13351
13352 static void
13353 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13354 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13355 int no_rtl)
13356 {
13357 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13358 CUMULATIVE_ARGS local_cum;
13359 int gr_saved = cfun->va_list_gpr_size;
13360 int vr_saved = cfun->va_list_fpr_size;
13361
13362 /* The caller has advanced CUM up to, but not beyond, the last named
13363 argument. Advance a local copy of CUM past the last "real" named
13364 argument, to find out how many registers are left over. */
13365 local_cum = *cum;
13366 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13367
13368 /* Find out how many registers we need to save.
13369 Honor tree-stdarg analysis results. */
13370 if (cfun->va_list_gpr_size)
13371 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13372 cfun->va_list_gpr_size / UNITS_PER_WORD);
13373 if (cfun->va_list_fpr_size)
13374 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13375 cfun->va_list_fpr_size / UNITS_PER_VREG);
13376
13377 if (!TARGET_FLOAT)
13378 {
13379 gcc_assert (local_cum.aapcs_nvrn == 0);
13380 vr_saved = 0;
13381 }
13382
13383 if (!no_rtl)
13384 {
13385 if (gr_saved > 0)
13386 {
13387 rtx ptr, mem;
13388
13389 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13390 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13391 - gr_saved * UNITS_PER_WORD);
13392 mem = gen_frame_mem (BLKmode, ptr);
13393 set_mem_alias_set (mem, get_varargs_alias_set ());
13394
13395 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13396 mem, gr_saved);
13397 }
13398 if (vr_saved > 0)
13399 {
13400 /* We can't use move_block_from_reg, because it will use
13401 the wrong mode, storing D regs only. */
13402 machine_mode mode = TImode;
13403 int off, i, vr_start;
13404
13405 /* Set OFF to the offset from virtual_incoming_args_rtx of
13406 the first vector register. The VR save area lies below
13407 the GR one, and is aligned to 16 bytes. */
13408 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13409 STACK_BOUNDARY / BITS_PER_UNIT);
13410 off -= vr_saved * UNITS_PER_VREG;
13411
13412 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13413 for (i = 0; i < vr_saved; ++i)
13414 {
13415 rtx ptr, mem;
13416
13417 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13418 mem = gen_frame_mem (mode, ptr);
13419 set_mem_alias_set (mem, get_varargs_alias_set ());
13420 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13421 off += UNITS_PER_VREG;
13422 }
13423 }
13424 }
13425
13426 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13427 any complication of having crtl->args.pretend_args_size changed. */
13428 cfun->machine->frame.saved_varargs_size
13429 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13430 STACK_BOUNDARY / BITS_PER_UNIT)
13431 + vr_saved * UNITS_PER_VREG);
13432 }
13433
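/* Worked example (illustrative): for

     int f (int a, ...);

   the named argument consumes x0 only, so local_cum.aapcs_ncrn == 1 and,
   absent any tree-stdarg shrinking, gr_saved == 7 (x1-x7, 56 bytes) and
   vr_saved == 8 (q0-q7, 128 bytes), giving saved_varargs_size
   == ROUND_UP (56, 16) + 128 == 192 bytes.  */
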
13434 static void
13435 aarch64_conditional_register_usage (void)
13436 {
13437 int i;
13438 if (!TARGET_FLOAT)
13439 {
13440 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13441 {
13442 fixed_regs[i] = 1;
13443 call_used_regs[i] = 1;
13444 }
13445 }
13446 if (!TARGET_SVE)
13447 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13448 {
13449 fixed_regs[i] = 1;
13450 call_used_regs[i] = 1;
13451 }
13452
13453 /* When tracking speculation, we need a couple of call-clobbered registers
13454 to track the speculation state. It would be nice to just use
13455 IP0 and IP1, but currently there are numerous places that just
13456 assume these registers are free for other uses (eg pointer
13457 authentication). */
13458 if (aarch64_track_speculation)
13459 {
13460 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13461 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13462 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13463 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13464 }
13465 }
13466
13467 /* Walk down the type tree of TYPE counting consecutive base elements.
13468 If *MODEP is VOIDmode, then set it to the first valid floating point
13469 type. If a non-floating point type is found, or if a floating point
13470 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13471 otherwise return the count in the sub-tree. */
13472 static int
13473 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13474 {
13475 machine_mode mode;
13476 HOST_WIDE_INT size;
13477
13478 switch (TREE_CODE (type))
13479 {
13480 case REAL_TYPE:
13481 mode = TYPE_MODE (type);
13482 if (mode != DFmode && mode != SFmode
13483 && mode != TFmode && mode != HFmode)
13484 return -1;
13485
13486 if (*modep == VOIDmode)
13487 *modep = mode;
13488
13489 if (*modep == mode)
13490 return 1;
13491
13492 break;
13493
13494 case COMPLEX_TYPE:
13495 mode = TYPE_MODE (TREE_TYPE (type));
13496 if (mode != DFmode && mode != SFmode
13497 && mode != TFmode && mode != HFmode)
13498 return -1;
13499
13500 if (*modep == VOIDmode)
13501 *modep = mode;
13502
13503 if (*modep == mode)
13504 return 2;
13505
13506 break;
13507
13508 case VECTOR_TYPE:
13509 /* Use V2SImode and V4SImode as representatives of all 64-bit
13510 and 128-bit vector types. */
13511 size = int_size_in_bytes (type);
13512 switch (size)
13513 {
13514 case 8:
13515 mode = V2SImode;
13516 break;
13517 case 16:
13518 mode = V4SImode;
13519 break;
13520 default:
13521 return -1;
13522 }
13523
13524 if (*modep == VOIDmode)
13525 *modep = mode;
13526
13527 /* Vector modes are considered to be opaque: two vectors are
13528 equivalent for the purposes of being homogeneous aggregates
13529 if they are the same size. */
13530 if (*modep == mode)
13531 return 1;
13532
13533 break;
13534
13535 case ARRAY_TYPE:
13536 {
13537 int count;
13538 tree index = TYPE_DOMAIN (type);
13539
13540 /* Can't handle incomplete types nor sizes that are not
13541 fixed. */
13542 if (!COMPLETE_TYPE_P (type)
13543 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13544 return -1;
13545
13546 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13547 if (count == -1
13548 || !index
13549 || !TYPE_MAX_VALUE (index)
13550 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13551 || !TYPE_MIN_VALUE (index)
13552 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13553 || count < 0)
13554 return -1;
13555
13556 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13557 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13558
13559 /* There must be no padding. */
13560 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13561 count * GET_MODE_BITSIZE (*modep)))
13562 return -1;
13563
13564 return count;
13565 }
13566
13567 case RECORD_TYPE:
13568 {
13569 int count = 0;
13570 int sub_count;
13571 tree field;
13572
13573 /* Can't handle incomplete types nor sizes that are not
13574 fixed. */
13575 if (!COMPLETE_TYPE_P (type)
13576 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13577 return -1;
13578
13579 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13580 {
13581 if (TREE_CODE (field) != FIELD_DECL)
13582 continue;
13583
13584 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13585 if (sub_count < 0)
13586 return -1;
13587 count += sub_count;
13588 }
13589
13590 /* There must be no padding. */
13591 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13592 count * GET_MODE_BITSIZE (*modep)))
13593 return -1;
13594
13595 return count;
13596 }
13597
13598 case UNION_TYPE:
13599 case QUAL_UNION_TYPE:
13600 {
13601 /* These aren't very interesting except in a degenerate case. */
13602 int count = 0;
13603 int sub_count;
13604 tree field;
13605
13606 /* Can't handle incomplete types nor sizes that are not
13607 fixed. */
13608 if (!COMPLETE_TYPE_P (type)
13609 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13610 return -1;
13611
13612 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13613 {
13614 if (TREE_CODE (field) != FIELD_DECL)
13615 continue;
13616
13617 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13618 if (sub_count < 0)
13619 return -1;
13620 count = count > sub_count ? count : sub_count;
13621 }
13622
13623 /* There must be no padding. */
13624 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13625 count * GET_MODE_BITSIZE (*modep)))
13626 return -1;
13627
13628 return count;
13629 }
13630
13631 default:
13632 break;
13633 }
13634
13635 return -1;
13636 }
13637
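/* Examples (illustrative; int32x4_t is the 128-bit vector type from
   arm_neon.h):

     struct s1 { float a, b, c; };       -> 3, *modep == SFmode
     struct s2 { float a; double b; };   -> -1 (mixed base types)
     struct s3 { int32x4_t v[2]; };      -> 2, *modep == V4SImode
     struct s4 { int a; float b; };      -> -1 (non-FP member)  */
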
13638 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13639 type as described in AAPCS64 \S 4.1.2.
13640
13641 See the comment above aarch64_composite_type_p for the notes on MODE. */
13642
13643 static bool
13644 aarch64_short_vector_p (const_tree type,
13645 machine_mode mode)
13646 {
13647 poly_int64 size = -1;
13648
13649 if (type && TREE_CODE (type) == VECTOR_TYPE)
13650 size = int_size_in_bytes (type);
13651 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13652 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13653 size = GET_MODE_SIZE (mode);
13654
13655 return known_eq (size, 8) || known_eq (size, 16);
13656 }
13657
13658 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13659 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13660 array types. The C99 floating-point complex types are also considered
13661 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13662 types, which are GCC extensions and out of the scope of AAPCS64, are
13663 treated as composite types here as well.
13664
13665 Note that MODE itself is not sufficient in determining whether a type
13666 is such a composite type or not. This is because
13667 stor-layout.c:compute_record_mode may have already changed the MODE
13668 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13669 structure with only one field may have its MODE set to the mode of the
13670 field. Also an integer mode whose size matches the size of the
13671 RECORD_TYPE type may be used to substitute the original mode
13672 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13673 solely relied on. */
13674
13675 static bool
13676 aarch64_composite_type_p (const_tree type,
13677 machine_mode mode)
13678 {
13679 if (aarch64_short_vector_p (type, mode))
13680 return false;
13681
13682 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13683 return true;
13684
13685 if (mode == BLKmode
13686 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13687 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13688 return true;
13689
13690 return false;
13691 }
13692
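/* Examples (illustrative, assuming arm_neon.h vector types):

     int32x2_t, int32x4_t  -> short vector, not composite
     _Complex double       -> composite (AAPCS64 7.1.1)
     struct { int i; }     -> composite, even though compute_record_mode may
                              have given it SImode rather than BLKmode --
                              which is why MODE alone cannot be trusted.  */
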
13693 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13694 shall be passed or returned in simd/fp register(s) (providing these
13695 parameter passing registers are available).
13696
13697 Upon successful return, *COUNT returns the number of needed registers,
13698 *BASE_MODE returns the mode of the individual register and when IS_HAF
13699 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13700 floating-point aggregate or a homogeneous short-vector aggregate. */
13701
13702 static bool
13703 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13704 const_tree type,
13705 machine_mode *base_mode,
13706 int *count,
13707 bool *is_ha)
13708 {
13709 machine_mode new_mode = VOIDmode;
13710 bool composite_p = aarch64_composite_type_p (type, mode);
13711
13712 if (is_ha != NULL) *is_ha = false;
13713
13714 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13715 || aarch64_short_vector_p (type, mode))
13716 {
13717 *count = 1;
13718 new_mode = mode;
13719 }
13720 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13721 {
13722 if (is_ha != NULL) *is_ha = true;
13723 *count = 2;
13724 new_mode = GET_MODE_INNER (mode);
13725 }
13726 else if (type && composite_p)
13727 {
13728 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13729
13730 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13731 {
13732 if (is_ha != NULL) *is_ha = true;
13733 *count = ag_count;
13734 }
13735 else
13736 return false;
13737 }
13738 else
13739 return false;
13740
13741 *base_mode = new_mode;
13742 return true;
13743 }
13744
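/* Illustrative results (assuming TARGET_FLOAT):

     double                   -> true,  *count == 1, *base_mode == DFmode
     _Complex float           -> true,  *count == 2, *base_mode == SFmode, HA
     struct { double d[4]; }  -> true,  *count == 4, *base_mode == DFmode, HA
     struct { double d[5]; }  -> false (exceeds HA_MAX_NUM_FLDS)
     struct { int i; }        -> false (not a floating-point aggregate)  */
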
13745 /* Implement TARGET_STRUCT_VALUE_RTX. */
13746
13747 static rtx
13748 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13749 int incoming ATTRIBUTE_UNUSED)
13750 {
13751 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13752 }
13753
13754 /* Implements target hook vector_mode_supported_p. */
13755 static bool
13756 aarch64_vector_mode_supported_p (machine_mode mode)
13757 {
13758 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13759 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13760 }
13761
13762 /* Return appropriate SIMD container
13763 for MODE within a vector of WIDTH bits. */
13764 static machine_mode
13765 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13766 {
13767 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13768 switch (mode)
13769 {
13770 case E_DFmode:
13771 return VNx2DFmode;
13772 case E_SFmode:
13773 return VNx4SFmode;
13774 case E_HFmode:
13775 return VNx8HFmode;
13776 case E_DImode:
13777 return VNx2DImode;
13778 case E_SImode:
13779 return VNx4SImode;
13780 case E_HImode:
13781 return VNx8HImode;
13782 case E_QImode:
13783 return VNx16QImode;
13784 default:
13785 return word_mode;
13786 }
13787
13788 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13789 if (TARGET_SIMD)
13790 {
13791 if (known_eq (width, 128))
13792 switch (mode)
13793 {
13794 case E_DFmode:
13795 return V2DFmode;
13796 case E_SFmode:
13797 return V4SFmode;
13798 case E_HFmode:
13799 return V8HFmode;
13800 case E_SImode:
13801 return V4SImode;
13802 case E_HImode:
13803 return V8HImode;
13804 case E_QImode:
13805 return V16QImode;
13806 case E_DImode:
13807 return V2DImode;
13808 default:
13809 break;
13810 }
13811 else
13812 switch (mode)
13813 {
13814 case E_SFmode:
13815 return V2SFmode;
13816 case E_HFmode:
13817 return V4HFmode;
13818 case E_SImode:
13819 return V2SImode;
13820 case E_HImode:
13821 return V4HImode;
13822 case E_QImode:
13823 return V8QImode;
13824 default:
13825 break;
13826 }
13827 }
13828 return word_mode;
13829 }
13830
13831 /* Return 128-bit container as the preferred SIMD mode for MODE. */
13832 static machine_mode
13833 aarch64_preferred_simd_mode (scalar_mode mode)
13834 {
13835 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
13836 return aarch64_simd_container_mode (mode, bits);
13837 }
13838
13839 /* Return a list of possible vector sizes for the vectorizer
13840 to iterate over. */
13841 static void
13842 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
13843 {
13844 if (TARGET_SVE)
13845 sizes->safe_push (BYTES_PER_SVE_VECTOR);
13846 sizes->safe_push (16);
13847 sizes->safe_push (8);
13848 }
13849
13850 /* Implement TARGET_MANGLE_TYPE. */
13851
13852 static const char *
13853 aarch64_mangle_type (const_tree type)
13854 {
13855 /* The AArch64 ABI documents say that "__va_list" has to be
13856 mangled as if it is in the "std" namespace. */
13857 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
13858 return "St9__va_list";
13859
13860 /* Half-precision float. */
13861 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
13862 return "Dh";
13863
13864 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
13865 builtin types. */
13866 if (TYPE_NAME (type) != NULL)
13867 return aarch64_mangle_builtin_type (type);
13868
13869 /* Use the default mangling. */
13870 return NULL;
13871 }
13872
13873 /* Find the first rtx_insn before insn that will generate an assembly
13874 instruction. */
13875
13876 static rtx_insn *
13877 aarch64_prev_real_insn (rtx_insn *insn)
13878 {
13879 if (!insn)
13880 return NULL;
13881
13882 do
13883 {
13884 insn = prev_real_insn (insn);
13885 }
13886 while (insn && recog_memoized (insn) < 0);
13887
13888 return insn;
13889 }
13890
13891 static bool
13892 is_madd_op (enum attr_type t1)
13893 {
13894 unsigned int i;
13895 /* A number of these may be AArch32 only. */
13896 enum attr_type mlatypes[] = {
13897 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13898 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13899 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13900 };
13901
13902 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13903 {
13904 if (t1 == mlatypes[i])
13905 return true;
13906 }
13907
13908 return false;
13909 }
13910
13911 /* Check if there is a register dependency between a load and the insn
13912 for which we hold recog_data. */
13913
13914 static bool
13915 dep_between_memop_and_curr (rtx memop)
13916 {
13917 rtx load_reg;
13918 int opno;
13919
13920 gcc_assert (GET_CODE (memop) == SET);
13921
13922 if (!REG_P (SET_DEST (memop)))
13923 return false;
13924
13925 load_reg = SET_DEST (memop);
13926 for (opno = 1; opno < recog_data.n_operands; opno++)
13927 {
13928 rtx operand = recog_data.operand[opno];
13929 if (REG_P (operand)
13930 && reg_overlap_mentioned_p (load_reg, operand))
13931 return true;
13932
13933 }
13934 return false;
13935 }
13936
13937
13938 /* When working around the Cortex-A53 erratum 835769,
13939 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13940 instruction and has a preceding memory instruction such that a NOP
13941 should be inserted between them. */
13942
13943 bool
13944 aarch64_madd_needs_nop (rtx_insn* insn)
13945 {
13946 enum attr_type attr_type;
13947 rtx_insn *prev;
13948 rtx body;
13949
13950 if (!TARGET_FIX_ERR_A53_835769)
13951 return false;
13952
13953 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13954 return false;
13955
13956 attr_type = get_attr_type (insn);
13957 if (!is_madd_op (attr_type))
13958 return false;
13959
13960 prev = aarch64_prev_real_insn (insn);
13961 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13962 Restore recog state to INSN to avoid state corruption. */
13963 extract_constrain_insn_cached (insn);
13964
13965 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13966 return false;
13967
13968 body = single_set (prev);
13969
13970 /* If the previous insn is a memory op and there is no dependency between
13971 it and the DImode madd, emit a NOP between them. If body is NULL then we
13972 have a complex memory operation, probably a load/store pair.
13973 Be conservative for now and emit a NOP. */
13974 if (GET_MODE (recog_data.operand[0]) == DImode
13975 && (!body || !dep_between_memop_and_curr (body)))
13976 return true;
13977
13978 return false;
13979
13980 }
13981
13982
13983 /* Implement FINAL_PRESCAN_INSN. */
13984
13985 void
13986 aarch64_final_prescan_insn (rtx_insn *insn)
13987 {
13988 if (aarch64_madd_needs_nop (insn))
13989 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13990 }
13991
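/* Illustrative effect of the hook above with -mfix-cortex-a53-835769: for

     ldr  x2, [x0]
     madd x3, x4, x5, x6

   where the loaded value does not feed the multiply-accumulate, final
   emits

     ldr  x2, [x0]
     nop                        // between mem op and mult-accumulate
     madd x3, x4, x5, x6

   If the madd does consume the loaded register, no NOP is needed.  */
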
13992
13993 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13994 instruction. */
13995
13996 bool
13997 aarch64_sve_index_immediate_p (rtx base_or_step)
13998 {
13999 return (CONST_INT_P (base_or_step)
14000 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14001 }
14002
14003 /* Return true if X is a valid immediate for the SVE ADD and SUB
14004 instructions. Negate X first if NEGATE_P is true. */
14005
14006 bool
14007 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14008 {
14009 rtx elt;
14010
14011 if (!const_vec_duplicate_p (x, &elt)
14012 || !CONST_INT_P (elt))
14013 return false;
14014
14015 HOST_WIDE_INT val = INTVAL (elt);
14016 if (negate_p)
14017 val = -val;
14018 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14019
14020 if (val & 0xff)
14021 return IN_RANGE (val, 0, 0xff);
14022 return IN_RANGE (val, 0, 0xff00);
14023 }
14024
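/* Examples for the test above (illustrative): a duplicated 200 is accepted
   directly, a duplicated 0x1100 is accepted as an 8-bit immediate shifted
   left by 8 (17 << 8), and a duplicated 0x101 is rejected because it has
   bits set both inside and outside the low byte.  */
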
14025 /* Return true if X is a valid immediate operand for an SVE logical
14026 instruction such as AND. */
14027
14028 bool
14029 aarch64_sve_bitmask_immediate_p (rtx x)
14030 {
14031 rtx elt;
14032
14033 return (const_vec_duplicate_p (x, &elt)
14034 && CONST_INT_P (elt)
14035 && aarch64_bitmask_imm (INTVAL (elt),
14036 GET_MODE_INNER (GET_MODE (x))));
14037 }
14038
14039 /* Return true if X is a valid immediate for the SVE DUP and CPY
14040 instructions. */
14041
14042 bool
14043 aarch64_sve_dup_immediate_p (rtx x)
14044 {
14045 rtx elt;
14046
14047 if (!const_vec_duplicate_p (x, &elt)
14048 || !CONST_INT_P (elt))
14049 return false;
14050
14051 HOST_WIDE_INT val = INTVAL (elt);
14052 if (val & 0xff)
14053 return IN_RANGE (val, -0x80, 0x7f);
14054 return IN_RANGE (val, -0x8000, 0x7f00);
14055 }
14056
14057 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14058 SIGNED_P says whether the operand is signed rather than unsigned. */
14059
14060 bool
14061 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14062 {
14063 rtx elt;
14064
14065 return (const_vec_duplicate_p (x, &elt)
14066 && CONST_INT_P (elt)
14067 && (signed_p
14068 ? IN_RANGE (INTVAL (elt), -16, 15)
14069 : IN_RANGE (INTVAL (elt), 0, 127)));
14070 }
14071
14072 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14073 instruction. Negate X first if NEGATE_P is true. */
14074
14075 bool
14076 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14077 {
14078 rtx elt;
14079 REAL_VALUE_TYPE r;
14080
14081 if (!const_vec_duplicate_p (x, &elt)
14082 || GET_CODE (elt) != CONST_DOUBLE)
14083 return false;
14084
14085 r = *CONST_DOUBLE_REAL_VALUE (elt);
14086
14087 if (negate_p)
14088 r = real_value_negate (&r);
14089
14090 if (real_equal (&r, &dconst1))
14091 return true;
14092 if (real_equal (&r, &dconsthalf))
14093 return true;
14094 return false;
14095 }
14096
14097 /* Return true if X is a valid immediate operand for an SVE FMUL
14098 instruction. */
14099
14100 bool
14101 aarch64_sve_float_mul_immediate_p (rtx x)
14102 {
14103 rtx elt;
14104
14105 /* GCC will never generate a multiply with an immediate of 2, so there is no
14106 point testing for it (even though it is a valid constant). */
14107 return (const_vec_duplicate_p (x, &elt)
14108 && GET_CODE (elt) == CONST_DOUBLE
14109 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14110 }
14111
14112 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14113 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14114 is nonnull, use it to describe valid immediates. */
14115 static bool
14116 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14117 simd_immediate_info *info,
14118 enum simd_immediate_check which,
14119 simd_immediate_info::insn_type insn)
14120 {
14121 /* Try a 4-byte immediate with LSL. */
14122 for (unsigned int shift = 0; shift < 32; shift += 8)
14123 if ((val32 & (0xff << shift)) == val32)
14124 {
14125 if (info)
14126 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14127 simd_immediate_info::LSL, shift);
14128 return true;
14129 }
14130
14131 /* Try a 2-byte immediate with LSL. */
14132 unsigned int imm16 = val32 & 0xffff;
14133 if (imm16 == (val32 >> 16))
14134 for (unsigned int shift = 0; shift < 16; shift += 8)
14135 if ((imm16 & (0xff << shift)) == imm16)
14136 {
14137 if (info)
14138 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14139 simd_immediate_info::LSL, shift);
14140 return true;
14141 }
14142
14143 /* Try a 4-byte immediate with MSL, except for cases that MVN
14144 can handle. */
14145 if (which == AARCH64_CHECK_MOV)
14146 for (unsigned int shift = 8; shift < 24; shift += 8)
14147 {
14148 unsigned int low = (1 << shift) - 1;
14149 if (((val32 & (0xff << shift)) | low) == val32)
14150 {
14151 if (info)
14152 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14153 simd_immediate_info::MSL, shift);
14154 return true;
14155 }
14156 }
14157
14158 return false;
14159 }
14160
14161 /* Return true if replicating VAL64 is a valid immediate for the
14162 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14163 use it to describe valid immediates. */
14164 static bool
14165 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14166 simd_immediate_info *info,
14167 enum simd_immediate_check which)
14168 {
14169 unsigned int val32 = val64 & 0xffffffff;
14170 unsigned int val16 = val64 & 0xffff;
14171 unsigned int val8 = val64 & 0xff;
14172
14173 if (val32 == (val64 >> 32))
14174 {
14175 if ((which & AARCH64_CHECK_ORR) != 0
14176 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14177 simd_immediate_info::MOV))
14178 return true;
14179
14180 if ((which & AARCH64_CHECK_BIC) != 0
14181 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14182 simd_immediate_info::MVN))
14183 return true;
14184
14185 /* Try using a replicated byte. */
14186 if (which == AARCH64_CHECK_MOV
14187 && val16 == (val32 >> 16)
14188 && val8 == (val16 >> 8))
14189 {
14190 if (info)
14191 *info = simd_immediate_info (QImode, val8);
14192 return true;
14193 }
14194 }
14195
14196 /* Try using a bit-to-bytemask. */
14197 if (which == AARCH64_CHECK_MOV)
14198 {
14199 unsigned int i;
14200 for (i = 0; i < 64; i += 8)
14201 {
14202 unsigned char byte = (val64 >> i) & 0xff;
14203 if (byte != 0 && byte != 0xff)
14204 break;
14205 }
14206 if (i == 64)
14207 {
14208 if (info)
14209 *info = simd_immediate_info (DImode, val64);
14210 return true;
14211 }
14212 }
14213 return false;
14214 }
14215
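/* Examples (illustrative), for a constant replicated across the vector:

     0x41                -> valid, 4-byte MOV/MOVI with no shift
     0x4100              -> valid, with LSL #8
     0xffffffbe          -> valid via the MVN form (~0xffffffbe == 0x41)
     0x00ff00ff00ff00ff  -> valid via the byte-mask form (each byte is
                            0x00 or 0xff)
     0x1234              -> not representable by any of the forms above.  */
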
14216 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14217 instruction. If INFO is nonnull, use it to describe valid immediates. */
14218
14219 static bool
14220 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14221 simd_immediate_info *info)
14222 {
14223 scalar_int_mode mode = DImode;
14224 unsigned int val32 = val64 & 0xffffffff;
14225 if (val32 == (val64 >> 32))
14226 {
14227 mode = SImode;
14228 unsigned int val16 = val32 & 0xffff;
14229 if (val16 == (val32 >> 16))
14230 {
14231 mode = HImode;
14232 unsigned int val8 = val16 & 0xff;
14233 if (val8 == (val16 >> 8))
14234 mode = QImode;
14235 }
14236 }
14237 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14238 if (IN_RANGE (val, -0x80, 0x7f))
14239 {
14240 /* DUP with no shift. */
14241 if (info)
14242 *info = simd_immediate_info (mode, val);
14243 return true;
14244 }
14245 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14246 {
14247 /* DUP with LSL #8. */
14248 if (info)
14249 *info = simd_immediate_info (mode, val);
14250 return true;
14251 }
14252 if (aarch64_bitmask_imm (val64, mode))
14253 {
14254 /* DUPM. */
14255 if (info)
14256 *info = simd_immediate_info (mode, val);
14257 return true;
14258 }
14259 return false;
14260 }
14261
14262 /* Return true if OP is a valid SIMD immediate for the operation
14263 described by WHICH. If INFO is nonnull, use it to describe valid
14264 immediates. */
14265 bool
14266 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14267 enum simd_immediate_check which)
14268 {
14269 machine_mode mode = GET_MODE (op);
14270 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14271 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14272 return false;
14273
14274 scalar_mode elt_mode = GET_MODE_INNER (mode);
14275 rtx base, step;
14276 unsigned int n_elts;
14277 if (GET_CODE (op) == CONST_VECTOR
14278 && CONST_VECTOR_DUPLICATE_P (op))
14279 n_elts = CONST_VECTOR_NPATTERNS (op);
14280 else if ((vec_flags & VEC_SVE_DATA)
14281 && const_vec_series_p (op, &base, &step))
14282 {
14283 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14284 if (!aarch64_sve_index_immediate_p (base)
14285 || !aarch64_sve_index_immediate_p (step))
14286 return false;
14287
14288 if (info)
14289 *info = simd_immediate_info (elt_mode, base, step);
14290 return true;
14291 }
14292 else if (GET_CODE (op) == CONST_VECTOR
14293 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14294 /* N_ELTS set above. */;
14295 else
14296 return false;
14297
14298 /* Handle PFALSE and PTRUE. */
14299 if (vec_flags & VEC_SVE_PRED)
14300 return (op == CONST0_RTX (mode)
14301 || op == CONSTM1_RTX (mode));
14302
14303 scalar_float_mode elt_float_mode;
14304 if (n_elts == 1
14305 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14306 {
14307 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14308 if (aarch64_float_const_zero_rtx_p (elt)
14309 || aarch64_float_const_representable_p (elt))
14310 {
14311 if (info)
14312 *info = simd_immediate_info (elt_float_mode, elt);
14313 return true;
14314 }
14315 }
14316
14317 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14318 if (elt_size > 8)
14319 return false;
14320
14321 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14322
14323 /* Expand the vector constant out into a byte vector, with the least
14324 significant byte of the register first. */
14325 auto_vec<unsigned char, 16> bytes;
14326 bytes.reserve (n_elts * elt_size);
14327 for (unsigned int i = 0; i < n_elts; i++)
14328 {
14329 /* The vector is provided in GCC's endian-neutral fashion.
14330 For aarch64_be Advanced SIMD, it must be laid out in the vector
14331 register in reverse order. */
14332 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14333 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14334
14335 if (elt_mode != elt_int_mode)
14336 elt = gen_lowpart (elt_int_mode, elt);
14337
14338 if (!CONST_INT_P (elt))
14339 return false;
14340
14341 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14342 for (unsigned int byte = 0; byte < elt_size; byte++)
14343 {
14344 bytes.quick_push (elt_val & 0xff);
14345 elt_val >>= BITS_PER_UNIT;
14346 }
14347 }
14348
14349 /* The immediate must repeat every eight bytes. */
14350 unsigned int nbytes = bytes.length ();
14351 for (unsigned i = 8; i < nbytes; ++i)
14352 if (bytes[i] != bytes[i - 8])
14353 return false;
14354
14355 /* Get the repeating 8-byte value as an integer. No endian correction
14356 is needed here because bytes is already in lsb-first order. */
14357 unsigned HOST_WIDE_INT val64 = 0;
14358 for (unsigned int i = 0; i < 8; i++)
14359 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14360 << (i * BITS_PER_UNIT));
14361
14362 if (vec_flags & VEC_SVE_DATA)
14363 return aarch64_sve_valid_immediate (val64, info);
14364 else
14365 return aarch64_advsimd_valid_immediate (val64, info, which);
14366 }
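/* As an illustration of the byte expansion above: a V4HImode constant
   whose elements are all 0x1234 expands to the lsb-first byte sequence
   34 12 34 12 34 12 34 12, which trivially repeats every eight bytes,
   so VAL64 becomes 0x1234123412341234 before being handed to the
   Advanced SIMD or SVE immediate check.  */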
14367
14368 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14369 has a step in the range of INDEX. Return the index expression if so,
14370 otherwise return null. */
14371 rtx
14372 aarch64_check_zero_based_sve_index_immediate (rtx x)
14373 {
14374 rtx base, step;
14375 if (const_vec_series_p (x, &base, &step)
14376 && base == const0_rtx
14377 && aarch64_sve_index_immediate_p (step))
14378 return step;
14379 return NULL_RTX;
14380 }
14381
14382 /* Check that immediate shift constants are within range. */
14383 bool
14384 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14385 {
14386 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14387 if (left)
14388 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14389 else
14390 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14391 }
14392
14393 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14394 operation of width WIDTH at bit position POS. */
14395
14396 rtx
14397 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14398 {
14399 gcc_assert (CONST_INT_P (width));
14400 gcc_assert (CONST_INT_P (pos));
14401
14402 unsigned HOST_WIDE_INT mask
14403 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14404 return GEN_INT (mask << UINTVAL (pos));
14405 }
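/* For example (illustrative values only), WIDTH == 8 and POS == 16
   yield the mask ((1 << 8) - 1) << 16 == 0xff0000, selecting bits
   16..23.  */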
14406
14407 bool
14408 aarch64_mov_operand_p (rtx x, machine_mode mode)
14409 {
14410 if (GET_CODE (x) == HIGH
14411 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14412 return true;
14413
14414 if (CONST_INT_P (x))
14415 return true;
14416
14417 if (VECTOR_MODE_P (GET_MODE (x)))
14418 return aarch64_simd_valid_immediate (x, NULL);
14419
14420 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14421 return true;
14422
14423 if (aarch64_sve_cnt_immediate_p (x))
14424 return true;
14425
14426 return aarch64_classify_symbolic_expression (x)
14427 == SYMBOL_TINY_ABSOLUTE;
14428 }
14429
14430 /* Return a const_int vector of VAL. */
14431 rtx
14432 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14433 {
14434 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14435 return gen_const_vec_duplicate (mode, c);
14436 }
14437
14438 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14439
14440 bool
14441 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14442 {
14443 machine_mode vmode;
14444
14445 vmode = aarch64_simd_container_mode (mode, 64);
14446 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14447 return aarch64_simd_valid_immediate (op_v, NULL);
14448 }
14449
14450 /* Construct and return a PARALLEL RTX vector with elements numbering the
14451 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14452 the vector - from the perspective of the architecture. This does not
14453 line up with GCC's perspective on lane numbers, so we end up with
14454 different masks depending on our target endianness. The diagram
14455 below may help. We must draw the distinction when building masks
14456 which select one half of the vector. An instruction selecting
14457 architectural low-lanes for a big-endian target must be described using
14458 a mask selecting GCC high-lanes.
14459
14460 Big-Endian Little-Endian
14461
14462 GCC 0 1 2 3 3 2 1 0
14463 | x | x | x | x | | x | x | x | x |
14464 Architecture 3 2 1 0 3 2 1 0
14465
14466 Low Mask: { 2, 3 } { 0, 1 }
14467 High Mask: { 0, 1 } { 2, 3 }
14468
14469 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14470
14471 rtx
14472 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14473 {
14474 rtvec v = rtvec_alloc (nunits / 2);
14475 int high_base = nunits / 2;
14476 int low_base = 0;
14477 int base;
14478 rtx t1;
14479 int i;
14480
14481 if (BYTES_BIG_ENDIAN)
14482 base = high ? low_base : high_base;
14483 else
14484 base = high ? high_base : low_base;
14485
14486 for (i = 0; i < nunits / 2; i++)
14487 RTVEC_ELT (v, i) = GEN_INT (base + i);
14488
14489 t1 = gen_rtx_PARALLEL (mode, v);
14490 return t1;
14491 }
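/* For instance, with MODE == V4SImode (NUNITS == 4) and HIGH == true, a
   little-endian target gets (parallel [(const_int 2) (const_int 3)])
   while a big-endian target gets (parallel [(const_int 0) (const_int 1)]),
   matching the "High Mask" row of the diagram above.  */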
14492
14493 /* Check OP for validity as a PARALLEL RTX vector with elements
14494 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14495 from the perspective of the architecture. See the diagram above
14496 aarch64_simd_vect_par_cnst_half for more details. */
14497
14498 bool
14499 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14500 bool high)
14501 {
14502 int nelts;
14503 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14504 return false;
14505
14506 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14507 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14508 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14509 int i = 0;
14510
14511 if (count_op != count_ideal)
14512 return false;
14513
14514 for (i = 0; i < count_ideal; i++)
14515 {
14516 rtx elt_op = XVECEXP (op, 0, i);
14517 rtx elt_ideal = XVECEXP (ideal, 0, i);
14518
14519 if (!CONST_INT_P (elt_op)
14520 || INTVAL (elt_ideal) != INTVAL (elt_op))
14521 return false;
14522 }
14523 return true;
14524 }
14525
14526 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14527 HIGH (exclusive). */
14528 void
14529 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14530 const_tree exp)
14531 {
14532 HOST_WIDE_INT lane;
14533 gcc_assert (CONST_INT_P (operand));
14534 lane = INTVAL (operand);
14535
14536 if (lane < low || lane >= high)
14537 {
14538 if (exp)
14539 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14540 else
14541 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14542 }
14543 }
14544
14545 /* Perform endian correction on lane number N, which indexes a vector
14546 of mode MODE, and return the result as an SImode rtx. */
14547
14548 rtx
14549 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14550 {
14551 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14552 }
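/* For example, lane 1 of a V4SImode vector stays lane 1 on a
   little-endian target but becomes lane 4 - 1 - 1 == 2 on a big-endian
   target, since ENDIAN_LANE_N reverses the index when
   BYTES_BIG_ENDIAN.  */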
14553
14554 /* Return TRUE if OP is a valid vector addressing mode. */
14555
14556 bool
14557 aarch64_simd_mem_operand_p (rtx op)
14558 {
14559 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14560 || REG_P (XEXP (op, 0)));
14561 }
14562
14563 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14564
14565 bool
14566 aarch64_sve_ld1r_operand_p (rtx op)
14567 {
14568 struct aarch64_address_info addr;
14569 scalar_mode mode;
14570
14571 return (MEM_P (op)
14572 && is_a <scalar_mode> (GET_MODE (op), &mode)
14573 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14574 && addr.type == ADDRESS_REG_IMM
14575 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14576 }
14577
14578 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14579 The conditions for STR are the same. */
14580 bool
14581 aarch64_sve_ldr_operand_p (rtx op)
14582 {
14583 struct aarch64_address_info addr;
14584
14585 return (MEM_P (op)
14586 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14587 false, ADDR_QUERY_ANY)
14588 && addr.type == ADDRESS_REG_IMM);
14589 }
14590
14591 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14592 We need to be able to access the individual pieces, so the range
14593 is different from LD[234] and ST[234]. */
14594 bool
14595 aarch64_sve_struct_memory_operand_p (rtx op)
14596 {
14597 if (!MEM_P (op))
14598 return false;
14599
14600 machine_mode mode = GET_MODE (op);
14601 struct aarch64_address_info addr;
14602 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14603 ADDR_QUERY_ANY)
14604 || addr.type != ADDRESS_REG_IMM)
14605 return false;
14606
14607 poly_int64 first = addr.const_offset;
14608 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14609 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14610 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14611 }
14612
14613 /* Emit a register copy from operand to operand, taking care not to
14614 early-clobber source registers in the process.
14615
14616 COUNT is the number of components into which the copy needs to be
14617 decomposed. */
14618 void
14619 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14620 unsigned int count)
14621 {
14622 unsigned int i;
14623 int rdest = REGNO (operands[0]);
14624 int rsrc = REGNO (operands[1]);
14625
14626 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14627 || rdest < rsrc)
14628 for (i = 0; i < count; i++)
14629 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14630 gen_rtx_REG (mode, rsrc + i));
14631 else
14632 for (i = 0; i < count; i++)
14633 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14634 gen_rtx_REG (mode, rsrc + count - i - 1));
14635 }
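/* For example, copying a two-register group from {V1, V2} to {V2, V3}
   overlaps and has RDEST > RSRC, so the second loop above copies V3
   from V2 first and only then V2 from V1, ensuring V2 is read before it
   is overwritten.  (Register names are illustrative.)  */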
14636
14637 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14638 one of VSTRUCT modes: OI, CI, or XI. */
14639 int
14640 aarch64_simd_attr_length_rglist (machine_mode mode)
14641 {
14642 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14643 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14644 }
14645
14646 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14647 alignment of a vector to 128 bits. SVE predicates have an alignment of
14648 16 bits. */
14649 static HOST_WIDE_INT
14650 aarch64_simd_vector_alignment (const_tree type)
14651 {
14652 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14653 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14654 be set for non-predicate vectors of booleans. Modes are the most
14655 direct way we have of identifying real SVE predicate types. */
14656 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14657 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14658 return MIN (align, 128);
14659 }
14660
14661 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14662 static poly_uint64
14663 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14664 {
14665 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14666 {
14667 /* If the length of the vector is fixed, try to align to that length,
14668 otherwise don't try to align at all. */
14669 HOST_WIDE_INT result;
14670 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14671 result = TYPE_ALIGN (TREE_TYPE (type));
14672 return result;
14673 }
14674 return TYPE_ALIGN (type);
14675 }
14676
14677 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14678 static bool
14679 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14680 {
14681 if (is_packed)
14682 return false;
14683
14684 /* For fixed-length vectors, check that the vectorizer will aim for
14685 full-vector alignment. This isn't true for generic GCC vectors
14686 that are wider than the ABI maximum of 128 bits. */
14687 poly_uint64 preferred_alignment =
14688 aarch64_vectorize_preferred_vector_alignment (type);
14689 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14690 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14691 preferred_alignment))
14692 return false;
14693
14694 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14695 return true;
14696 }
14697
14698 /* Return true if the vector misalignment factor is supported by the
14699 target. */
14700 static bool
14701 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14702 const_tree type, int misalignment,
14703 bool is_packed)
14704 {
14705 if (TARGET_SIMD && STRICT_ALIGNMENT)
14706 {
14707 /* Return if movmisalign pattern is not supported for this mode. */
14708 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14709 return false;
14710
14711 /* Misalignment factor is unknown at compile time. */
14712 if (misalignment == -1)
14713 return false;
14714 }
14715 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14716 is_packed);
14717 }
14718
14719 /* If VALS is a vector constant that can be loaded into a register
14720 using DUP, generate instructions to do so and return an RTX to
14721 assign to the register. Otherwise return NULL_RTX. */
14722 static rtx
14723 aarch64_simd_dup_constant (rtx vals)
14724 {
14725 machine_mode mode = GET_MODE (vals);
14726 machine_mode inner_mode = GET_MODE_INNER (mode);
14727 rtx x;
14728
14729 if (!const_vec_duplicate_p (vals, &x))
14730 return NULL_RTX;
14731
14732 /* We can load this constant by using DUP and a constant in a
14733 single ARM register. This will be cheaper than a vector
14734 load. */
14735 x = copy_to_mode_reg (inner_mode, x);
14736 return gen_vec_duplicate (mode, x);
14737 }
14738
14739
14740 /* Generate code to load VALS, which is a PARALLEL containing only
14741 constants (for vec_init) or CONST_VECTOR, efficiently into a
14742 register. Returns an RTX to copy into the register, or NULL_RTX
14743 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14744 static rtx
14745 aarch64_simd_make_constant (rtx vals)
14746 {
14747 machine_mode mode = GET_MODE (vals);
14748 rtx const_dup;
14749 rtx const_vec = NULL_RTX;
14750 int n_const = 0;
14751 int i;
14752
14753 if (GET_CODE (vals) == CONST_VECTOR)
14754 const_vec = vals;
14755 else if (GET_CODE (vals) == PARALLEL)
14756 {
14757 /* A CONST_VECTOR must contain only CONST_INTs and
14758 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14759 Only store valid constants in a CONST_VECTOR. */
14760 int n_elts = XVECLEN (vals, 0);
14761 for (i = 0; i < n_elts; ++i)
14762 {
14763 rtx x = XVECEXP (vals, 0, i);
14764 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14765 n_const++;
14766 }
14767 if (n_const == n_elts)
14768 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14769 }
14770 else
14771 gcc_unreachable ();
14772
14773 if (const_vec != NULL_RTX
14774 && aarch64_simd_valid_immediate (const_vec, NULL))
14775 /* Load using MOVI/MVNI. */
14776 return const_vec;
14777 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14778 /* Loaded using DUP. */
14779 return const_dup;
14780 else if (const_vec != NULL_RTX)
14781 /* Load from constant pool. We cannot take advantage of single-cycle
14782 LD1 because we need a PC-relative addressing mode. */
14783 return const_vec;
14784 else
14785 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14786 We cannot construct an initializer. */
14787 return NULL_RTX;
14788 }
14789
14790 /* Expand a vector initialisation sequence, such that TARGET is
14791 initialised to contain VALS. */
14792
14793 void
14794 aarch64_expand_vector_init (rtx target, rtx vals)
14795 {
14796 machine_mode mode = GET_MODE (target);
14797 scalar_mode inner_mode = GET_MODE_INNER (mode);
14798 /* The number of vector elements. */
14799 int n_elts = XVECLEN (vals, 0);
14800 /* The number of vector elements which are not constant. */
14801 int n_var = 0;
14802 rtx any_const = NULL_RTX;
14803 /* The first element of vals. */
14804 rtx v0 = XVECEXP (vals, 0, 0);
14805 bool all_same = true;
14806
14807 /* Count the number of variable elements to initialise. */
14808 for (int i = 0; i < n_elts; ++i)
14809 {
14810 rtx x = XVECEXP (vals, 0, i);
14811 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
14812 ++n_var;
14813 else
14814 any_const = x;
14815
14816 all_same &= rtx_equal_p (x, v0);
14817 }
14818
14819 /* No variable elements, hand off to aarch64_simd_make_constant which knows
14820 how best to handle this. */
14821 if (n_var == 0)
14822 {
14823 rtx constant = aarch64_simd_make_constant (vals);
14824 if (constant != NULL_RTX)
14825 {
14826 emit_move_insn (target, constant);
14827 return;
14828 }
14829 }
14830
14831 /* Splat a single non-constant element if we can. */
14832 if (all_same)
14833 {
14834 rtx x = copy_to_mode_reg (inner_mode, v0);
14835 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14836 return;
14837 }
14838
14839 enum insn_code icode = optab_handler (vec_set_optab, mode);
14840 gcc_assert (icode != CODE_FOR_nothing);
14841
14842 /* If there are only variable elements, try to optimize
14843 the insertion using dup for the most common element
14844 followed by insertions. */
14845
14846 /* The algorithm will fill matches[*][0] with the earliest matching element,
14847 and matches[X][1] with the count of duplicate elements (if X is the
14848 earliest element which has duplicates). */
14849
14850 if (n_var == n_elts && n_elts <= 16)
14851 {
14852 int matches[16][2] = {0};
14853 for (int i = 0; i < n_elts; i++)
14854 {
14855 for (int j = 0; j <= i; j++)
14856 {
14857 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
14858 {
14859 matches[i][0] = j;
14860 matches[j][1]++;
14861 break;
14862 }
14863 }
14864 }
14865 int maxelement = 0;
14866 int maxv = 0;
14867 for (int i = 0; i < n_elts; i++)
14868 if (matches[i][1] > maxv)
14869 {
14870 maxelement = i;
14871 maxv = matches[i][1];
14872 }
14873
14874 /* Create a duplicate of the most common element, unless all elements
14875 are equally useless to us, in which case just immediately set the
14876 vector register using the first element. */
14877
14878 if (maxv == 1)
14879 {
14880 /* For vectors of two 64-bit elements, we can do even better. */
14881 if (n_elts == 2
14882 && (inner_mode == E_DImode
14883 || inner_mode == E_DFmode))
14884
14885 {
14886 rtx x0 = XVECEXP (vals, 0, 0);
14887 rtx x1 = XVECEXP (vals, 0, 1);
14888 /* Combine can pick up this case, but handling it directly
14889 here leaves clearer RTL.
14890
14891 This is load_pair_lanes<mode>, and also gives us a clean-up
14892 for store_pair_lanes<mode>. */
14893 if (memory_operand (x0, inner_mode)
14894 && memory_operand (x1, inner_mode)
14895 && !STRICT_ALIGNMENT
14896 && rtx_equal_p (XEXP (x1, 0),
14897 plus_constant (Pmode,
14898 XEXP (x0, 0),
14899 GET_MODE_SIZE (inner_mode))))
14900 {
14901 rtx t;
14902 if (inner_mode == DFmode)
14903 t = gen_load_pair_lanesdf (target, x0, x1);
14904 else
14905 t = gen_load_pair_lanesdi (target, x0, x1);
14906 emit_insn (t);
14907 return;
14908 }
14909 }
14910 /* The subreg-move sequence below will move into lane zero of the
14911 vector register. For big-endian we want that position to hold
14912 the last element of VALS. */
14913 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14914 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14915 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14916 }
14917 else
14918 {
14919 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14920 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14921 }
14922
14923 /* Insert the rest. */
14924 for (int i = 0; i < n_elts; i++)
14925 {
14926 rtx x = XVECEXP (vals, 0, i);
14927 if (matches[i][0] == maxelement)
14928 continue;
14929 x = copy_to_mode_reg (inner_mode, x);
14930 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14931 }
14932 return;
14933 }
14934
14935 /* Initialise a vector which is part-variable. We want to first try
14936 to build those lanes which are constant in the most efficient way we
14937 can. */
14938 if (n_var != n_elts)
14939 {
14940 rtx copy = copy_rtx (vals);
14941
14942 /* Load constant part of vector. We really don't care what goes into the
14943 parts we will overwrite, but we're more likely to be able to load the
14944 constant efficiently if it has fewer, larger, repeating parts
14945 (see aarch64_simd_valid_immediate). */
14946 for (int i = 0; i < n_elts; i++)
14947 {
14948 rtx x = XVECEXP (vals, 0, i);
14949 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14950 continue;
14951 rtx subst = any_const;
14952 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14953 {
14954 /* Look in the copied vector, as more elements are const. */
14955 rtx test = XVECEXP (copy, 0, i ^ bit);
14956 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14957 {
14958 subst = test;
14959 break;
14960 }
14961 }
14962 XVECEXP (copy, 0, i) = subst;
14963 }
14964 aarch64_expand_vector_init (target, copy);
14965 }
14966
14967 /* Insert the variable lanes directly. */
14968 for (int i = 0; i < n_elts; i++)
14969 {
14970 rtx x = XVECEXP (vals, 0, i);
14971 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14972 continue;
14973 x = copy_to_mode_reg (inner_mode, x);
14974 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14975 }
14976 }
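/* As a sketch of the all-variable path above: initialising a V4SImode
   vector from {x, y, x, x} finds x to be the most common element (three
   matches), so the code emits a DUP of x into TARGET followed by a
   single vec_set inserting y into lane 1.  */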
14977
14978 static unsigned HOST_WIDE_INT
14979 aarch64_shift_truncation_mask (machine_mode mode)
14980 {
14981 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14982 return 0;
14983 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14984 }
14985
14986 /* Select a format to encode pointers in exception handling data. */
14987 int
14988 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14989 {
14990 int type;
14991 switch (aarch64_cmodel)
14992 {
14993 case AARCH64_CMODEL_TINY:
14994 case AARCH64_CMODEL_TINY_PIC:
14995 case AARCH64_CMODEL_SMALL:
14996 case AARCH64_CMODEL_SMALL_PIC:
14997 case AARCH64_CMODEL_SMALL_SPIC:
14998 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14999 for everything. */
15000 type = DW_EH_PE_sdata4;
15001 break;
15002 default:
15003 /* No assumptions here. 8-byte relocs required. */
15004 type = DW_EH_PE_sdata8;
15005 break;
15006 }
15007 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15008 }
15009
15010 /* The last .arch and .tune assembly strings that we printed. */
15011 static std::string aarch64_last_printed_arch_string;
15012 static std::string aarch64_last_printed_tune_string;
15013
15014 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15015 by the function fndecl. */
15016
15017 void
15018 aarch64_declare_function_name (FILE *stream, const char* name,
15019 tree fndecl)
15020 {
15021 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15022
15023 struct cl_target_option *targ_options;
15024 if (target_parts)
15025 targ_options = TREE_TARGET_OPTION (target_parts);
15026 else
15027 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15028 gcc_assert (targ_options);
15029
15030 const struct processor *this_arch
15031 = aarch64_get_arch (targ_options->x_explicit_arch);
15032
15033 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15034 std::string extension
15035 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15036 this_arch->flags);
15037 /* Only update the assembler .arch string if it is distinct from the last
15038 such string we printed. */
15039 std::string to_print = this_arch->name + extension;
15040 if (to_print != aarch64_last_printed_arch_string)
15041 {
15042 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15043 aarch64_last_printed_arch_string = to_print;
15044 }
15045
15046 /* Print the CPU name we're tuning for in the comments; it might be
15047 useful to readers of the generated asm. Do it only when it changes
15048 from function to function and verbose assembly is requested. */
15049 const struct processor *this_tune
15050 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15051
15052 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15053 {
15054 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15055 this_tune->name);
15056 aarch64_last_printed_tune_string = this_tune->name;
15057 }
15058
15059 /* Don't forget the type directive for ELF. */
15060 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15061 ASM_OUTPUT_LABEL (stream, name);
15062 }
15063
15064 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15065
15066 static void
15067 aarch64_start_file (void)
15068 {
15069 struct cl_target_option *default_options
15070 = TREE_TARGET_OPTION (target_option_default_node);
15071
15072 const struct processor *default_arch
15073 = aarch64_get_arch (default_options->x_explicit_arch);
15074 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15075 std::string extension
15076 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15077 default_arch->flags);
15078
15079 aarch64_last_printed_arch_string = default_arch->name + extension;
15080 aarch64_last_printed_tune_string = "";
15081 asm_fprintf (asm_out_file, "\t.arch %s\n",
15082 aarch64_last_printed_arch_string.c_str ());
15083
15084 default_file_start ();
15085 }
15086
15087 /* Emit load exclusive. */
15088
15089 static void
15090 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15091 rtx mem, rtx model_rtx)
15092 {
15093 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15094 }
15095
15096 /* Emit store exclusive. */
15097
15098 static void
15099 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15100 rtx rval, rtx mem, rtx model_rtx)
15101 {
15102 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15103 }
15104
15105 /* Emit INSN as a jump instruction and mark it as unlikely to be taken. */
15106
15107 static void
15108 aarch64_emit_unlikely_jump (rtx insn)
15109 {
15110 rtx_insn *jump = emit_jump_insn (insn);
15111 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15112 }
15113
15114 /* Expand a compare and swap pattern. */
15115
15116 void
15117 aarch64_expand_compare_and_swap (rtx operands[])
15118 {
15119 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15120 machine_mode mode, r_mode;
15121
15122 bval = operands[0];
15123 rval = operands[1];
15124 mem = operands[2];
15125 oldval = operands[3];
15126 newval = operands[4];
15127 is_weak = operands[5];
15128 mod_s = operands[6];
15129 mod_f = operands[7];
15130 mode = GET_MODE (mem);
15131
15132 /* Normally the succ memory model must be stronger than fail, but in the
15133 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15134 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15135 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15136 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15137 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15138
15139 r_mode = mode;
15140 if (mode == QImode || mode == HImode)
15141 {
15142 r_mode = SImode;
15143 rval = gen_reg_rtx (r_mode);
15144 }
15145
15146 if (TARGET_LSE)
15147 {
15148 /* The CAS insn requires oldval and rval overlap, but we need to
15149 have a copy of oldval saved across the operation to tell if
15150 the operation is successful. */
15151 if (reg_overlap_mentioned_p (rval, oldval))
15152 rval = copy_to_mode_reg (r_mode, oldval);
15153 else
15154 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15155
15156 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15157 newval, mod_s));
15158 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15159 }
15160 else
15161 {
15162 /* The oldval predicate varies by mode. Test it and force to reg. */
15163 insn_code code = code_for_aarch64_compare_and_swap (mode);
15164 if (!insn_data[code].operand[2].predicate (oldval, mode))
15165 oldval = force_reg (mode, oldval);
15166
15167 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15168 is_weak, mod_s, mod_f));
15169 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15170 }
15171
15172 if (r_mode != mode)
15173 rval = gen_lowpart (mode, rval);
15174 emit_move_insn (operands[1], rval);
15175
15176 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15177 emit_insn (gen_rtx_SET (bval, x));
15178 }
15179
15180 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15181 sequence implementing an atomic operation. */
15182
15183 static void
15184 aarch64_emit_post_barrier (enum memmodel model)
15185 {
15186 const enum memmodel base_model = memmodel_base (model);
15187
15188 if (is_mm_sync (model)
15189 && (base_model == MEMMODEL_ACQUIRE
15190 || base_model == MEMMODEL_ACQ_REL
15191 || base_model == MEMMODEL_SEQ_CST))
15192 {
15193 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15194 }
15195 }
15196
15197 /* Split a compare and swap pattern. */
15198
15199 void
15200 aarch64_split_compare_and_swap (rtx operands[])
15201 {
15202 rtx rval, mem, oldval, newval, scratch;
15203 machine_mode mode;
15204 bool is_weak;
15205 rtx_code_label *label1, *label2;
15206 rtx x, cond;
15207 enum memmodel model;
15208 rtx model_rtx;
15209
15210 rval = operands[0];
15211 mem = operands[1];
15212 oldval = operands[2];
15213 newval = operands[3];
15214 is_weak = (operands[4] != const0_rtx);
15215 model_rtx = operands[5];
15216 scratch = operands[7];
15217 mode = GET_MODE (mem);
15218 model = memmodel_from_int (INTVAL (model_rtx));
15219
15220 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15221 loop:
15222 .label1:
15223 LD[A]XR rval, [mem]
15224 CBNZ rval, .label2
15225 ST[L]XR scratch, newval, [mem]
15226 CBNZ scratch, .label1
15227 .label2:
15228 CMP rval, 0. */
15229 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15230
15231 label1 = NULL;
15232 if (!is_weak)
15233 {
15234 label1 = gen_label_rtx ();
15235 emit_label (label1);
15236 }
15237 label2 = gen_label_rtx ();
15238
15239 /* The initial load can be relaxed for a __sync operation since a final
15240 barrier will be emitted to stop code hoisting. */
15241 if (is_mm_sync (model))
15242 aarch64_emit_load_exclusive (mode, rval, mem,
15243 GEN_INT (MEMMODEL_RELAXED));
15244 else
15245 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15246
15247 if (strong_zero_p)
15248 {
15249 if (aarch64_track_speculation)
15250 {
15251 /* Emit an explicit compare instruction, so that we can correctly
15252 track the condition codes. */
15253 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15254 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15255 }
15256 else
15257 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15258
15259 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15260 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15261 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15262 }
15263 else
15264 {
15265 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15266 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15267 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15268 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15269 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15270 }
15271
15272 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15273
15274 if (!is_weak)
15275 {
15276 if (aarch64_track_speculation)
15277 {
15278 /* Emit an explicit compare instruction, so that we can correctly
15279 track the condition codes. */
15280 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15281 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15282 }
15283 else
15284 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15285
15286 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15287 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15288 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15289 }
15290 else
15291 {
15292 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15293 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15294 emit_insn (gen_rtx_SET (cond, x));
15295 }
15296
15297 emit_label (label2);
15298 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15299 to set the condition flags. If this is not used it will be removed by
15300 later passes. */
15301 if (strong_zero_p)
15302 {
15303 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15304 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15305 emit_insn (gen_rtx_SET (cond, x));
15306 }
15307 /* Emit any final barrier needed for a __sync operation. */
15308 if (is_mm_sync (model))
15309 aarch64_emit_post_barrier (model);
15310 }
15311
15312 /* Split an atomic operation. */
15313
15314 void
15315 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15316 rtx value, rtx model_rtx, rtx cond)
15317 {
15318 machine_mode mode = GET_MODE (mem);
15319 machine_mode wmode = (mode == DImode ? DImode : SImode);
15320 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15321 const bool is_sync = is_mm_sync (model);
15322 rtx_code_label *label;
15323 rtx x;
15324
15325 /* Split the atomic operation into a sequence. */
15326 label = gen_label_rtx ();
15327 emit_label (label);
15328
15329 if (new_out)
15330 new_out = gen_lowpart (wmode, new_out);
15331 if (old_out)
15332 old_out = gen_lowpart (wmode, old_out);
15333 else
15334 old_out = new_out;
15335 value = simplify_gen_subreg (wmode, value, mode, 0);
15336
15337 /* The initial load can be relaxed for a __sync operation since a final
15338 barrier will be emitted to stop code hoisting. */
15339 if (is_sync)
15340 aarch64_emit_load_exclusive (mode, old_out, mem,
15341 GEN_INT (MEMMODEL_RELAXED));
15342 else
15343 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15344
15345 switch (code)
15346 {
15347 case SET:
15348 new_out = value;
15349 break;
15350
15351 case NOT:
15352 x = gen_rtx_AND (wmode, old_out, value);
15353 emit_insn (gen_rtx_SET (new_out, x));
15354 x = gen_rtx_NOT (wmode, new_out);
15355 emit_insn (gen_rtx_SET (new_out, x));
15356 break;
15357
15358 case MINUS:
15359 if (CONST_INT_P (value))
15360 {
15361 value = GEN_INT (-INTVAL (value));
15362 code = PLUS;
15363 }
15364 /* Fall through. */
15365
15366 default:
15367 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15368 emit_insn (gen_rtx_SET (new_out, x));
15369 break;
15370 }
15371
15372 aarch64_emit_store_exclusive (mode, cond, mem,
15373 gen_lowpart (mode, new_out), model_rtx);
15374
15375 if (aarch64_track_speculation)
15376 {
15377 /* Emit an explicit compare instruction, so that we can correctly
15378 track the condition codes. */
15379 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15380 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15381 }
15382 else
15383 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15384
15385 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15386 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15387 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15388
15389 /* Emit any final barrier needed for a __sync operation. */
15390 if (is_sync)
15391 aarch64_emit_post_barrier (model);
15392 }
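/* The sequence emitted above for, say, a 32-bit fetch-and-add is
   roughly (register names illustrative; acquire/release forms of the
   exclusives are selected according to MODEL):

	.Lretry:
	ld[a]xr	w0, [mem]
	add	w1, w0, value
	st[l]xr	w2, w1, [mem]
	cbnz	w2, .Lretry  */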
15393
15394 static void
15395 aarch64_init_libfuncs (void)
15396 {
15397 /* Half-precision float operations. The compiler handles all operations
15398 with NULL libfuncs by converting to SFmode. */
15399
15400 /* Conversions. */
15401 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15402 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15403
15404 /* Arithmetic. */
15405 set_optab_libfunc (add_optab, HFmode, NULL);
15406 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15407 set_optab_libfunc (smul_optab, HFmode, NULL);
15408 set_optab_libfunc (neg_optab, HFmode, NULL);
15409 set_optab_libfunc (sub_optab, HFmode, NULL);
15410
15411 /* Comparisons. */
15412 set_optab_libfunc (eq_optab, HFmode, NULL);
15413 set_optab_libfunc (ne_optab, HFmode, NULL);
15414 set_optab_libfunc (lt_optab, HFmode, NULL);
15415 set_optab_libfunc (le_optab, HFmode, NULL);
15416 set_optab_libfunc (ge_optab, HFmode, NULL);
15417 set_optab_libfunc (gt_optab, HFmode, NULL);
15418 set_optab_libfunc (unord_optab, HFmode, NULL);
15419 }
15420
15421 /* Target hook for c_mode_for_suffix. */
15422 static machine_mode
15423 aarch64_c_mode_for_suffix (char suffix)
15424 {
15425 if (suffix == 'q')
15426 return TFmode;
15427
15428 return VOIDmode;
15429 }
15430
15431 /* We can only represent floating point constants which will fit in
15432 "quarter-precision" values. These values are characterised by
15433 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15434 by:
15435
15436 (-1)^s * (n/16) * 2^r
15437
15438 Where:
15439 's' is the sign bit.
15440 'n' is an integer in the range 16 <= n <= 31.
15441 'r' is an integer in the range -3 <= r <= 4. */
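/* For example, 0.25 == (16/16) * 2^-2 and 31.0 == (31/16) * 2^4 are
   representable, while 0.1 is not, since it cannot be written exactly
   as (n/16) * 2^r for integers n and r.  (Examples are illustrative,
   not exhaustive.)  */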
15442
15443 /* Return true iff X can be represented as a quarter-precision
15444 floating point immediate operand. Note, we cannot represent 0.0. */
15445 bool
15446 aarch64_float_const_representable_p (rtx x)
15447 {
15448 /* This represents our current view of how many bits
15449 make up the mantissa. */
15450 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15451 int exponent;
15452 unsigned HOST_WIDE_INT mantissa, mask;
15453 REAL_VALUE_TYPE r, m;
15454 bool fail;
15455
15456 if (!CONST_DOUBLE_P (x))
15457 return false;
15458
15459 if (GET_MODE (x) == VOIDmode
15460 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15461 return false;
15462
15463 r = *CONST_DOUBLE_REAL_VALUE (x);
15464
15465 /* We cannot represent infinities, NaNs or +/-zero. We won't
15466 know if we have +zero until we analyse the mantissa, but we
15467 can reject the other invalid values. */
15468 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15469 || REAL_VALUE_MINUS_ZERO (r))
15470 return false;
15471
15472 /* Extract exponent. */
15473 r = real_value_abs (&r);
15474 exponent = REAL_EXP (&r);
15475
15476 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15477 highest (sign) bit, with a fixed binary point at bit point_pos.
15478 The low element of the result holds the low part of the mantissa, the
15478 high element the high part.
15479 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15480 bits for the mantissa, this can fail (low bits will be lost). */
15481 real_ldexp (&m, &r, point_pos - exponent);
15482 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15483
15484 /* If the low part of the mantissa has bits set we cannot represent
15485 the value. */
15486 if (w.ulow () != 0)
15487 return false;
15488 /* We have rejected the lower HOST_WIDE_INT, so update our
15489 understanding of how many bits lie in the mantissa and
15490 look only at the high HOST_WIDE_INT. */
15491 mantissa = w.elt (1);
15492 point_pos -= HOST_BITS_PER_WIDE_INT;
15493
15494 /* We can only represent values with a mantissa of the form 1.xxxx. */
15495 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15496 if ((mantissa & mask) != 0)
15497 return false;
15498
15499 /* Having filtered unrepresentable values, we may now remove all
15500 but the highest 5 bits. */
15501 mantissa >>= point_pos - 5;
15502
15503 /* We cannot represent the value 0.0, so reject it. This is handled
15504 elsewhere. */
15505 if (mantissa == 0)
15506 return false;
15507
15508 /* Then, as bit 4 is always set, we can mask it off, leaving
15509 the mantissa in the range [0, 15]. */
15510 mantissa &= ~(1 << 4);
15511 gcc_assert (mantissa <= 15);
15512
15513 /* GCC internally does not use IEEE754-like encoding (where normalized
15514 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
15515 Our mantissa values are shifted 4 places to the left relative to
15516 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15517 by 5 places to correct for GCC's representation. */
15518 exponent = 5 - exponent;
15519
15520 return (exponent >= 0 && exponent <= 7);
15521 }
15522
15523 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15524 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15525 output MOVI/MVNI, ORR or BIC immediate. */
15526 char*
15527 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15528 enum simd_immediate_check which)
15529 {
15530 bool is_valid;
15531 static char templ[40];
15532 const char *mnemonic;
15533 const char *shift_op;
15534 unsigned int lane_count = 0;
15535 char element_char;
15536
15537 struct simd_immediate_info info;
15538
15539 /* This will return true to show const_vector is legal for use as either
15540 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15541 It will also update INFO to show how the immediate should be generated.
15542 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15543 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15544 gcc_assert (is_valid);
15545
15546 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15547 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15548
15549 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15550 {
15551 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15552 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15553 move immediate path. */
15554 if (aarch64_float_const_zero_rtx_p (info.value))
15555 info.value = GEN_INT (0);
15556 else
15557 {
15558 const unsigned int buf_size = 20;
15559 char float_buf[buf_size] = {'\0'};
15560 real_to_decimal_for_mode (float_buf,
15561 CONST_DOUBLE_REAL_VALUE (info.value),
15562 buf_size, buf_size, 1, info.elt_mode);
15563
15564 if (lane_count == 1)
15565 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15566 else
15567 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15568 lane_count, element_char, float_buf);
15569 return templ;
15570 }
15571 }
15572
15573 gcc_assert (CONST_INT_P (info.value));
15574
15575 if (which == AARCH64_CHECK_MOV)
15576 {
15577 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15578 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15579 if (lane_count == 1)
15580 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15581 mnemonic, UINTVAL (info.value));
15582 else if (info.shift)
15583 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15584 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15585 element_char, UINTVAL (info.value), shift_op, info.shift);
15586 else
15587 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15588 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15589 element_char, UINTVAL (info.value));
15590 }
15591 else
15592 {
15593 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15594 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15595 if (info.shift)
15596 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15597 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15598 element_char, UINTVAL (info.value), "lsl", info.shift);
15599 else
15600 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15601 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15602 element_char, UINTVAL (info.value));
15603 }
15604 return templ;
15605 }
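/* For illustration, a V4SImode vector whose elements are all 0xab00
   would be emitted as something like "movi v0.4s, 0xab, lsl 8" after
   operand substitution, while the ORR/BIC path prints the value in
   decimal with an explicit '#'.  (Exact spelling depends on the
   operands.)  */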
15606
15607 char*
15608 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15609 {
15610
15611 /* If a floating point number was passed and we desire to use it in an
15612 integer mode, do the conversion to integer. */
15613 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15614 {
15615 unsigned HOST_WIDE_INT ival;
15616 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15617 gcc_unreachable ();
15618 immediate = gen_int_mode (ival, mode);
15619 }
15620
15621 machine_mode vmode;
15622 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15623 a 128-bit vector mode. */
15624 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15625
15626 vmode = aarch64_simd_container_mode (mode, width);
15627 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15628 return aarch64_output_simd_mov_immediate (v_op, width);
15629 }
15630
15631 /* Return the output string to use for moving immediate CONST_VECTOR
15632 into an SVE register. */
15633
15634 char *
15635 aarch64_output_sve_mov_immediate (rtx const_vector)
15636 {
15637 static char templ[40];
15638 struct simd_immediate_info info;
15639 char element_char;
15640
15641 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15642 gcc_assert (is_valid);
15643
15644 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15645
15646 if (info.step)
15647 {
15648 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15649 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15650 element_char, INTVAL (info.value), INTVAL (info.step));
15651 return templ;
15652 }
15653
15654 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15655 {
15656 if (aarch64_float_const_zero_rtx_p (info.value))
15657 info.value = GEN_INT (0);
15658 else
15659 {
15660 const int buf_size = 20;
15661 char float_buf[buf_size] = {};
15662 real_to_decimal_for_mode (float_buf,
15663 CONST_DOUBLE_REAL_VALUE (info.value),
15664 buf_size, buf_size, 1, info.elt_mode);
15665
15666 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15667 element_char, float_buf);
15668 return templ;
15669 }
15670 }
15671
15672 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15673 element_char, INTVAL (info.value));
15674 return templ;
15675 }
15676
15677 /* Return the asm format for a PTRUE instruction whose destination has
15678 mode MODE. SUFFIX is the element size suffix. */
15679
15680 char *
15681 aarch64_output_ptrue (machine_mode mode, char suffix)
15682 {
15683 unsigned int nunits;
15684 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15685 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15686 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15687 else
15688 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15689 return buf;
15690 }
15691
15692 /* Split operands into moves from op[1] + op[2] into op[0]. */
15693
15694 void
15695 aarch64_split_combinev16qi (rtx operands[3])
15696 {
15697 unsigned int dest = REGNO (operands[0]);
15698 unsigned int src1 = REGNO (operands[1]);
15699 unsigned int src2 = REGNO (operands[2]);
15700 machine_mode halfmode = GET_MODE (operands[1]);
15701 unsigned int halfregs = REG_NREGS (operands[1]);
15702 rtx destlo, desthi;
15703
15704 gcc_assert (halfmode == V16QImode);
15705
15706 if (src1 == dest && src2 == dest + halfregs)
15707 {
15708 /* No-op move. Can't split to nothing; emit something. */
15709 emit_note (NOTE_INSN_DELETED);
15710 return;
15711 }
15712
15713 /* Preserve register attributes for variable tracking. */
15714 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15715 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15716 GET_MODE_SIZE (halfmode));
15717
15718 /* Special case of reversed high/low parts. */
15719 if (reg_overlap_mentioned_p (operands[2], destlo)
15720 && reg_overlap_mentioned_p (operands[1], desthi))
15721 {
15722 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15723 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15724 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15725 }
15726 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15727 {
15728 /* Try to avoid unnecessary moves if part of the result
15729 is in the right place already. */
15730 if (src1 != dest)
15731 emit_move_insn (destlo, operands[1]);
15732 if (src2 != dest + halfregs)
15733 emit_move_insn (desthi, operands[2]);
15734 }
15735 else
15736 {
15737 if (src2 != dest + halfregs)
15738 emit_move_insn (desthi, operands[2]);
15739 if (src1 != dest)
15740 emit_move_insn (destlo, operands[1]);
15741 }
15742 }
15743
15744 /* vec_perm support. */
15745
15746 struct expand_vec_perm_d
15747 {
15748 rtx target, op0, op1;
15749 vec_perm_indices perm;
15750 machine_mode vmode;
15751 unsigned int vec_flags;
15752 bool one_vector_p;
15753 bool testing_p;
15754 };
15755
15756 /* Generate a variable permutation. */
15757
15758 static void
15759 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15760 {
15761 machine_mode vmode = GET_MODE (target);
15762 bool one_vector_p = rtx_equal_p (op0, op1);
15763
15764 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15765 gcc_checking_assert (GET_MODE (op0) == vmode);
15766 gcc_checking_assert (GET_MODE (op1) == vmode);
15767 gcc_checking_assert (GET_MODE (sel) == vmode);
15768 gcc_checking_assert (TARGET_SIMD);
15769
15770 if (one_vector_p)
15771 {
15772 if (vmode == V8QImode)
15773 {
15774 /* Expand the argument to a V16QI mode by duplicating it. */
15775 rtx pair = gen_reg_rtx (V16QImode);
15776 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15777 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15778 }
15779 else
15780 {
15781 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15782 }
15783 }
15784 else
15785 {
15786 rtx pair;
15787
15788 if (vmode == V8QImode)
15789 {
15790 pair = gen_reg_rtx (V16QImode);
15791 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15792 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15793 }
15794 else
15795 {
15796 pair = gen_reg_rtx (OImode);
15797 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15798 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15799 }
15800 }
15801 }
15802
15803 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15804 NELT is the number of elements in the vector. */
15805
15806 void
15807 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15808 unsigned int nelt)
15809 {
15810 machine_mode vmode = GET_MODE (target);
15811 bool one_vector_p = rtx_equal_p (op0, op1);
15812 rtx mask;
15813
15814 /* The TBL instruction does not use a modulo index, so we must take care
15815 of that ourselves. */
15816 mask = aarch64_simd_gen_const_vector_dup (vmode,
15817 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15818 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15819
15820 /* For big-endian, we also need to reverse the index within the vector
15821 (but not which vector). */
15822 if (BYTES_BIG_ENDIAN)
15823 {
15824 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15825 if (!one_vector_p)
15826 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15827 sel = expand_simple_binop (vmode, XOR, sel, mask,
15828 NULL, 0, OPTAB_LIB_WIDEN);
15829 }
15830 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15831 }
15832
15833 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15834
15835 static void
15836 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15837 {
15838 emit_insn (gen_rtx_SET (target,
15839 gen_rtx_UNSPEC (GET_MODE (target),
15840 gen_rtvec (2, op0, op1), code)));
15841 }
15842
15843 /* Expand an SVE vec_perm with the given operands. */
15844
15845 void
15846 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15847 {
15848 machine_mode data_mode = GET_MODE (target);
15849 machine_mode sel_mode = GET_MODE (sel);
15850 /* Enforced by the pattern condition. */
15851 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15852
15853 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15854 size of the two value vectors, i.e. the upper bits of the indices
15855 are effectively ignored. SVE TBL instead produces 0 for any
15856 out-of-range indices, so we need to modulo all the vec_perm indices
15857 to ensure they are all in range. */
15858 rtx sel_reg = force_reg (sel_mode, sel);
15859
15860 /* Check if the sel only references the first values vector. */
15861 if (GET_CODE (sel) == CONST_VECTOR
15862 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15863 {
15864 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15865 return;
15866 }
15867
15868 /* Check if the two values vectors are the same. */
15869 if (rtx_equal_p (op0, op1))
15870 {
15871 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15872 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15873 NULL, 0, OPTAB_DIRECT);
15874 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15875 return;
15876 }
15877
15878 /* Run TBL on each value vector and combine the results. */
15879
15880 rtx res0 = gen_reg_rtx (data_mode);
15881 rtx res1 = gen_reg_rtx (data_mode);
15882 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15883 if (GET_CODE (sel) != CONST_VECTOR
15884 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15885 {
15886 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15887 2 * nunits - 1);
15888 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15889 NULL, 0, OPTAB_DIRECT);
15890 }
15891 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15892 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15893 NULL, 0, OPTAB_DIRECT);
15894 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15895 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15896 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15897 else
15898 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15899 }
15900
15901 /* Recognize patterns suitable for the TRN instructions. */
15902 static bool
15903 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15904 {
15905 HOST_WIDE_INT odd;
15906 poly_uint64 nelt = d->perm.length ();
15907 rtx out, in0, in1, x;
15908 machine_mode vmode = d->vmode;
15909
15910 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15911 return false;
15912
15913 /* Note that these are little-endian tests.
15914 We correct for big-endian later. */
15915 if (!d->perm[0].is_constant (&odd)
15916 || (odd != 0 && odd != 1)
15917 || !d->perm.series_p (0, 2, odd, 2)
15918 || !d->perm.series_p (1, 2, nelt + odd, 2))
15919 return false;
15920
15921 /* Success! */
15922 if (d->testing_p)
15923 return true;
15924
15925 in0 = d->op0;
15926 in1 = d->op1;
15927 /* We don't need a big-endian lane correction for SVE; see the comment
15928 at the head of aarch64-sve.md for details. */
15929 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15930 {
15931 x = in0, in0 = in1, in1 = x;
15932 odd = !odd;
15933 }
15934 out = d->target;
15935
15936 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15937 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15938 return true;
15939 }
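/* For example, with V4SImode inputs {a0, a1, a2, a3} and {b0, b1, b2, b3},
   the permutation {0, 4, 2, 6} satisfies the series checks with ODD == 0
   and is matched as TRN1, producing {a0, b0, a2, b2}; {1, 5, 3, 7} is
   matched as TRN2.  */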
15940
15941 /* Recognize patterns suitable for the UZP instructions. */
15942 static bool
15943 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15944 {
15945 HOST_WIDE_INT odd;
15946 rtx out, in0, in1, x;
15947 machine_mode vmode = d->vmode;
15948
15949 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15950 return false;
15951
15952 /* Note that these are little-endian tests.
15953 We correct for big-endian later. */
15954 if (!d->perm[0].is_constant (&odd)
15955 || (odd != 0 && odd != 1)
15956 || !d->perm.series_p (0, 1, odd, 2))
15957 return false;
15958
15959 /* Success! */
15960 if (d->testing_p)
15961 return true;
15962
15963 in0 = d->op0;
15964 in1 = d->op1;
15965 /* We don't need a big-endian lane correction for SVE; see the comment
15966 at the head of aarch64-sve.md for details. */
15967 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15968 {
15969 x = in0, in0 = in1, in1 = x;
15970 odd = !odd;
15971 }
15972 out = d->target;
15973
15974 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15975 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15976 return true;
15977 }
15978
15979 /* Recognize patterns suitable for the ZIP instructions. */
15980 static bool
15981 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15982 {
15983 unsigned int high;
15984 poly_uint64 nelt = d->perm.length ();
15985 rtx out, in0, in1, x;
15986 machine_mode vmode = d->vmode;
15987
15988 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15989 return false;
15990
15991 /* Note that these are little-endian tests.
15992 We correct for big-endian later. */
15993 poly_uint64 first = d->perm[0];
15994 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15995 || !d->perm.series_p (0, 2, first, 1)
15996 || !d->perm.series_p (1, 2, first + nelt, 1))
15997 return false;
15998 high = maybe_ne (first, 0U);
15999
16000 /* Success! */
16001 if (d->testing_p)
16002 return true;
16003
16004 in0 = d->op0;
16005 in1 = d->op1;
16006 /* We don't need a big-endian lane correction for SVE; see the comment
16007 at the head of aarch64-sve.md for details. */
16008 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16009 {
16010 x = in0, in0 = in1, in1 = x;
16011 high = !high;
16012 }
16013 out = d->target;
16014
16015 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16016 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16017 return true;
16018 }
16019
16020 /* Recognize patterns for the EXT insn. */
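/* For example, given V4SI inputs {a0, a1, a2, a3} and {b0, b1, b2, b3},
   EXT with an element offset of 1 produces {a1, a2, a3, b0}, i.e. the
   permutation {1, 2, 3, 4}.  */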
16021
16022 static bool
16023 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16024 {
16025 HOST_WIDE_INT location;
16026 rtx offset;
16027
16028 /* The first element always refers to the first vector.
16029 Check if the extracted indices are increasing by one. */
16030 if (d->vec_flags == VEC_SVE_PRED
16031 || !d->perm[0].is_constant (&location)
16032 || !d->perm.series_p (0, 1, location, 1))
16033 return false;
16034
16035 /* Success! */
16036 if (d->testing_p)
16037 return true;
16038
16039 /* The case where (location == 0) is a no-op for both big- and little-endian,
16040 and is removed by the mid-end at optimization levels -O1 and higher.
16041
16042 We don't need a big-endian lane correction for SVE; see the comment
16043 at the head of aarch64-sve.md for details. */
16044 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16045 {
16046 /* After setup, we want the high elements of the first vector (stored
16047 at the LSB end of the register), and the low elements of the second
16048 vector (stored at the MSB end of the register). So swap. */
16049 std::swap (d->op0, d->op1);
16050 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16051 to_constant () is safe since this is restricted to Advanced SIMD
16052 vectors. */
16053 location = d->perm.length ().to_constant () - location;
16054 }
16055
16056 offset = GEN_INT (location);
16057 emit_set_insn (d->target,
16058 gen_rtx_UNSPEC (d->vmode,
16059 gen_rtvec (3, d->op0, d->op1, offset),
16060 UNSPEC_EXT));
16061 return true;
16062 }
16063
16064 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16065 within each 64-bit, 32-bit or 16-bit granule. */
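/* For example, REV32 on 16-bit elements swaps the two halves of each
   32-bit granule: {a0, a1, a2, a3, ...} becomes {a1, a0, a3, a2, ...},
   i.e. the permutation {1, 0, 3, 2, ...}.  */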
16066
16067 static bool
16068 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16069 {
16070 HOST_WIDE_INT diff;
16071 unsigned int i, size, unspec;
16072 machine_mode pred_mode;
16073
16074 if (d->vec_flags == VEC_SVE_PRED
16075 || !d->one_vector_p
16076 || !d->perm[0].is_constant (&diff))
16077 return false;
16078
16079 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16080 if (size == 8)
16081 {
16082 unspec = UNSPEC_REV64;
16083 pred_mode = VNx2BImode;
16084 }
16085 else if (size == 4)
16086 {
16087 unspec = UNSPEC_REV32;
16088 pred_mode = VNx4BImode;
16089 }
16090 else if (size == 2)
16091 {
16092 unspec = UNSPEC_REV16;
16093 pred_mode = VNx8BImode;
16094 }
16095 else
16096 return false;
16097
16098 unsigned int step = diff + 1;
16099 for (i = 0; i < step; ++i)
16100 if (!d->perm.series_p (i, step, diff - i, step))
16101 return false;
16102
16103 /* Success! */
16104 if (d->testing_p)
16105 return true;
16106
16107 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16108 if (d->vec_flags == VEC_SVE_DATA)
16109 {
16110 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16111 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16112 UNSPEC_MERGE_PTRUE);
16113 }
16114 emit_set_insn (d->target, src);
16115 return true;
16116 }
16117
16118 /* Recognize patterns for the REV insn, which reverses elements within
16119 a full vector. */
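/* For a vector of N elements this is the permutation {N-1, N-2, ..., 0};
   for SVE, N may only be known at run time, which is why the test below
   uses series_p rather than explicit constant indices.  */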
16120
16121 static bool
16122 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16123 {
16124 poly_uint64 nelt = d->perm.length ();
16125
16126 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16127 return false;
16128
16129 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16130 return false;
16131
16132 /* Success! */
16133 if (d->testing_p)
16134 return true;
16135
16136 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16137 emit_set_insn (d->target, src);
16138 return true;
16139 }
16140
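/* Recognize permutations that broadcast a single element, e.g. {2, 2, 2, 2},
   which can be implemented as a DUP of that lane (expressed here as a
   vec_duplicate of a vec_select).  */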
16141 static bool
16142 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16143 {
16144 rtx out = d->target;
16145 rtx in0;
16146 HOST_WIDE_INT elt;
16147 machine_mode vmode = d->vmode;
16148 rtx lane;
16149
16150 if (d->vec_flags == VEC_SVE_PRED
16151 || d->perm.encoding ().encoded_nelts () != 1
16152 || !d->perm[0].is_constant (&elt))
16153 return false;
16154
16155 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16156 return false;
16157
16158 /* Success! */
16159 if (d->testing_p)
16160 return true;
16161
16162 /* The generic preparation in aarch64_expand_vec_perm_const_1
16163 swaps the operand order and the permute indices if it finds
16164 d->perm[0] to be in the second operand. Thus, we can always
16165 use d->op0 and need not do any extra arithmetic to get the
16166 correct lane number. */
16167 in0 = d->op0;
16168 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16169
16170 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16171 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16172 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16173 return true;
16174 }
16175
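/* Fall back to implementing the permutation with a single Advanced SIMD TBL
   instruction, building the selector register from the constant permute
   indices.  Only V8QI and V16QI are handled here; for other modes, generic
   code retries the permutation with the elements lowered to QImode.  */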
16176 static bool
16177 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16178 {
16179 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16180 machine_mode vmode = d->vmode;
16181
16182 /* Make sure that the indices are constant. */
16183 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16184 for (unsigned int i = 0; i < encoded_nelts; ++i)
16185 if (!d->perm[i].is_constant ())
16186 return false;
16187
16188 if (d->testing_p)
16189 return true;
16190
16191 /* Generic code will try constant permutation twice: once with the
16192 original mode and again with the elements lowered to QImode.
16193 So wait and don't do the selector expansion ourselves. */
16194 if (vmode != V8QImode && vmode != V16QImode)
16195 return false;
16196
16197 /* to_constant is safe since this routine is specific to Advanced SIMD
16198 vectors. */
16199 unsigned int nelt = d->perm.length ().to_constant ();
16200 for (unsigned int i = 0; i < nelt; ++i)
16201 /* If big-endian and two vectors, we end up with a weird mixed-endian
16202 mode on NEON. Reverse the index within each word but not the word
16203 itself. to_constant is safe because we checked is_constant above. */
16204 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16205 ? d->perm[i].to_constant () ^ (nelt - 1)
16206 : d->perm[i].to_constant ());
16207
16208 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16209 sel = force_reg (vmode, sel);
16210
16211 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16212 return true;
16213 }
16214
16215 /* Try to implement D using an SVE TBL instruction. */
16216
16217 static bool
16218 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16219 {
16220 unsigned HOST_WIDE_INT nelt;
16221
16222 /* Permuting two variable-length vectors could overflow the
16223 index range. */
16224 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16225 return false;
16226
16227 if (d->testing_p)
16228 return true;
16229
16230 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16231 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16232 if (d->one_vector_p)
16233 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16234 else
16235 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16236 return true;
16237 }
16238
16239 static bool
16240 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16241 {
16242 /* The pattern matching functions above are written to look for a small
16243 number to begin the sequence (0, 1, N/2). If we begin with an index
16244 from the second operand, we can swap the operands. */
16245 poly_int64 nelt = d->perm.length ();
16246 if (known_ge (d->perm[0], nelt))
16247 {
16248 d->perm.rotate_inputs (1);
16249 std::swap (d->op0, d->op1);
16250 }
16251
16252 if ((d->vec_flags == VEC_ADVSIMD
16253 || d->vec_flags == VEC_SVE_DATA
16254 || d->vec_flags == VEC_SVE_PRED)
16255 && known_gt (nelt, 1))
16256 {
16257 if (aarch64_evpc_rev_local (d))
16258 return true;
16259 else if (aarch64_evpc_rev_global (d))
16260 return true;
16261 else if (aarch64_evpc_ext (d))
16262 return true;
16263 else if (aarch64_evpc_dup (d))
16264 return true;
16265 else if (aarch64_evpc_zip (d))
16266 return true;
16267 else if (aarch64_evpc_uzp (d))
16268 return true;
16269 else if (aarch64_evpc_trn (d))
16270 return true;
16271 if (d->vec_flags == VEC_SVE_DATA)
16272 return aarch64_evpc_sve_tbl (d);
16273 else if (d->vec_flags == VEC_ADVSIMD)
16274 return aarch64_evpc_tbl (d);
16275 }
16276 return false;
16277 }
16278
16279 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16280
16281 static bool
16282 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16283 rtx op1, const vec_perm_indices &sel)
16284 {
16285 struct expand_vec_perm_d d;
16286
16287 /* Check whether the mask can be applied to a single vector. */
16288 if (sel.ninputs () == 1
16289 || (op0 && rtx_equal_p (op0, op1)))
16290 d.one_vector_p = true;
16291 else if (sel.all_from_input_p (0))
16292 {
16293 d.one_vector_p = true;
16294 op1 = op0;
16295 }
16296 else if (sel.all_from_input_p (1))
16297 {
16298 d.one_vector_p = true;
16299 op0 = op1;
16300 }
16301 else
16302 d.one_vector_p = false;
16303
16304 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16305 sel.nelts_per_input ());
16306 d.vmode = vmode;
16307 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16308 d.target = target;
16309 d.op0 = op0;
16310 d.op1 = op1;
16311 d.testing_p = !target;
16312
16313 if (!d.testing_p)
16314 return aarch64_expand_vec_perm_const_1 (&d);
16315
16316 rtx_insn *last = get_last_insn ();
16317 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16318 gcc_assert (last == get_last_insn ());
16319
16320 return ret;
16321 }
16322
16323 /* Generate a byte permute mask for a register of mode MODE,
16324 which has NUNITS units. */
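/* For example, for V8HImode (NUNITS == 8, unit size 2) the result is the
   byte permutation {1, 0, 3, 2, 5, 4, ..., 15, 14}, which byte-swaps each
   16-bit element.  */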
16325
16326 rtx
16327 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16328 {
16329 /* We have to reverse each vector because we don't have
16330 a permuted load that can reverse-load according to ABI rules. */
16331 rtx mask;
16332 rtvec v = rtvec_alloc (16);
16333 unsigned int i, j;
16334 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16335
16336 gcc_assert (BYTES_BIG_ENDIAN);
16337 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16338
16339 for (i = 0; i < nunits; i++)
16340 for (j = 0; j < usize; j++)
16341 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16342 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16343 return force_reg (V16QImode, mask);
16344 }
16345
16346 /* Return true if X is a valid second operand for the SVE instruction
16347 that implements integer comparison OP_CODE. */
16348
16349 static bool
16350 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16351 {
16352 if (register_operand (x, VOIDmode))
16353 return true;
16354
16355 switch (op_code)
16356 {
16357 case LTU:
16358 case LEU:
16359 case GEU:
16360 case GTU:
16361 return aarch64_sve_cmp_immediate_p (x, false);
16362 case LT:
16363 case LE:
16364 case GE:
16365 case GT:
16366 case NE:
16367 case EQ:
16368 return aarch64_sve_cmp_immediate_p (x, true);
16369 default:
16370 gcc_unreachable ();
16371 }
16372 }
16373
16374 /* Use predicated SVE instructions to implement the equivalent of:
16375
16376 (set TARGET OP)
16377
16378 given that PTRUE is an all-true predicate of the appropriate mode. */
16379
16380 static void
16381 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16382 {
16383 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16384 gen_rtvec (2, ptrue, op),
16385 UNSPEC_MERGE_PTRUE);
16386 rtx_insn *insn = emit_set_insn (target, unspec);
16387 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16388 }
16389
16390 /* Likewise, but also clobber the condition codes. */
16391
16392 static void
16393 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16394 {
16395 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16396 gen_rtvec (2, ptrue, op),
16397 UNSPEC_MERGE_PTRUE);
16398 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16399 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16400 }
16401
16402 /* Return the UNSPEC_COND_* code for comparison CODE. */
16403
16404 static unsigned int
16405 aarch64_unspec_cond_code (rtx_code code)
16406 {
16407 switch (code)
16408 {
16409 case NE:
16410 return UNSPEC_COND_NE;
16411 case EQ:
16412 return UNSPEC_COND_EQ;
16413 case LT:
16414 return UNSPEC_COND_LT;
16415 case GT:
16416 return UNSPEC_COND_GT;
16417 case LE:
16418 return UNSPEC_COND_LE;
16419 case GE:
16420 return UNSPEC_COND_GE;
16421 default:
16422 gcc_unreachable ();
16423 }
16424 }
16425
16426 /* Emit:
16427
16428 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16429
16430 where <X> is the operation associated with comparison CODE. This form
16431 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16432 semantics, such as when PRED might not be all-true and when comparing
16433 inactive lanes could have side effects. */
16434
16435 static void
16436 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16437 rtx pred, rtx op0, rtx op1)
16438 {
16439 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16440 gen_rtvec (3, pred, op0, op1),
16441 aarch64_unspec_cond_code (code));
16442 emit_set_insn (target, unspec);
16443 }
16444
16445 /* Expand an SVE integer comparison using the SVE equivalent of:
16446
16447 (set TARGET (CODE OP0 OP1)). */
16448
16449 void
16450 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16451 {
16452 machine_mode pred_mode = GET_MODE (target);
16453 machine_mode data_mode = GET_MODE (op0);
16454
16455 if (!aarch64_sve_cmp_operand_p (code, op1))
16456 op1 = force_reg (data_mode, op1);
16457
16458 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16459 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16460 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16461 }
16462
16463 /* Emit the SVE equivalent of:
16464
16465 (set TMP1 (CODE1 OP0 OP1))
16466 (set TMP2 (CODE2 OP0 OP1))
16467 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16468
16469 PTRUE is an all-true predicate with the same mode as TARGET. */
16470
16471 static void
16472 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16473 rtx ptrue, rtx op0, rtx op1)
16474 {
16475 machine_mode pred_mode = GET_MODE (ptrue);
16476 rtx tmp1 = gen_reg_rtx (pred_mode);
16477 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16478 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16479 rtx tmp2 = gen_reg_rtx (pred_mode);
16480 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16481 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16482 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16483 }
16484
16485 /* Emit the SVE equivalent of:
16486
16487 (set TMP (CODE OP0 OP1))
16488 (set TARGET (not TMP))
16489
16490 PTRUE is an all-true predicate with the same mode as TARGET. */
16491
16492 static void
16493 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16494 rtx op0, rtx op1)
16495 {
16496 machine_mode pred_mode = GET_MODE (ptrue);
16497 rtx tmp = gen_reg_rtx (pred_mode);
16498 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16499 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16500 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16501 }
16502
16503 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16504
16505 (set TARGET (CODE OP0 OP1))
16506
16507 If CAN_INVERT_P is true, the caller can also handle inverted results;
16508 return true if the result is in fact inverted. */
16509
16510 bool
16511 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16512 rtx op0, rtx op1, bool can_invert_p)
16513 {
16514 machine_mode pred_mode = GET_MODE (target);
16515 machine_mode data_mode = GET_MODE (op0);
16516
16517 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16518 switch (code)
16519 {
16520 case UNORDERED:
16521 /* UNORDERED has no immediate form. */
16522 op1 = force_reg (data_mode, op1);
16523 /* fall through */
16524 case LT:
16525 case LE:
16526 case GT:
16527 case GE:
16528 case EQ:
16529 case NE:
16530 {
16531 /* There is native support for the comparison. */
16532 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16533 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16534 return false;
16535 }
16536
16537 case LTGT:
16538 /* This is a trapping operation (LT or GT). */
16539 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16540 return false;
16541
16542 case UNEQ:
16543 if (!flag_trapping_math)
16544 {
16545 /* This would trap for signaling NaNs. */
16546 op1 = force_reg (data_mode, op1);
16547 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16548 return false;
16549 }
16550 /* fall through */
16551 case UNLT:
16552 case UNLE:
16553 case UNGT:
16554 case UNGE:
16555 if (flag_trapping_math)
16556 {
16557 /* Work out which elements are ordered. */
16558 rtx ordered = gen_reg_rtx (pred_mode);
16559 op1 = force_reg (data_mode, op1);
16560 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16561
16562 /* Test the opposite condition for the ordered elements,
16563 then invert the result. */
16564 if (code == UNEQ)
16565 code = NE;
16566 else
16567 code = reverse_condition_maybe_unordered (code);
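/* For example, UNGE becomes LT here; on the ordered lanes ~LT is exactly
   GE, so inverting the predicated LT result gives unordered | GE,
   i.e. UNGE.  */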
16568 if (can_invert_p)
16569 {
16570 aarch64_emit_sve_predicated_cond (target, code,
16571 ordered, op0, op1);
16572 return true;
16573 }
16574 rtx tmp = gen_reg_rtx (pred_mode);
16575 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16576 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16577 return false;
16578 }
16579 break;
16580
16581 case ORDERED:
16582 /* ORDERED has no immediate form. */
16583 op1 = force_reg (data_mode, op1);
16584 break;
16585
16586 default:
16587 gcc_unreachable ();
16588 }
16589
16590 /* There is native support for the inverse comparison. */
16591 code = reverse_condition_maybe_unordered (code);
16592 if (can_invert_p)
16593 {
16594 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16595 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16596 return true;
16597 }
16598 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16599 return false;
16600 }
16601
16602 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16603 of the data being selected and CMP_MODE is the mode of the values being
16604 compared. */
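/* The operands follow the usual vcond layout: OPS[0] is the destination,
   OPS[1] and OPS[2] are the values to select between, OPS[3] is the
   comparison code and OPS[4] and OPS[5] are its operands, i.e.
   OPS[0] = (OPS[4] <OPS[3]> OPS[5]) ? OPS[1] : OPS[2].  */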
16605
16606 void
16607 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16608 rtx *ops)
16609 {
16610 machine_mode pred_mode
16611 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16612 GET_MODE_SIZE (cmp_mode)).require ();
16613 rtx pred = gen_reg_rtx (pred_mode);
16614 if (FLOAT_MODE_P (cmp_mode))
16615 {
16616 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16617 ops[4], ops[5], true))
16618 std::swap (ops[1], ops[2]);
16619 }
16620 else
16621 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16622
16623 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16624 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16625 }
16626
16627 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16628 true. However due to issues with register allocation it is preferable
16629 to avoid tying integer scalar and FP scalar modes. Executing integer
16630 operations in general registers is better than treating them as scalar
16631 vector operations. This reduces latency and avoids redundant int<->FP
16632 moves. So tie modes if they are either the same class, or vector modes
16633 with other vector modes, vector structs or any scalar mode. */
16634
16635 static bool
16636 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16637 {
16638 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16639 return true;
16640
16641 /* We specifically want to allow elements of "structure" modes to
16642 be tieable to the structure. This more general condition allows
16643 other rarer situations too. The reason we don't extend this to
16644 predicate modes is that there are no predicate structure modes
16645 nor any specific instructions for extracting part of a predicate
16646 register. */
16647 if (aarch64_vector_data_mode_p (mode1)
16648 && aarch64_vector_data_mode_p (mode2))
16649 return true;
16650
16651 /* Also allow any scalar modes with vectors. */
16652 if (aarch64_vector_mode_supported_p (mode1)
16653 || aarch64_vector_mode_supported_p (mode2))
16654 return true;
16655
16656 return false;
16657 }
16658
16659 /* Return a new RTX holding the result of moving POINTER forward by
16660 AMOUNT bytes. */
16661
16662 static rtx
16663 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16664 {
16665 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16666
16667 return adjust_automodify_address (pointer, GET_MODE (pointer),
16668 next, amount);
16669 }
16670
16671 /* Return a new RTX holding the result of moving POINTER forward by the
16672 size of the mode it points to. */
16673
16674 static rtx
16675 aarch64_progress_pointer (rtx pointer)
16676 {
16677 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16678 }
16679
16680 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16681 MODE bytes. */
16682
16683 static void
16684 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16685 machine_mode mode)
16686 {
16687 rtx reg = gen_reg_rtx (mode);
16688
16689 /* "Cast" the pointers to the correct mode. */
16690 *src = adjust_address (*src, mode, 0);
16691 *dst = adjust_address (*dst, mode, 0);
16692 /* Emit the memcpy. */
16693 emit_move_insn (reg, *src);
16694 emit_move_insn (*dst, reg);
16695 /* Move the pointers forward. */
16696 *src = aarch64_progress_pointer (*src);
16697 *dst = aarch64_progress_pointer (*dst);
16698 }
16699
16700 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16701 we succeed, otherwise return false. */
16702
16703 bool
16704 aarch64_expand_movmem (rtx *operands)
16705 {
16706 int n, mode_bits;
16707 rtx dst = operands[0];
16708 rtx src = operands[1];
16709 rtx base;
16710 machine_mode cur_mode = BLKmode, next_mode;
16711 bool speed_p = !optimize_function_for_size_p (cfun);
16712
16713 /* When optimizing for size, give a better estimate of the length of a
16714 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16715 will always require an even number of instructions. Each operation
16716 requires both a load and a store, so divide the max number by 2. */
16717 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16718
16719 /* We can't do anything smart if the amount to copy is not constant. */
16720 if (!CONST_INT_P (operands[2]))
16721 return false;
16722
16723 n = INTVAL (operands[2]);
16724
16725 /* Try to keep the number of instructions low. For all cases we will do at
16726 most two moves for the residual amount, since we'll always overlap the
16727 remainder. */
16728 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16729 return false;
16730
16731 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16732 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16733
16734 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16735 src = adjust_automodify_address (src, VOIDmode, base, 0);
16736
16737 /* Convert n to bits to make the rest of the code simpler. */
16738 n = n * BITS_PER_UNIT;
16739
16740 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16741 larger than TImode, but we should not use them for loads/stores here. */
16742 const int copy_limit = GET_MODE_BITSIZE (TImode);
16743
16744 while (n > 0)
16745 {
16746 /* Find the largest mode in which to do the copy without over-reading
16747 or over-writing. */
16748 opt_scalar_int_mode mode_iter;
16749 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16750 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16751 cur_mode = mode_iter.require ();
16752
16753 gcc_assert (cur_mode != BLKmode);
16754
16755 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16756 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16757
16758 n -= mode_bits;
16759
16760 /* Do certain trailing copies as overlapping if it's going to be
16761 cheaper, i.e. fewer instructions. For instance, for a 15-byte
16762 copy it's more efficient to do two overlapping 8-byte copies than
16763 8 + 6 + 1. */
16764 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16765 {
16766 next_mode = smallest_mode_for_size (n, MODE_INT);
16767 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16768 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16769 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16770 n = n_bits;
16771 }
16772 }
16773
16774 return true;
16775 }
16776
16777 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16778 SImode stores. Handle the case when the constant has identical
16779 bottom and top halves. This is beneficial when the two stores can be
16780 merged into an STP and we avoid synthesising potentially expensive
16781 immediates twice. Return true if such a split is possible. */
16782
16783 bool
16784 aarch64_split_dimode_const_store (rtx dst, rtx src)
16785 {
16786 rtx lo = gen_lowpart (SImode, src);
16787 rtx hi = gen_highpart_mode (SImode, DImode, src);
16788
16789 bool size_p = optimize_function_for_size_p (cfun);
16790
16791 if (!rtx_equal_p (lo, hi))
16792 return false;
16793
16794 unsigned int orig_cost
16795 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16796 unsigned int lo_cost
16797 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16798
16799 /* We want to transform:
16800 MOV x1, 49370
16801 MOVK x1, 0x140, lsl 16
16802 MOVK x1, 0xc0da, lsl 32
16803 MOVK x1, 0x140, lsl 48
16804 STR x1, [x0]
16805 into:
16806 MOV w1, 49370
16807 MOVK w1, 0x140, lsl 16
16808 STP w1, w1, [x0]
16809 So we want to perform this only when we save two instructions
16810 or more. When optimizing for size, however, accept any code size
16811 savings we can. */
16812 if (size_p && orig_cost <= lo_cost)
16813 return false;
16814
16815 if (!size_p
16816 && (orig_cost <= lo_cost + 1))
16817 return false;
16818
16819 rtx mem_lo = adjust_address (dst, SImode, 0);
16820 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16821 return false;
16822
16823 rtx tmp_reg = gen_reg_rtx (SImode);
16824 aarch64_expand_mov_immediate (tmp_reg, lo);
16825 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16826 /* Don't emit an explicit store pair as this may not be always profitable.
16827 Let the sched-fusion logic decide whether to merge them. */
16828 emit_move_insn (mem_lo, tmp_reg);
16829 emit_move_insn (mem_hi, tmp_reg);
16830
16831 return true;
16832 }
16833
16834 /* Generate RTL for a conditional branch with rtx comparison CODE in
16835 mode CC_MODE. The destination of the unlikely conditional branch
16836 is LABEL_REF. */
16837
16838 void
16839 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16840 rtx label_ref)
16841 {
16842 rtx x;
16843 x = gen_rtx_fmt_ee (code, VOIDmode,
16844 gen_rtx_REG (cc_mode, CC_REGNUM),
16845 const0_rtx);
16846
16847 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16848 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16849 pc_rtx);
16850 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16851 }
16852
16853 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16854
16855 OP1 represents the TImode destination operand 1
16856 OP2 represents the TImode destination operand 2
16857 LOW_DEST represents the low half (DImode) of TImode operand 0
16858 LOW_IN1 represents the low half (DImode) of TImode operand 1
16859 LOW_IN2 represents the low half (DImode) of TImode operand 2
16860 HIGH_DEST represents the high half (DImode) of TImode operand 0
16861 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16862 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16863
16864 void
16865 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16866 rtx *low_in1, rtx *low_in2,
16867 rtx *high_dest, rtx *high_in1,
16868 rtx *high_in2)
16869 {
16870 *low_dest = gen_reg_rtx (DImode);
16871 *low_in1 = gen_lowpart (DImode, op1);
16872 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16873 subreg_lowpart_offset (DImode, TImode));
16874 *high_dest = gen_reg_rtx (DImode);
16875 *high_in1 = gen_highpart (DImode, op1);
16876 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16877 subreg_highpart_offset (DImode, TImode));
16878 }
16879
16880 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16881
16882 This function differs from 'aarch64_addti_scratch_regs' in that
16883 OP1 can be an immediate constant (zero). We must call
16884 subreg_highpart_offset with DImode and TImode arguments, otherwise
16885 VOIDmode will be used for the const_int, which generates an internal
16886 error from subreg_size_highpart_offset, which does not expect a size of zero.
16887
16888 OP1 represents the TImode destination operand 1
16889 OP2 represents the TImode destination operand 2
16890 LOW_DEST represents the low half (DImode) of TImode operand 0
16891 LOW_IN1 represents the low half (DImode) of TImode operand 1
16892 LOW_IN2 represents the low half (DImode) of TImode operand 2
16893 HIGH_DEST represents the high half (DImode) of TImode operand 0
16894 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16895 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16896
16897
16898 void
16899 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16900 rtx *low_in1, rtx *low_in2,
16901 rtx *high_dest, rtx *high_in1,
16902 rtx *high_in2)
16903 {
16904 *low_dest = gen_reg_rtx (DImode);
16905 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16906 subreg_lowpart_offset (DImode, TImode));
16907
16908 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16909 subreg_lowpart_offset (DImode, TImode));
16910 *high_dest = gen_reg_rtx (DImode);
16911
16912 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16913 subreg_highpart_offset (DImode, TImode));
16914 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16915 subreg_highpart_offset (DImode, TImode));
16916 }
16917
16918 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16919
16920 OP0 represents the TImode destination operand 0
16921 LOW_DEST represents the low half (DImode) of TImode operand 0
16922 LOW_IN1 represents the low half (DImode) of TImode operand 1
16923 LOW_IN2 represents the low half (DImode) of TImode operand 2
16924 HIGH_DEST represents the high half (DImode) of TImode operand 0
16925 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16926 HIGH_IN2 represents the high half (DImode) of TImode operand 2
16927 UNSIGNED_P is true if the operation is being performed on unsigned
16928 values. */
16929 void
16930 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16931 rtx low_in2, rtx high_dest, rtx high_in1,
16932 rtx high_in2, bool unsigned_p)
16933 {
16934 if (low_in2 == const0_rtx)
16935 {
16936 low_dest = low_in1;
16937 high_in2 = force_reg (DImode, high_in2);
16938 if (unsigned_p)
16939 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
16940 else
16941 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
16942 }
16943 else
16944 {
16945 if (CONST_INT_P (low_in2))
16946 {
16947 high_in2 = force_reg (DImode, high_in2);
16948 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
16949 GEN_INT (-INTVAL (low_in2))));
16950 }
16951 else
16952 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16953
16954 if (unsigned_p)
16955 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
16956 else
16957 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
16958 }
16959
16960 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16961 emit_move_insn (gen_highpart (DImode, op0), high_dest);
16962
16963 }
16964
16965 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
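/* Shadow memory for an address A lives at (A >> 3) plus this offset,
   i.e. (A >> 3) + (1 << 36) on AArch64.  */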
16966
16967 static unsigned HOST_WIDE_INT
16968 aarch64_asan_shadow_offset (void)
16969 {
16970 return (HOST_WIDE_INT_1 << 36);
16971 }
16972
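/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare chain, e.g. the initial CMP/FCMP for a condition such
   as (a < b && c != d); aarch64_gen_ccmp_next then chains further
   CCMP/FCCMP comparisons onto the result.  Return the comparison of the CC
   register against zero, or NULL_RTX if the sequence cannot be generated.  */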
16973 static rtx
16974 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16975 int code, tree treeop0, tree treeop1)
16976 {
16977 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16978 rtx op0, op1;
16979 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16980 insn_code icode;
16981 struct expand_operand ops[4];
16982
16983 start_sequence ();
16984 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16985
16986 op_mode = GET_MODE (op0);
16987 if (op_mode == VOIDmode)
16988 op_mode = GET_MODE (op1);
16989
16990 switch (op_mode)
16991 {
16992 case E_QImode:
16993 case E_HImode:
16994 case E_SImode:
16995 cmp_mode = SImode;
16996 icode = CODE_FOR_cmpsi;
16997 break;
16998
16999 case E_DImode:
17000 cmp_mode = DImode;
17001 icode = CODE_FOR_cmpdi;
17002 break;
17003
17004 case E_SFmode:
17005 cmp_mode = SFmode;
17006 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17007 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17008 break;
17009
17010 case E_DFmode:
17011 cmp_mode = DFmode;
17012 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17013 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17014 break;
17015
17016 default:
17017 end_sequence ();
17018 return NULL_RTX;
17019 }
17020
17021 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17022 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17023 if (!op0 || !op1)
17024 {
17025 end_sequence ();
17026 return NULL_RTX;
17027 }
17028 *prep_seq = get_insns ();
17029 end_sequence ();
17030
17031 create_fixed_operand (&ops[0], op0);
17032 create_fixed_operand (&ops[1], op1);
17033
17034 start_sequence ();
17035 if (!maybe_expand_insn (icode, 2, ops))
17036 {
17037 end_sequence ();
17038 return NULL_RTX;
17039 }
17040 *gen_seq = get_insns ();
17041 end_sequence ();
17042
17043 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17044 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17045 }
17046
17047 static rtx
17048 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17049 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17050 {
17051 rtx op0, op1, target;
17052 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17053 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17054 insn_code icode;
17055 struct expand_operand ops[6];
17056 int aarch64_cond;
17057
17058 push_to_sequence (*prep_seq);
17059 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17060
17061 op_mode = GET_MODE (op0);
17062 if (op_mode == VOIDmode)
17063 op_mode = GET_MODE (op1);
17064
17065 switch (op_mode)
17066 {
17067 case E_QImode:
17068 case E_HImode:
17069 case E_SImode:
17070 cmp_mode = SImode;
17071 icode = CODE_FOR_ccmpsi;
17072 break;
17073
17074 case E_DImode:
17075 cmp_mode = DImode;
17076 icode = CODE_FOR_ccmpdi;
17077 break;
17078
17079 case E_SFmode:
17080 cmp_mode = SFmode;
17081 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17082 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17083 break;
17084
17085 case E_DFmode:
17086 cmp_mode = DFmode;
17087 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17088 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17089 break;
17090
17091 default:
17092 end_sequence ();
17093 return NULL_RTX;
17094 }
17095
17096 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17097 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17098 if (!op0 || !op1)
17099 {
17100 end_sequence ();
17101 return NULL_RTX;
17102 }
17103 *prep_seq = get_insns ();
17104 end_sequence ();
17105
17106 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17107 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17108
17109 if (bit_code != AND)
17110 {
17111 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17112 GET_MODE (XEXP (prev, 0))),
17113 VOIDmode, XEXP (prev, 0), const0_rtx);
17114 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17115 }
17116
17117 create_fixed_operand (&ops[0], XEXP (prev, 0));
17118 create_fixed_operand (&ops[1], target);
17119 create_fixed_operand (&ops[2], op0);
17120 create_fixed_operand (&ops[3], op1);
17121 create_fixed_operand (&ops[4], prev);
17122 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17123
17124 push_to_sequence (*gen_seq);
17125 if (!maybe_expand_insn (icode, 6, ops))
17126 {
17127 end_sequence ();
17128 return NULL_RTX;
17129 }
17130
17131 *gen_seq = get_insns ();
17132 end_sequence ();
17133
17134 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17135 }
17136
17137 #undef TARGET_GEN_CCMP_FIRST
17138 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17139
17140 #undef TARGET_GEN_CCMP_NEXT
17141 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17142
17143 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17144 instruction fusion of some sort. */
17145
17146 static bool
17147 aarch64_macro_fusion_p (void)
17148 {
17149 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17150 }
17151
17152
17153 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17154 should be kept together during scheduling. */
17155
17156 static bool
17157 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17158 {
17159 rtx set_dest;
17160 rtx prev_set = single_set (prev);
17161 rtx curr_set = single_set (curr);
17162 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17163 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17164
17165 if (!aarch64_macro_fusion_p ())
17166 return false;
17167
17168 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17169 {
17170 /* We are trying to match:
17171 prev (mov) == (set (reg r0) (const_int imm16))
17172 curr (movk) == (set (zero_extract (reg r0)
17173 (const_int 16)
17174 (const_int 16))
17175 (const_int imm16_1)) */
17176
17177 set_dest = SET_DEST (curr_set);
17178
17179 if (GET_CODE (set_dest) == ZERO_EXTRACT
17180 && CONST_INT_P (SET_SRC (curr_set))
17181 && CONST_INT_P (SET_SRC (prev_set))
17182 && CONST_INT_P (XEXP (set_dest, 2))
17183 && INTVAL (XEXP (set_dest, 2)) == 16
17184 && REG_P (XEXP (set_dest, 0))
17185 && REG_P (SET_DEST (prev_set))
17186 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17187 {
17188 return true;
17189 }
17190 }
17191
17192 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17193 {
17194
17195 /* We're trying to match:
17196 prev (adrp) == (set (reg r1)
17197 (high (symbol_ref ("SYM"))))
17198 curr (add) == (set (reg r0)
17199 (lo_sum (reg r1)
17200 (symbol_ref ("SYM"))))
17201 Note that r0 need not necessarily be the same as r1, especially
17202 during pre-regalloc scheduling. */
17203
17204 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17205 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17206 {
17207 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17208 && REG_P (XEXP (SET_SRC (curr_set), 0))
17209 && REGNO (XEXP (SET_SRC (curr_set), 0))
17210 == REGNO (SET_DEST (prev_set))
17211 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17212 XEXP (SET_SRC (curr_set), 1)))
17213 return true;
17214 }
17215 }
17216
17217 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17218 {
17219
17220 /* We're trying to match:
17221 prev (movk) == (set (zero_extract (reg r0)
17222 (const_int 16)
17223 (const_int 32))
17224 (const_int imm16_1))
17225 curr (movk) == (set (zero_extract (reg r0)
17226 (const_int 16)
17227 (const_int 48))
17228 (const_int imm16_2)) */
17229
17230 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17231 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17232 && REG_P (XEXP (SET_DEST (prev_set), 0))
17233 && REG_P (XEXP (SET_DEST (curr_set), 0))
17234 && REGNO (XEXP (SET_DEST (prev_set), 0))
17235 == REGNO (XEXP (SET_DEST (curr_set), 0))
17236 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17237 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17238 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17239 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17240 && CONST_INT_P (SET_SRC (prev_set))
17241 && CONST_INT_P (SET_SRC (curr_set)))
17242 return true;
17243
17244 }
17245 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17246 {
17247 /* We're trying to match:
17248 prev (adrp) == (set (reg r0)
17249 (high (symbol_ref ("SYM"))))
17250 curr (ldr) == (set (reg r1)
17251 (mem (lo_sum (reg r0)
17252 (symbol_ref ("SYM")))))
17253 or
17254 curr (ldr) == (set (reg r1)
17255 (zero_extend (mem
17256 (lo_sum (reg r0)
17257 (symbol_ref ("SYM")))))) */
17258 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17259 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17260 {
17261 rtx curr_src = SET_SRC (curr_set);
17262
17263 if (GET_CODE (curr_src) == ZERO_EXTEND)
17264 curr_src = XEXP (curr_src, 0);
17265
17266 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17267 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17268 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17269 == REGNO (SET_DEST (prev_set))
17270 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17271 XEXP (SET_SRC (prev_set), 0)))
17272 return true;
17273 }
17274 }
17275
17276 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17277 && aarch_crypto_can_dual_issue (prev, curr))
17278 return true;
17279
17280 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17281 && any_condjump_p (curr))
17282 {
17283 unsigned int condreg1, condreg2;
17284 rtx cc_reg_1;
17285 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17286 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17287
17288 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17289 && prev
17290 && modified_in_p (cc_reg_1, prev))
17291 {
17292 enum attr_type prev_type = get_attr_type (prev);
17293
17294 /* FIXME: this misses some instructions that ThunderX considers simple
17295 arithmetic; simple shifts are also missed here. */
17296 if (prev_type == TYPE_ALUS_SREG
17297 || prev_type == TYPE_ALUS_IMM
17298 || prev_type == TYPE_LOGICS_REG
17299 || prev_type == TYPE_LOGICS_IMM)
17300 return true;
17301 }
17302 }
17303
17304 if (prev_set
17305 && curr_set
17306 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17307 && any_condjump_p (curr))
17308 {
17309 /* We're trying to match:
17310 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17311 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17312 (const_int 0))
17313 (label_ref ("SYM"))
17314 (pc)) */
17315 if (SET_DEST (curr_set) == (pc_rtx)
17316 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17317 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17318 && REG_P (SET_DEST (prev_set))
17319 && REGNO (SET_DEST (prev_set))
17320 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17321 {
17322 /* Fuse ALU operations followed by conditional branch instruction. */
17323 switch (get_attr_type (prev))
17324 {
17325 case TYPE_ALU_IMM:
17326 case TYPE_ALU_SREG:
17327 case TYPE_ADC_REG:
17328 case TYPE_ADC_IMM:
17329 case TYPE_ADCS_REG:
17330 case TYPE_ADCS_IMM:
17331 case TYPE_LOGIC_REG:
17332 case TYPE_LOGIC_IMM:
17333 case TYPE_CSEL:
17334 case TYPE_ADR:
17335 case TYPE_MOV_IMM:
17336 case TYPE_SHIFT_REG:
17337 case TYPE_SHIFT_IMM:
17338 case TYPE_BFM:
17339 case TYPE_RBIT:
17340 case TYPE_REV:
17341 case TYPE_EXTEND:
17342 return true;
17343
17344 default:;
17345 }
17346 }
17347 }
17348
17349 return false;
17350 }
17351
17352 /* Return true iff the instruction fusion described by OP is enabled. */
17353
17354 bool
17355 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17356 {
17357 return (aarch64_tune_params.fusible_ops & op) != 0;
17358 }
17359
17360 /* If MEM is in the form of [base+offset], extract the two parts
17361 of address and set to BASE and OFFSET, otherwise return false
17362 after clearing BASE and OFFSET. */
17363
17364 bool
17365 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17366 {
17367 rtx addr;
17368
17369 gcc_assert (MEM_P (mem));
17370
17371 addr = XEXP (mem, 0);
17372
17373 if (REG_P (addr))
17374 {
17375 *base = addr;
17376 *offset = const0_rtx;
17377 return true;
17378 }
17379
17380 if (GET_CODE (addr) == PLUS
17381 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17382 {
17383 *base = XEXP (addr, 0);
17384 *offset = XEXP (addr, 1);
17385 return true;
17386 }
17387
17388 *base = NULL_RTX;
17389 *offset = NULL_RTX;
17390
17391 return false;
17392 }
17393
17394 /* Types for scheduling fusion. */
17395 enum sched_fusion_type
17396 {
17397 SCHED_FUSION_NONE = 0,
17398 SCHED_FUSION_LD_SIGN_EXTEND,
17399 SCHED_FUSION_LD_ZERO_EXTEND,
17400 SCHED_FUSION_LD,
17401 SCHED_FUSION_ST,
17402 SCHED_FUSION_NUM
17403 };
17404
17405 /* If INSN is a load or store of address in the form of [base+offset],
17406 extract the two parts and set to BASE and OFFSET. Return scheduling
17407 fusion type this INSN is. */
17408
17409 static enum sched_fusion_type
17410 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17411 {
17412 rtx x, dest, src;
17413 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17414
17415 gcc_assert (INSN_P (insn));
17416 x = PATTERN (insn);
17417 if (GET_CODE (x) != SET)
17418 return SCHED_FUSION_NONE;
17419
17420 src = SET_SRC (x);
17421 dest = SET_DEST (x);
17422
17423 machine_mode dest_mode = GET_MODE (dest);
17424
17425 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17426 return SCHED_FUSION_NONE;
17427
17428 if (GET_CODE (src) == SIGN_EXTEND)
17429 {
17430 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17431 src = XEXP (src, 0);
17432 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17433 return SCHED_FUSION_NONE;
17434 }
17435 else if (GET_CODE (src) == ZERO_EXTEND)
17436 {
17437 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17438 src = XEXP (src, 0);
17439 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17440 return SCHED_FUSION_NONE;
17441 }
17442
17443 if (GET_CODE (src) == MEM && REG_P (dest))
17444 extract_base_offset_in_addr (src, base, offset);
17445 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17446 {
17447 fusion = SCHED_FUSION_ST;
17448 extract_base_offset_in_addr (dest, base, offset);
17449 }
17450 else
17451 return SCHED_FUSION_NONE;
17452
17453 if (*base == NULL_RTX || *offset == NULL_RTX)
17454 fusion = SCHED_FUSION_NONE;
17455
17456 return fusion;
17457 }
17458
17459 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17460
17461 Currently we only support fusing ldr and str instructions, so FUSION_PRI
17462 and PRI are only calculated for these instructions. For other instructions,
17463 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
17464 other instruction types can be added by returning different priorities.
17465
17466 It's important that irrelevant instructions get the largest FUSION_PRI. */
17467
17468 static void
17469 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17470 int *fusion_pri, int *pri)
17471 {
17472 int tmp, off_val;
17473 rtx base, offset;
17474 enum sched_fusion_type fusion;
17475
17476 gcc_assert (INSN_P (insn));
17477
17478 tmp = max_pri - 1;
17479 fusion = fusion_load_store (insn, &base, &offset);
17480 if (fusion == SCHED_FUSION_NONE)
17481 {
17482 *pri = tmp;
17483 *fusion_pri = tmp;
17484 return;
17485 }
17486
17487 /* Set FUSION_PRI according to fusion type and base register. */
17488 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17489
17490 /* Calculate PRI. */
17491 tmp /= 2;
17492
17493 /* INSN with smaller offset goes first. */
17494 off_val = (int)(INTVAL (offset));
17495 if (off_val >= 0)
17496 tmp -= (off_val & 0xfffff);
17497 else
17498 tmp += ((- off_val) & 0xfffff);
17499
17500 *pri = tmp;
17501 return;
17502 }
17503
17504 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17505 Adjust priority of sha1h instructions so they are scheduled before
17506 other SHA1 instructions. */
17507
17508 static int
17509 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17510 {
17511 rtx x = PATTERN (insn);
17512
17513 if (GET_CODE (x) == SET)
17514 {
17515 x = SET_SRC (x);
17516
17517 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17518 return priority + 10;
17519 }
17520
17521 return priority;
17522 }
17523
17524 /* Given OPERANDS of consecutive load/store, check if we can merge
17525 them into ldp/stp. LOAD is true if they are load instructions.
17526 MODE is the mode of memory operands. */
17527
17528 bool
17529 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17530 machine_mode mode)
17531 {
17532 HOST_WIDE_INT offval_1, offval_2, msize;
17533 enum reg_class rclass_1, rclass_2;
17534 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17535
17536 if (load)
17537 {
17538 mem_1 = operands[1];
17539 mem_2 = operands[3];
17540 reg_1 = operands[0];
17541 reg_2 = operands[2];
17542 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17543 if (REGNO (reg_1) == REGNO (reg_2))
17544 return false;
17545 }
17546 else
17547 {
17548 mem_1 = operands[0];
17549 mem_2 = operands[2];
17550 reg_1 = operands[1];
17551 reg_2 = operands[3];
17552 }
17553
17554 /* The mems cannot be volatile. */
17555 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17556 return false;
17557
17558 /* If we have SImode and slow unaligned ldp,
17559 check that the alignment is at least 8 bytes. */
17560 if (mode == SImode
17561 && (aarch64_tune_params.extra_tuning_flags
17562 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17563 && !optimize_size
17564 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17565 return false;
17566
17567 /* Check if the addresses are in the form of [base+offset]. */
17568 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17569 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17570 return false;
17571 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17572 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17573 return false;
17574
17575 /* Check if the bases are the same. */
17576 if (!rtx_equal_p (base_1, base_2))
17577 return false;
17578
17579 /* The operands must be of the same size. */
17580 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17581 GET_MODE_SIZE (GET_MODE (mem_2))));
17582
17583 offval_1 = INTVAL (offset_1);
17584 offval_2 = INTVAL (offset_2);
17585 /* We should only be trying this for fixed-sized modes. There is no
17586 SVE LDP/STP instruction. */
17587 msize = GET_MODE_SIZE (mode).to_constant ();
17588 /* Check if the offsets are consecutive. */
17589 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17590 return false;
17591
17592 /* Check if the addresses are clobbered by load. */
17593 if (load)
17594 {
17595 if (reg_mentioned_p (reg_1, mem_1))
17596 return false;
17597
17598 /* In increasing order, the last load can clobber the address. */
17599 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17600 return false;
17601 }
17602
17603 /* One of the memory accesses must be a mempair operand.
17604 If it is not the first one, they need to be swapped by the
17605 peephole. */
17606 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17607 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17608 return false;
17609
17610 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17611 rclass_1 = FP_REGS;
17612 else
17613 rclass_1 = GENERAL_REGS;
17614
17615 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17616 rclass_2 = FP_REGS;
17617 else
17618 rclass_2 = GENERAL_REGS;
17619
17620 /* Check if the registers are of the same class. */
17621 if (rclass_1 != rclass_2)
17622 return false;
17623
17624 return true;
17625 }
17626
17627 /* Given OPERANDS of consecutive load/store that can be merged,
17628 swap them if they are not in ascending order. */
17629 void
17630 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17631 {
17632 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17633 HOST_WIDE_INT offval_1, offval_2;
17634
17635 if (load)
17636 {
17637 mem_1 = operands[1];
17638 mem_2 = operands[3];
17639 }
17640 else
17641 {
17642 mem_1 = operands[0];
17643 mem_2 = operands[2];
17644 }
17645
17646 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17647 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17648
17649 offval_1 = INTVAL (offset_1);
17650 offval_2 = INTVAL (offset_2);
17651
17652 if (offval_1 > offval_2)
17653 {
17654 /* Irrespective of whether this is a load or a store,
17655 we do the same swap. */
17656 std::swap (operands[0], operands[2]);
17657 std::swap (operands[1], operands[3]);
17658 }
17659 }
17660
17661 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17662 comparison between the two. */
17663 int
17664 aarch64_host_wide_int_compare (const void *x, const void *y)
17665 {
17666 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17667 * ((const HOST_WIDE_INT *) y));
17668 }
17669
17670 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17671 other pointing to a REG rtx containing an offset, compare the offsets
17672 of the two pairs.
17673
17674 Return:
17675
17676 1 iff offset (X) > offset (Y)
17677 0 iff offset (X) == offset (Y)
17678 -1 iff offset (X) < offset (Y) */
17679 int
17680 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17681 {
17682 const rtx * operands_1 = (const rtx *) x;
17683 const rtx * operands_2 = (const rtx *) y;
17684 rtx mem_1, mem_2, base, offset_1, offset_2;
17685
17686 if (MEM_P (operands_1[0]))
17687 mem_1 = operands_1[0];
17688 else
17689 mem_1 = operands_1[1];
17690
17691 if (MEM_P (operands_2[0]))
17692 mem_2 = operands_2[0];
17693 else
17694 mem_2 = operands_2[1];
17695
17696 /* Extract the offsets. */
17697 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17698 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17699
17700 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17701
17702 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17703 }
17704
17705 /* Given OPERANDS of consecutive load/store, check if we can merge
17706 them into ldp/stp by adjusting the offset. LOAD is true if they
17707 are load instructions. MODE is the mode of memory operands.
17708
17709 Given below consecutive stores:
17710
17711 str w1, [xb, 0x100]
17712 str w1, [xb, 0x104]
17713 str w1, [xb, 0x108]
17714 str w1, [xb, 0x10c]
17715
17716 Though the offsets are out of the range supported by stp, we can
17717 still pair them after adjusting the offset, like:
17718
17719 add scratch, xb, 0x100
17720 stp w1, w1, [scratch]
17721 stp w1, w1, [scratch, 0x8]
17722
17723 The peephole patterns detecting this opportunity should guarantee
17724 the scratch register is available. */
17725
17726 bool
17727 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17728 scalar_mode mode)
17729 {
17730 const int num_insns = 4;
17731 enum reg_class rclass;
17732 HOST_WIDE_INT offvals[num_insns], msize;
17733 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17734
17735 if (load)
17736 {
17737 for (int i = 0; i < num_insns; i++)
17738 {
17739 reg[i] = operands[2 * i];
17740 mem[i] = operands[2 * i + 1];
17741
17742 gcc_assert (REG_P (reg[i]));
17743 }
17744
17745 /* Do not attempt to merge the loads if the loads clobber each other. */
17746 for (int i = 0; i < 8; i += 2)
17747 for (int j = i + 2; j < 8; j += 2)
17748 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17749 return false;
17750 }
17751 else
17752 for (int i = 0; i < num_insns; i++)
17753 {
17754 mem[i] = operands[2 * i];
17755 reg[i] = operands[2 * i + 1];
17756 }
17757
17758 /* Skip if memory operand is by itself valid for ldp/stp. */
17759 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17760 return false;
17761
17762 for (int i = 0; i < num_insns; i++)
17763 {
17764 /* The mems cannot be volatile. */
17765 if (MEM_VOLATILE_P (mem[i]))
17766 return false;
17767
17768 /* Check if the addresses are in the form of [base+offset]. */
17769 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17770 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17771 return false;
17772 }
17773
17774 /* Check if the registers are of the same class. */
17775 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17776 ? FP_REGS : GENERAL_REGS;
17777
17778 for (int i = 1; i < num_insns; i++)
17779 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17780 {
17781 if (rclass != FP_REGS)
17782 return false;
17783 }
17784 else
17785 {
17786 if (rclass != GENERAL_REGS)
17787 return false;
17788 }
17789
17790 /* Only the last register in the order in which they occur
17791 may be clobbered by the load. */
17792 if (rclass == GENERAL_REGS && load)
17793 for (int i = 0; i < num_insns - 1; i++)
17794 if (reg_mentioned_p (reg[i], mem[i]))
17795 return false;
17796
17797 /* Check if the bases are the same. */
17798 for (int i = 0; i < num_insns - 1; i++)
17799 if (!rtx_equal_p (base[i], base[i + 1]))
17800 return false;
17801
17802 for (int i = 0; i < num_insns; i++)
17803 offvals[i] = INTVAL (offset[i]);
17804
17805 msize = GET_MODE_SIZE (mode);
17806
17807 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17808 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17809 aarch64_host_wide_int_compare);
17810
17811 if (!(offvals[1] == offvals[0] + msize
17812 && offvals[3] == offvals[2] + msize))
17813 return false;
17814
17815 /* Check that the offsets are within range of each other.  The ldp/stp
17816 immediate offset is a signed 7-bit multiple of msize, hence the 0x80 * msize bound. */
17817 if (offvals[2] - offvals[0] >= msize * 0x80)
17818 return false;
17819
17820 /* The offsets must be aligned with respect to each other. */
17821 if (offvals[0] % msize != offvals[2] % msize)
17822 return false;
17823
17824 /* If we have SImode and slow unaligned ldp,
17825 check that the alignment is at least 8 bytes. */
17826 if (mode == SImode
17827 && (aarch64_tune_params.extra_tuning_flags
17828 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17829 && !optimize_size
17830 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17831 return false;
17832
17833 return true;
17834 }
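
/* A worked example of the checks above, using the four "str w1, [xb, ...]"
   stores from the comment before this function: msize is 4 for SImode and
   the sorted offsets are 0x100, 0x104, 0x108 and 0x10c, so each pair
   member follows its predecessor by msize, the span 0x108 - 0x100 = 8 is
   well below msize * 0x80 = 512, and 0x100 % 4 == 0x108 % 4.  The group is
   therefore accepted, assuming the volatility and register-class checks
   also pass.  */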
17835
17836 /* Given OPERANDS of consecutive load/store, this function pairs them
17837 into LDP/STP after adjusting the offset. It depends on the fact
17838 that the operands can be sorted so the offsets are correct for STP.
17839 MODE is the mode of memory operands. CODE is the rtl operator
17840 which should be applied to all memory operands; it is SIGN_EXTEND,
17841 ZERO_EXTEND or UNKNOWN. */
17842
17843 bool
17844 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17845 scalar_mode mode, RTX_CODE code)
17846 {
17847 rtx base, offset_1, offset_3, t1, t2;
17848 rtx mem_1, mem_2, mem_3, mem_4;
17849 rtx temp_operands[8];
17850 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17851 stp_off_upper_limit, stp_off_lower_limit, msize;
17852
17853 /* We make changes on a copy as we may still bail out. */
17854 for (int i = 0; i < 8; i ++)
17855 temp_operands[i] = operands[i];
17856
17857 /* Sort the operands. */
17858 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17859
17860 if (load)
17861 {
17862 mem_1 = temp_operands[1];
17863 mem_2 = temp_operands[3];
17864 mem_3 = temp_operands[5];
17865 mem_4 = temp_operands[7];
17866 }
17867 else
17868 {
17869 mem_1 = temp_operands[0];
17870 mem_2 = temp_operands[2];
17871 mem_3 = temp_operands[4];
17872 mem_4 = temp_operands[6];
17873 gcc_assert (code == UNKNOWN);
17874 }
17875
17876 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17877 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17878 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17879 && offset_3 != NULL_RTX);
17880
17881 /* Adjust the offset so it can fit in an LDP/STP instruction. */
17882 msize = GET_MODE_SIZE (mode);
17883 stp_off_upper_limit = msize * (0x40 - 1);
17884 stp_off_lower_limit = - msize * 0x40;
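/* For example, with msize == 4 (w registers) the reachable immediate range
   is [-256, 252]; with msize == 8 (x or d registers) it is [-512, 504].  */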
17885
17886 off_val_1 = INTVAL (offset_1);
17887 off_val_3 = INTVAL (offset_3);
17888
17889 /* The base offset is optimally halfway between the two STP/LDP offsets. */
17890 if (msize <= 4)
17891 base_off = (off_val_1 + off_val_3) / 2;
17892 else
17893 /* However, due to issues with negative LDP/STP offset generation for
17894 larger modes (DF, DI and vector modes), we must not use negative
17895 addresses smaller than 9 signed unadjusted bits can store.  Using the
17896 lower offset as the base provides the most range in this case. */
17897 base_off = off_val_1;
17898
17899 /* Adjust the base so that it is aligned with the addresses but still
17900 optimal. */
17901 if (base_off % msize != off_val_1 % msize)
17902 /* Fix the offset, bearing in mind we want to make it bigger not
17903 smaller. */
17904 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17905 else if (msize <= 4)
17906 /* The negative range of LDP/STP is one larger than the positive range. */
17907 base_off += msize;
17908
17909 /* Check if base offset is too big or too small. We can attempt to resolve
17910 this issue by setting it to the maximum value and seeing if the offsets
17911 still fit. */
17912 if (base_off >= 0x1000)
17913 {
17914 base_off = 0x1000 - 1;
17915 /* We must still make sure that the base offset is aligned with respect
17916 to the address.  But it may not be made any bigger. */
17917 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17918 }
17919
17920 /* Likewise for the case where the base is too small. */
17921 if (base_off <= -0x1000)
17922 {
17923 base_off = -0x1000 + 1;
17924 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17925 }
17926
17927 /* Offset of the first STP/LDP. */
17928 new_off_1 = off_val_1 - base_off;
17929
17930 /* Offset of the second STP/LDP. */
17931 new_off_3 = off_val_3 - base_off;
17932
17933 /* The offsets must be within the range of the LDP/STP instructions. */
17934 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17935 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17936 return false;
17937
17938 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17939 new_off_1), true);
17940 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17941 new_off_1 + msize), true);
17942 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17943 new_off_3), true);
17944 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17945 new_off_3 + msize), true);
17946
17947 if (!aarch64_mem_pair_operand (mem_1, mode)
17948 || !aarch64_mem_pair_operand (mem_3, mode))
17949 return false;
17950
17951 if (code == ZERO_EXTEND)
17952 {
17953 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17954 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17955 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17956 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17957 }
17958 else if (code == SIGN_EXTEND)
17959 {
17960 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17961 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17962 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17963 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17964 }
17965
17966 if (load)
17967 {
17968 operands[0] = temp_operands[0];
17969 operands[1] = mem_1;
17970 operands[2] = temp_operands[2];
17971 operands[3] = mem_2;
17972 operands[4] = temp_operands[4];
17973 operands[5] = mem_3;
17974 operands[6] = temp_operands[6];
17975 operands[7] = mem_4;
17976 }
17977 else
17978 {
17979 operands[0] = mem_1;
17980 operands[1] = temp_operands[1];
17981 operands[2] = mem_2;
17982 operands[3] = temp_operands[3];
17983 operands[4] = mem_3;
17984 operands[5] = temp_operands[5];
17985 operands[6] = mem_4;
17986 operands[7] = temp_operands[7];
17987 }
17988
17989 /* Emit adjusting instruction. */
17990 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17991 /* Emit ldp/stp instructions. */
17992 t1 = gen_rtx_SET (operands[0], operands[1]);
17993 t2 = gen_rtx_SET (operands[2], operands[3]);
17994 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17995 t1 = gen_rtx_SET (operands[4], operands[5]);
17996 t2 = gen_rtx_SET (operands[6], operands[7]);
17997 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17998 return true;
17999 }
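
/* Tracing the str example given before aarch64_operands_adjust_ok_for_ldpstp
   through this function: for four SImode stores at xb + 0x100, 0x104, 0x108
   and 0x10c, msize is 4, so base_off starts at (0x100 + 0x108) / 2 = 0x104,
   is already aligned with off_val_1, and is then bumped by msize to 0x108 to
   exploit the larger negative range.  That gives new_off_1 = -8 and
   new_off_3 = 0, so the emitted sequence is roughly:

     add  scratch, xb, #0x108
     stp  w1, w1, [scratch, #-8]
     stp  w1, w1, [scratch]  */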
18000
18001 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18002 it isn't worth branching around empty masked ops (including masked
18003 stores). */
18004
18005 static bool
18006 aarch64_empty_mask_is_expensive (unsigned)
18007 {
18008 return false;
18009 }
18010
18011 /* Return true if a pseudo register should be created and used to hold
18012 the GOT address for PIC code. */
18013
18014 bool
18015 aarch64_use_pseudo_pic_reg (void)
18016 {
18017 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18018 }
18019
18020 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18021
18022 static int
18023 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18024 {
18025 switch (XINT (x, 1))
18026 {
18027 case UNSPEC_GOTSMALLPIC:
18028 case UNSPEC_GOTSMALLPIC28K:
18029 case UNSPEC_GOTTINYPIC:
18030 return 0;
18031 default:
18032 break;
18033 }
18034
18035 return default_unspec_may_trap_p (x, flags);
18036 }
18037
18038
18039 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
18040 return the log2 of that value. Otherwise return -1. */
18041
18042 int
18043 aarch64_fpconst_pow_of_2 (rtx x)
18044 {
18045 const REAL_VALUE_TYPE *r;
18046
18047 if (!CONST_DOUBLE_P (x))
18048 return -1;
18049
18050 r = CONST_DOUBLE_REAL_VALUE (x);
18051
18052 if (REAL_VALUE_NEGATIVE (*r)
18053 || REAL_VALUE_ISNAN (*r)
18054 || REAL_VALUE_ISINF (*r)
18055 || !real_isinteger (r, DFmode))
18056 return -1;
18057
18058 return exact_log2 (real_to_integer (r));
18059 }
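
/* For example: 4.0 -> 2, 1.0 -> 0 and 8.0 -> 3, while 3.0 returns -1
   because 3 is not a power of two, 0.5 returns -1 because it is not an
   integer, and -2.0 returns -1 because it is negative.  */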
18060
18061 /* If X is a vector of equal CONST_DOUBLE values and that value is
18062 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18063
18064 int
18065 aarch64_vec_fpconst_pow_of_2 (rtx x)
18066 {
18067 int nelts;
18068 if (GET_CODE (x) != CONST_VECTOR
18069 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18070 return -1;
18071
18072 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18073 return -1;
18074
18075 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18076 if (firstval <= 0)
18077 return -1;
18078
18079 for (int i = 1; i < nelts; i++)
18080 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18081 return -1;
18082
18083 return firstval;
18084 }
18085
18086 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18087 to float.
18088
18089 __fp16 always promotes through this hook.
18090 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18091 through the generic excess precision logic rather than here. */
18092
18093 static tree
18094 aarch64_promoted_type (const_tree t)
18095 {
18096 if (SCALAR_FLOAT_TYPE_P (t)
18097 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18098 return float_type_node;
18099
18100 return NULL_TREE;
18101 }
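
/* A source-level sketch of the effect (not part of the compiler itself):
   given

     __fp16 a, b, c;
     c = a + b;

   the addition is performed in float, because __fp16 promotes through this
   hook, and only the final assignment converts the result back to __fp16.  */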
18102
18103 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18104
18105 static bool
18106 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18107 optimization_type opt_type)
18108 {
18109 switch (op)
18110 {
18111 case rsqrt_optab:
18112 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18113
18114 default:
18115 return true;
18116 }
18117 }
18118
18119 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18120
18121 static unsigned int
18122 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18123 int *offset)
18124 {
18125 /* Polynomial invariant 1 == (VG / 2) - 1. */
18126 gcc_assert (i == 1);
18127 *factor = 2;
18128 *offset = 1;
18129 return AARCH64_DWARF_VG;
18130 }
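
/* A worked example, assuming the usual reading of this hook (indeterminate
   == REG / *factor - *offset): for a 256-bit SVE vector length the VG
   pseudo register holds 4 (the vector length in 64-bit granules), so the
   indeterminate is 4 / 2 - 1 = 1 and a poly_int size of 16 + 16x bytes
   resolves to 32 bytes.  */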
18131
18132 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18133 if MODE is HFmode, and punt to the generic implementation otherwise. */
18134
18135 static bool
18136 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18137 {
18138 return (mode == HFmode
18139 ? true
18140 : default_libgcc_floating_mode_supported_p (mode));
18141 }
18142
18143 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18144 if MODE is HFmode, and punt to the generic implementation otherwise. */
18145
18146 static bool
18147 aarch64_scalar_mode_supported_p (scalar_mode mode)
18148 {
18149 return (mode == HFmode
18150 ? true
18151 : default_scalar_mode_supported_p (mode));
18152 }
18153
18154 /* Set the value of FLT_EVAL_METHOD.
18155 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18156
18157 0: evaluate all operations and constants, whose semantic type has at
18158 most the range and precision of type float, to the range and
18159 precision of float; evaluate all other operations and constants to
18160 the range and precision of the semantic type;
18161
18162 N, where _FloatN is a supported interchange floating type:
18163 evaluate all operations and constants, whose semantic type has at
18164 most the range and precision of _FloatN type, to the range and
18165 precision of the _FloatN type; evaluate all other operations and
18166 constants to the range and precision of the semantic type;
18167
18168 If we have the ARMv8.2-A extensions then we support _Float16 in native
18169 precision, so we should set this to 16. Otherwise, we support the type,
18170 but want to evaluate expressions in float precision, so set this to
18171 0. */
18172
18173 static enum flt_eval_method
18174 aarch64_excess_precision (enum excess_precision_type type)
18175 {
18176 switch (type)
18177 {
18178 case EXCESS_PRECISION_TYPE_FAST:
18179 case EXCESS_PRECISION_TYPE_STANDARD:
18180 /* We can calculate either in 16-bit range and precision or
18181 32-bit range and precision. Make that decision based on whether
18182 we have native support for the ARMv8.2-A 16-bit floating-point
18183 instructions or not. */
18184 return (TARGET_FP_F16INST
18185 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18186 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18187 case EXCESS_PRECISION_TYPE_IMPLICIT:
18188 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18189 default:
18190 gcc_unreachable ();
18191 }
18192 return FLT_EVAL_METHOD_UNPREDICTABLE;
18193 }
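
/* A sketch of the intended user-visible effect: with ARMv8.2-A FP16
   support __FLT_EVAL_METHOD__ is 16 and

     _Float16 a, b, c;
     c = a * b + c;

   is evaluated in half precision; without FP16 support __FLT_EVAL_METHOD__
   is 0 and the expression is evaluated in float before being converted
   back.  */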
18194
18195 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18196 scheduled for speculative execution. Reject the long-running division
18197 and square-root instructions. */
18198
18199 static bool
18200 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18201 {
18202 switch (get_attr_type (insn))
18203 {
18204 case TYPE_SDIV:
18205 case TYPE_UDIV:
18206 case TYPE_FDIVS:
18207 case TYPE_FDIVD:
18208 case TYPE_FSQRTS:
18209 case TYPE_FSQRTD:
18210 case TYPE_NEON_FP_SQRT_S:
18211 case TYPE_NEON_FP_SQRT_D:
18212 case TYPE_NEON_FP_SQRT_S_Q:
18213 case TYPE_NEON_FP_SQRT_D_Q:
18214 case TYPE_NEON_FP_DIV_S:
18215 case TYPE_NEON_FP_DIV_D:
18216 case TYPE_NEON_FP_DIV_S_Q:
18217 case TYPE_NEON_FP_DIV_D_Q:
18218 return false;
18219 default:
18220 return true;
18221 }
18222 }
18223
18224 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18225
18226 static int
18227 aarch64_compute_pressure_classes (reg_class *classes)
18228 {
18229 int i = 0;
18230 classes[i++] = GENERAL_REGS;
18231 classes[i++] = FP_REGS;
18232 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18233 registers need to go in PR_LO_REGS at some point during their
18234 lifetime. Splitting it into two halves has the effect of making
18235 all predicates count against PR_LO_REGS, so that we try whenever
18236 possible to restrict the number of live predicates to 8. This
18237 greatly reduces the amount of spilling in certain loops. */
18238 classes[i++] = PR_LO_REGS;
18239 classes[i++] = PR_HI_REGS;
18240 return i;
18241 }
18242
18243 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18244
18245 static bool
18246 aarch64_can_change_mode_class (machine_mode from,
18247 machine_mode to, reg_class_t)
18248 {
18249 if (BYTES_BIG_ENDIAN)
18250 {
18251 bool from_sve_p = aarch64_sve_data_mode_p (from);
18252 bool to_sve_p = aarch64_sve_data_mode_p (to);
18253
18254 /* Don't allow changes between SVE data modes and non-SVE modes.
18255 See the comment at the head of aarch64-sve.md for details. */
18256 if (from_sve_p != to_sve_p)
18257 return false;
18258
18259 /* Don't allow changes in element size: lane 0 of the new vector
18260 would not then be lane 0 of the old vector. See the comment
18261 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18262 description.
18263
18264 In the worst case, this forces a register to be spilled in
18265 one mode and reloaded in the other, which handles the
18266 endianness correctly. */
18267 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18268 return false;
18269 }
18270 return true;
18271 }
18272
18273 /* Implement TARGET_EARLY_REMAT_MODES. */
18274
18275 static void
18276 aarch64_select_early_remat_modes (sbitmap modes)
18277 {
18278 /* SVE values are not normally live across a call, so it should be
18279 worth doing early rematerialization even in VL-specific mode. */
18280 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18281 {
18282 machine_mode mode = (machine_mode) i;
18283 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18284 if (vec_flags & VEC_ANY_SVE)
18285 bitmap_set_bit (modes, i);
18286 }
18287 }
18288
18289 /* Override the default target speculation_safe_value. */
18290 static rtx
18291 aarch64_speculation_safe_value (machine_mode mode,
18292 rtx result, rtx val, rtx failval)
18293 {
18294 /* Maybe we should warn if falling back to hard barriers. They are
18295 likely to be noticeably more expensive than the alternative below. */
18296 if (!aarch64_track_speculation)
18297 return default_speculation_safe_value (mode, result, val, failval);
18298
18299 if (!REG_P (val))
18300 val = copy_to_mode_reg (mode, val);
18301
18302 if (!aarch64_reg_or_zero (failval, mode))
18303 failval = copy_to_mode_reg (mode, failval);
18304
18305 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18306 return result;
18307 }
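
/* User code typically reaches this hook via __builtin_speculation_safe_value,
   e.g. (illustrative only):

     ptr = __builtin_speculation_safe_value (ptr);

   With -mtrack-speculation the conditional copy above is used; otherwise we
   fall back to the default implementation and its speculation barrier.  */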
18308
18309 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18310 Look into the tuning structure for an estimate.
18311 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18312 Advanced SIMD 128 bits. */
18313
18314 static HOST_WIDE_INT
18315 aarch64_estimated_poly_value (poly_int64 val)
18316 {
18317 enum aarch64_sve_vector_bits_enum width_source
18318 = aarch64_tune_params.sve_width;
18319
18320 /* If we still don't have an estimate, use the default. */
18321 if (width_source == SVE_SCALABLE)
18322 return default_estimated_poly_value (val);
18323
18324 HOST_WIDE_INT over_128 = width_source - 128;
18325 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18326 }
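
/* For example, with a tuning that sets sve_width to 256 and a poly_int64
   value of 16 + 16x (the number of bytes in an SVE vector), over_128 is
   128 and the estimate is 16 + 16 * 128 / 128 = 32.  */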
18327
18328 /* Target-specific selftests. */
18329
18330 #if CHECKING_P
18331
18332 namespace selftest {
18333
18334 /* Selftest for the RTL loader.
18335 Verify that the RTL loader copes with a dump from
18336 print_rtx_function. This is essentially just a test that class
18337 function_reader can handle a real dump, but it also verifies
18338 that lookup_reg_by_dump_name correctly handles hard regs.
18339 The presence of hard reg names in the dump means that the test is
18340 target-specific, hence it is in this file. */
18341
18342 static void
18343 aarch64_test_loading_full_dump ()
18344 {
18345 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18346
18347 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18348
18349 rtx_insn *insn_1 = get_insn_by_uid (1);
18350 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18351
18352 rtx_insn *insn_15 = get_insn_by_uid (15);
18353 ASSERT_EQ (INSN, GET_CODE (insn_15));
18354 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18355
18356 /* Verify crtl->return_rtx. */
18357 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18358 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18359 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18360 }
18361
18362 /* Run all target-specific selftests. */
18363
18364 static void
18365 aarch64_run_selftests (void)
18366 {
18367 aarch64_test_loading_full_dump ();
18368 }
18369
18370 } // namespace selftest
18371
18372 #endif /* #if CHECKING_P */
18373
18374 #undef TARGET_ADDRESS_COST
18375 #define TARGET_ADDRESS_COST aarch64_address_cost
18376
18377 /* This hook determines whether unnamed bitfields affect the alignment
18378 of the containing structure. The hook returns true if the structure
18379 should inherit the alignment requirements of an unnamed bitfield's
18380 type. */
18381 #undef TARGET_ALIGN_ANON_BITFIELD
18382 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18383
18384 #undef TARGET_ASM_ALIGNED_DI_OP
18385 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18386
18387 #undef TARGET_ASM_ALIGNED_HI_OP
18388 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18389
18390 #undef TARGET_ASM_ALIGNED_SI_OP
18391 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18392
18393 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18394 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18395 hook_bool_const_tree_hwi_hwi_const_tree_true
18396
18397 #undef TARGET_ASM_FILE_START
18398 #define TARGET_ASM_FILE_START aarch64_start_file
18399
18400 #undef TARGET_ASM_OUTPUT_MI_THUNK
18401 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18402
18403 #undef TARGET_ASM_SELECT_RTX_SECTION
18404 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18405
18406 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18407 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18408
18409 #undef TARGET_BUILD_BUILTIN_VA_LIST
18410 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18411
18412 #undef TARGET_CALLEE_COPIES
18413 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18414
18415 #undef TARGET_CAN_ELIMINATE
18416 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18417
18418 #undef TARGET_CAN_INLINE_P
18419 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18420
18421 #undef TARGET_CANNOT_FORCE_CONST_MEM
18422 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18423
18424 #undef TARGET_CASE_VALUES_THRESHOLD
18425 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18426
18427 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18428 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18429
18430 /* Only the least significant bit is used for initialization guard
18431 variables. */
18432 #undef TARGET_CXX_GUARD_MASK_BIT
18433 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18434
18435 #undef TARGET_C_MODE_FOR_SUFFIX
18436 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18437
18438 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18439 #undef TARGET_DEFAULT_TARGET_FLAGS
18440 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18441 #endif
18442
18443 #undef TARGET_CLASS_MAX_NREGS
18444 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18445
18446 #undef TARGET_BUILTIN_DECL
18447 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18448
18449 #undef TARGET_BUILTIN_RECIPROCAL
18450 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18451
18452 #undef TARGET_C_EXCESS_PRECISION
18453 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18454
18455 #undef TARGET_EXPAND_BUILTIN
18456 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18457
18458 #undef TARGET_EXPAND_BUILTIN_VA_START
18459 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18460
18461 #undef TARGET_FOLD_BUILTIN
18462 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18463
18464 #undef TARGET_FUNCTION_ARG
18465 #define TARGET_FUNCTION_ARG aarch64_function_arg
18466
18467 #undef TARGET_FUNCTION_ARG_ADVANCE
18468 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18469
18470 #undef TARGET_FUNCTION_ARG_BOUNDARY
18471 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18472
18473 #undef TARGET_FUNCTION_ARG_PADDING
18474 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18475
18476 #undef TARGET_GET_RAW_RESULT_MODE
18477 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18478 #undef TARGET_GET_RAW_ARG_MODE
18479 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18480
18481 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18482 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18483
18484 #undef TARGET_FUNCTION_VALUE
18485 #define TARGET_FUNCTION_VALUE aarch64_function_value
18486
18487 #undef TARGET_FUNCTION_VALUE_REGNO_P
18488 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18489
18490 #undef TARGET_GIMPLE_FOLD_BUILTIN
18491 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18492
18493 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18494 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18495
18496 #undef TARGET_INIT_BUILTINS
18497 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18498
18499 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18500 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18501 aarch64_ira_change_pseudo_allocno_class
18502
18503 #undef TARGET_LEGITIMATE_ADDRESS_P
18504 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18505
18506 #undef TARGET_LEGITIMATE_CONSTANT_P
18507 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18508
18509 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18510 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18511 aarch64_legitimize_address_displacement
18512
18513 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18514 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18515
18516 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18517 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18518 aarch64_libgcc_floating_mode_supported_p
18519
18520 #undef TARGET_MANGLE_TYPE
18521 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18522
18523 #undef TARGET_MEMORY_MOVE_COST
18524 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18525
18526 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18527 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18528
18529 #undef TARGET_MUST_PASS_IN_STACK
18530 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18531
18532 /* This target hook should return true if accesses to volatile bitfields
18533 should use the narrowest mode possible. It should return false if these
18534 accesses should use the bitfield container type. */
18535 #undef TARGET_NARROW_VOLATILE_BITFIELD
18536 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18537
18538 #undef TARGET_OPTION_OVERRIDE
18539 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18540
18541 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18542 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18543 aarch64_override_options_after_change
18544
18545 #undef TARGET_OPTION_SAVE
18546 #define TARGET_OPTION_SAVE aarch64_option_save
18547
18548 #undef TARGET_OPTION_RESTORE
18549 #define TARGET_OPTION_RESTORE aarch64_option_restore
18550
18551 #undef TARGET_OPTION_PRINT
18552 #define TARGET_OPTION_PRINT aarch64_option_print
18553
18554 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18555 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18556
18557 #undef TARGET_SET_CURRENT_FUNCTION
18558 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18559
18560 #undef TARGET_PASS_BY_REFERENCE
18561 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18562
18563 #undef TARGET_PREFERRED_RELOAD_CLASS
18564 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18565
18566 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18567 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18568
18569 #undef TARGET_PROMOTED_TYPE
18570 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18571
18572 #undef TARGET_SECONDARY_RELOAD
18573 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18574
18575 #undef TARGET_SHIFT_TRUNCATION_MASK
18576 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18577
18578 #undef TARGET_SETUP_INCOMING_VARARGS
18579 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18580
18581 #undef TARGET_STRUCT_VALUE_RTX
18582 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18583
18584 #undef TARGET_REGISTER_MOVE_COST
18585 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18586
18587 #undef TARGET_RETURN_IN_MEMORY
18588 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18589
18590 #undef TARGET_RETURN_IN_MSB
18591 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18592
18593 #undef TARGET_RTX_COSTS
18594 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18595
18596 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18597 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18598
18599 #undef TARGET_SCHED_ISSUE_RATE
18600 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18601
18602 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18603 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18604 aarch64_sched_first_cycle_multipass_dfa_lookahead
18605
18606 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18607 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18608 aarch64_first_cycle_multipass_dfa_lookahead_guard
18609
18610 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18611 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18612 aarch64_get_separate_components
18613
18614 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18615 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18616 aarch64_components_for_bb
18617
18618 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18619 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18620 aarch64_disqualify_components
18621
18622 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18623 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18624 aarch64_emit_prologue_components
18625
18626 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18627 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18628 aarch64_emit_epilogue_components
18629
18630 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18631 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18632 aarch64_set_handled_components
18633
18634 #undef TARGET_TRAMPOLINE_INIT
18635 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18636
18637 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18638 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18639
18640 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18641 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18642
18643 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18644 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18645 aarch64_builtin_support_vector_misalignment
18646
18647 #undef TARGET_ARRAY_MODE
18648 #define TARGET_ARRAY_MODE aarch64_array_mode
18649
18650 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18651 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18652
18653 #undef TARGET_VECTORIZE_ADD_STMT_COST
18654 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18655
18656 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18657 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18658 aarch64_builtin_vectorization_cost
18659
18660 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18661 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18662
18663 #undef TARGET_VECTORIZE_BUILTINS
18664 #define TARGET_VECTORIZE_BUILTINS
18665
18666 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18667 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18668 aarch64_builtin_vectorized_function
18669
18670 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18671 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18672 aarch64_autovectorize_vector_sizes
18673
18674 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18675 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18676 aarch64_atomic_assign_expand_fenv
18677
18678 /* Section anchor support. */
18679
18680 #undef TARGET_MIN_ANCHOR_OFFSET
18681 #define TARGET_MIN_ANCHOR_OFFSET -256
18682
18683 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18684 byte offset; we can do much more for larger data types, but have no way
18685 to determine the size of the access. We assume accesses are aligned. */
18686 #undef TARGET_MAX_ANCHOR_OFFSET
18687 #define TARGET_MAX_ANCHOR_OFFSET 4095
18688
18689 #undef TARGET_VECTOR_ALIGNMENT
18690 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18691
18692 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18693 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18694 aarch64_vectorize_preferred_vector_alignment
18695 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18696 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18697 aarch64_simd_vector_alignment_reachable
18698
18699 /* vec_perm support. */
18700
18701 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18702 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18703 aarch64_vectorize_vec_perm_const
18704
18705 #undef TARGET_VECTORIZE_GET_MASK_MODE
18706 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18707 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18708 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18709 aarch64_empty_mask_is_expensive
18710 #undef TARGET_PREFERRED_ELSE_VALUE
18711 #define TARGET_PREFERRED_ELSE_VALUE \
18712 aarch64_preferred_else_value
18713
18714 #undef TARGET_INIT_LIBFUNCS
18715 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18716
18717 #undef TARGET_FIXED_CONDITION_CODE_REGS
18718 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18719
18720 #undef TARGET_FLAGS_REGNUM
18721 #define TARGET_FLAGS_REGNUM CC_REGNUM
18722
18723 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18724 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18725
18726 #undef TARGET_ASAN_SHADOW_OFFSET
18727 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18728
18729 #undef TARGET_LEGITIMIZE_ADDRESS
18730 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18731
18732 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18733 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18734
18735 #undef TARGET_CAN_USE_DOLOOP_P
18736 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18737
18738 #undef TARGET_SCHED_ADJUST_PRIORITY
18739 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18740
18741 #undef TARGET_SCHED_MACRO_FUSION_P
18742 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18743
18744 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18745 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18746
18747 #undef TARGET_SCHED_FUSION_PRIORITY
18748 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18749
18750 #undef TARGET_UNSPEC_MAY_TRAP_P
18751 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18752
18753 #undef TARGET_USE_PSEUDO_PIC_REG
18754 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18755
18756 #undef TARGET_PRINT_OPERAND
18757 #define TARGET_PRINT_OPERAND aarch64_print_operand
18758
18759 #undef TARGET_PRINT_OPERAND_ADDRESS
18760 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18761
18762 #undef TARGET_OPTAB_SUPPORTED_P
18763 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18764
18765 #undef TARGET_OMIT_STRUCT_RETURN_REG
18766 #define TARGET_OMIT_STRUCT_RETURN_REG true
18767
18768 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18769 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18770 aarch64_dwarf_poly_indeterminate_value
18771
18772 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18773 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18774 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18775
18776 #undef TARGET_HARD_REGNO_NREGS
18777 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18778 #undef TARGET_HARD_REGNO_MODE_OK
18779 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18780
18781 #undef TARGET_MODES_TIEABLE_P
18782 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18783
18784 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18785 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18786 aarch64_hard_regno_call_part_clobbered
18787
18788 #undef TARGET_CONSTANT_ALIGNMENT
18789 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18790
18791 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
18792 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
18793 aarch64_stack_clash_protection_alloca_probe_range
18794
18795 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18796 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18797
18798 #undef TARGET_CAN_CHANGE_MODE_CLASS
18799 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18800
18801 #undef TARGET_SELECT_EARLY_REMAT_MODES
18802 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18803
18804 #undef TARGET_SPECULATION_SAFE_VALUE
18805 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18806
18807 #undef TARGET_ESTIMATED_POLY_VALUE
18808 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
18809
18810 #undef TARGET_ATTRIBUTE_TABLE
18811 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18812
18813 #if CHECKING_P
18814 #undef TARGET_RUN_TARGET_SELFTESTS
18815 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18816 #endif /* #if CHECKING_P */
18817
18818 struct gcc_target targetm = TARGET_INITIALIZER;
18819
18820 #include "gt-aarch64.h"