gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2019 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #define IN_TARGET_CODE 1
  22
  23 #include "config.h"
  24 #define INCLUDE_STRING
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "memmodel.h"
  32 #include "gimple.h"
  33 #include "cfghooks.h"
  34 #include "cfgloop.h"
  35 #include "df.h"
  36 #include "tm_p.h"
  37 #include "stringpool.h"
  38 #include "attribs.h"
  39 #include "optabs.h"
  40 #include "regs.h"
  41 #include "emit-rtl.h"
  42 #include "recog.h"
  43 #include "diagnostic.h"
  44 #include "insn-attr.h"
  45 #include "alias.h"
  46 #include "fold-const.h"
  47 #include "stor-layout.h"
  48 #include "calls.h"
  49 #include "varasm.h"
  50 #include "output.h"
  51 #include "flags.h"
  52 #include "explow.h"
  53 #include "expr.h"
  54 #include "reload.h"
  55 #include "langhooks.h"
  56 #include "opts.h"
  57 #include "params.h"
  58 #include "gimplify.h"
  59 #include "dwarf2.h"
  60 #include "gimple-iterator.h"
  61 #include "tree-vectorizer.h"
  62 #include "aarch64-cost-tables.h"
  63 #include "dumpfile.h"
  64 #include "builtins.h"
  65 #include "rtl-iter.h"
  66 #include "tm-constrs.h"
  67 #include "sched-int.h"
  68 #include "target-globals.h"
  69 #include "common/common-target.h"
  70 #include "cfgrtl.h"
  71 #include "selftest.h"
  72 #include "selftest-rtl.h"
  73 #include "rtx-vector-builder.h"
  74
  75 /* This file should be included last.  */
  76 #include "target-def.h"
  77
  78 /* Defined for convenience.  */
  79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  80
  81 /* Information about a legitimate vector immediate operand.  */
  82 struct simd_immediate_info
  83 {
  84   enum insn_type { MOV, MVN };
  85   enum modifier_type { LSL, MSL };
  86
  87   simd_immediate_info () {}
  88   simd_immediate_info (scalar_float_mode, rtx);
  89   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
  90                        insn_type = MOV, modifier_type = LSL,
  91                        unsigned int = 0);
  92   simd_immediate_info (scalar_mode, rtx, rtx);
  93
  94   /* The mode of the elements.  */
  95   scalar_mode elt_mode;
  96
  97   /* The value of each element if all elements are the same, or the
  98      first value if the constant is a series.  */
  99   rtx value;
 100
 101   /* The value of the step if the constant is a series, null otherwise.  */
 102   rtx step;
 103
 104   /* The instruction to use to move the immediate into a vector.  */
 105   insn_type insn;
 106
 107   /* The kind of shift modifier to use, and the number of bits to shift.
 108      This is (LSL, 0) if no shift is needed.  */
 109   modifier_type modifier;
 110   unsigned int shift;
 111 };
 112
 113 /* Construct a floating-point immediate in which each element has mode
 114    ELT_MODE_IN and value VALUE_IN.  */
 115 inline simd_immediate_info
 116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
 117   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
 118     modifier (LSL), shift (0)
 119 {}
 120
 121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 122    and value VALUE_IN.  The other parameters are as for the structure
 123    fields.  */
 124 inline simd_immediate_info
 125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 126                        unsigned HOST_WIDE_INT value_in,
 127                        insn_type insn_in, modifier_type modifier_in,
 128                        unsigned int shift_in)
 129   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
 130     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
 131 {}
 132
 133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 134    and where element I is equal to VALUE_IN + I * STEP_IN.  */
 135 inline simd_immediate_info
 136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
 137   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
 138     modifier (LSL), shift (0)
 139 {}
 140
 141 /* The current code model.  */
 142 enum aarch64_code_model aarch64_cmodel;
 143
 144 /* The number of 64-bit elements in an SVE vector.  */
 145 poly_uint16 aarch64_sve_vg;
 146
 147 #ifdef HAVE_AS_TLS
 148 #undef TARGET_HAVE_TLS
 149 #define TARGET_HAVE_TLS 1
 150 #endif
 151
 152 static bool aarch64_composite_type_p (const_tree, machine_mode);
 153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 154                                                      const_tree,
 155                                                      machine_mode *, int *,
 156                                                      bool *);
 157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 159 static void aarch64_override_options_after_change (void);
 160 static bool aarch64_vector_mode_supported_p (machine_mode);
 161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 163                                                          const_tree type,
 164                                                          int misalignment,
 165                                                          bool is_packed);
 166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 168                                             aarch64_addr_query_type);
 169 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 170
 171 /* Major revision number of the ARM Architecture implemented by the target.  */
 172 unsigned aarch64_architecture_version;
 173
 174 /* The processor for which instructions should be scheduled.  */
 175 enum aarch64_processor aarch64_tune = cortexa53;
 176
 177 /* Mask to specify which instruction scheduling options should be used.  */
 178 unsigned long aarch64_tune_flags = 0;
 179
 180 /* Global flag for PC relative loads.  */
 181 bool aarch64_pcrelative_literal_loads;
 182
 183 /* Global flag for whether frame pointer is enabled.  */
 184 bool aarch64_use_frame_pointer;
 185
 186 #define BRANCH_PROTECT_STR_MAX 255
 187 char *accepted_branch_protection_string = NULL;
 188
 189 static enum aarch64_parse_opt_result
 190 aarch64_parse_branch_protection (const char*, char**);
 191
 192 /* Support for command line parsing of boolean flags in the tuning
 193    structures.  */
 194 struct aarch64_flag_desc
 195 {
 196   const char* name;
 197   unsigned int flag;
 198 };
 199
 200 #define AARCH64_FUSION_PAIR(name, internal_name) \
 201   { name, AARCH64_FUSE_##internal_name },
 202 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 203 {
 204   { "none", AARCH64_FUSE_NOTHING },
 205 #include "aarch64-fusion-pairs.def"
 206   { "all", AARCH64_FUSE_ALL },
 207   { NULL, AARCH64_FUSE_NOTHING }
 208 };
 209
 210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 211   { name, AARCH64_EXTRA_TUNE_##internal_name },
 212 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 213 {
 214   { "none", AARCH64_EXTRA_TUNE_NONE },
 215 #include "aarch64-tuning-flags.def"
 216   { "all", AARCH64_EXTRA_TUNE_ALL },
 217   { NULL, AARCH64_EXTRA_TUNE_NONE }
 218 };
 219
 220 /* Tuning parameters.  */
 221
 222 static const struct cpu_addrcost_table generic_addrcost_table =
 223 {
 224     {
 225       1, /* hi  */
 226       0, /* si  */
 227       0, /* di  */
 228       1, /* ti  */
 229     },
 230   0, /* pre_modify  */
 231   0, /* post_modify  */
 232   0, /* register_offset  */
 233   0, /* register_sextend  */
 234   0, /* register_zextend  */
 235   0 /* imm_offset  */
 236 };
 237
 238 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 239 {
 240     {
 241       0, /* hi  */
 242       0, /* si  */
 243       0, /* di  */
 244       2, /* ti  */
 245     },
 246   0, /* pre_modify  */
 247   0, /* post_modify  */
 248   1, /* register_offset  */
 249   1, /* register_sextend  */
 250   2, /* register_zextend  */
 251   0, /* imm_offset  */
 252 };
 253
 254 static const struct cpu_addrcost_table xgene1_addrcost_table =
 255 {
 256     {
 257       1, /* hi  */
 258       0, /* si  */
 259       0, /* di  */
 260       1, /* ti  */
 261     },
 262   1, /* pre_modify  */
 263   1, /* post_modify  */
 264   0, /* register_offset  */
 265   1, /* register_sextend  */
 266   1, /* register_zextend  */
 267   0, /* imm_offset  */
 268 };
 269
 270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 271 {
 272     {
 273       1, /* hi  */
 274       1, /* si  */
 275       1, /* di  */
 276       2, /* ti  */
 277     },
 278   0, /* pre_modify  */
 279   0, /* post_modify  */
 280   2, /* register_offset  */
 281   3, /* register_sextend  */
 282   3, /* register_zextend  */
 283   0, /* imm_offset  */
 284 };
 285
 286 static const struct cpu_addrcost_table tsv110_addrcost_table =
 287 {
 288     {
 289       1, /* hi  */
 290       0, /* si  */
 291       0, /* di  */
 292       1, /* ti  */
 293     },
 294   0, /* pre_modify  */
 295   0, /* post_modify  */
 296   0, /* register_offset  */
 297   1, /* register_sextend  */
 298   1, /* register_zextend  */
 299   0, /* imm_offset  */
 300 };
 301
 302 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 303 {
 304     {
 305       1, /* hi  */
 306       1, /* si  */
 307       1, /* di  */
 308       2, /* ti  */
 309     },
 310   1, /* pre_modify  */
 311   1, /* post_modify  */
 312   3, /* register_offset  */
 313   3, /* register_sextend  */
 314   3, /* register_zextend  */
 315   2, /* imm_offset  */
 316 };
 317
 318 static const struct cpu_regmove_cost generic_regmove_cost =
 319 {
 320   1, /* GP2GP  */
 321   /* Avoid the use of slow int<->fp moves for spilling by setting
 322      their cost higher than memmov_cost.  */
 323   5, /* GP2FP  */
 324   5, /* FP2GP  */
 325   2 /* FP2FP  */
 326 };
 327
 328 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 329 {
 330   1, /* GP2GP  */
 331   /* Avoid the use of slow int<->fp moves for spilling by setting
 332      their cost higher than memmov_cost.  */
 333   5, /* GP2FP  */
 334   5, /* FP2GP  */
 335   2 /* FP2FP  */
 336 };
 337
 338 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 339 {
 340   1, /* GP2GP  */
 341   /* Avoid the use of slow int<->fp moves for spilling by setting
 342      their cost higher than memmov_cost.  */
 343   5, /* GP2FP  */
 344   5, /* FP2GP  */
 345   2 /* FP2FP  */
 346 };
 347
 348 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 349 {
 350   1, /* GP2GP  */
 351   /* Avoid the use of slow int<->fp moves for spilling by setting
 352      their cost higher than memmov_cost (actual, 4 and 9).  */
 353   9, /* GP2FP  */
 354   9, /* FP2GP  */
 355   1 /* FP2FP  */
 356 };
 357
 358 static const struct cpu_regmove_cost thunderx_regmove_cost =
 359 {
 360   2, /* GP2GP  */
 361   2, /* GP2FP  */
 362   6, /* FP2GP  */
 363   4 /* FP2FP  */
 364 };
 365
 366 static const struct cpu_regmove_cost xgene1_regmove_cost =
 367 {
 368   1, /* GP2GP  */
 369   /* Avoid the use of slow int<->fp moves for spilling by setting
 370      their cost higher than memmov_cost.  */
 371   8, /* GP2FP  */
 372   8, /* FP2GP  */
 373   2 /* FP2FP  */
 374 };
 375
 376 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 377 {
 378   2, /* GP2GP  */
 379   /* Avoid the use of int<->fp moves for spilling.  */
 380   6, /* GP2FP  */
 381   6, /* FP2GP  */
 382   4 /* FP2FP  */
 383 };
 384
 385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 386 {
 387   1, /* GP2GP  */
 388   /* Avoid the use of int<->fp moves for spilling.  */
 389   8, /* GP2FP  */
 390   8, /* FP2GP  */
 391   4  /* FP2FP  */
 392 };
 393
 394 static const struct cpu_regmove_cost tsv110_regmove_cost =
 395 {
 396   1, /* GP2GP  */
 397   /* Avoid the use of slow int<->fp moves for spilling by setting
 398      their cost higher than memmov_cost.  */
 399   2, /* GP2FP  */
 400   3, /* FP2GP  */
 401   2  /* FP2FP  */
 402 };
 403
 404 /* Generic costs for vector insn classes.  */
 405 static const struct cpu_vector_cost generic_vector_cost =
 406 {
 407   1, /* scalar_int_stmt_cost  */
 408   1, /* scalar_fp_stmt_cost  */
 409   1, /* scalar_load_cost  */
 410   1, /* scalar_store_cost  */
 411   1, /* vec_int_stmt_cost  */
 412   1, /* vec_fp_stmt_cost  */
 413   2, /* vec_permute_cost  */
 414   1, /* vec_to_scalar_cost  */
 415   1, /* scalar_to_vec_cost  */
 416   1, /* vec_align_load_cost  */
 417   1, /* vec_unalign_load_cost  */
 418   1, /* vec_unalign_store_cost  */
 419   1, /* vec_store_cost  */
 420   3, /* cond_taken_branch_cost  */
 421   1 /* cond_not_taken_branch_cost  */
 422 };
 423
 424 /* QDF24XX costs for vector insn classes.  */
 425 static const struct cpu_vector_cost qdf24xx_vector_cost =
 426 {
 427   1, /* scalar_int_stmt_cost  */
 428   1, /* scalar_fp_stmt_cost  */
 429   1, /* scalar_load_cost  */
 430   1, /* scalar_store_cost  */
 431   1, /* vec_int_stmt_cost  */
 432   3, /* vec_fp_stmt_cost  */
 433   2, /* vec_permute_cost  */
 434   1, /* vec_to_scalar_cost  */
 435   1, /* scalar_to_vec_cost  */
 436   1, /* vec_align_load_cost  */
 437   1, /* vec_unalign_load_cost  */
 438   1, /* vec_unalign_store_cost  */
 439   1, /* vec_store_cost  */
 440   3, /* cond_taken_branch_cost  */
 441   1 /* cond_not_taken_branch_cost  */
 442 };
 443
 444 /* ThunderX costs for vector insn classes.  */
 445 static const struct cpu_vector_cost thunderx_vector_cost =
 446 {
 447   1, /* scalar_int_stmt_cost  */
 448   1, /* scalar_fp_stmt_cost  */
 449   3, /* scalar_load_cost  */
 450   1, /* scalar_store_cost  */
 451   4, /* vec_int_stmt_cost  */
 452   1, /* vec_fp_stmt_cost  */
 453   4, /* vec_permute_cost  */
 454   2, /* vec_to_scalar_cost  */
 455   2, /* scalar_to_vec_cost  */
 456   3, /* vec_align_load_cost  */
 457   5, /* vec_unalign_load_cost  */
 458   5, /* vec_unalign_store_cost  */
 459   1, /* vec_store_cost  */
 460   3, /* cond_taken_branch_cost  */
 461   3 /* cond_not_taken_branch_cost  */
 462 };
 463
 464 static const struct cpu_vector_cost tsv110_vector_cost =
 465 {
 466   1, /* scalar_int_stmt_cost  */
 467   1, /* scalar_fp_stmt_cost  */
 468   5, /* scalar_load_cost  */
 469   1, /* scalar_store_cost  */
 470   2, /* vec_int_stmt_cost  */
 471   2, /* vec_fp_stmt_cost  */
 472   2, /* vec_permute_cost  */
 473   3, /* vec_to_scalar_cost  */
 474   2, /* scalar_to_vec_cost  */
 475   5, /* vec_align_load_cost  */
 476   5, /* vec_unalign_load_cost  */
 477   1, /* vec_unalign_store_cost  */
 478   1, /* vec_store_cost  */
 479   1, /* cond_taken_branch_cost  */
 480   1 /* cond_not_taken_branch_cost  */
 481 };
 482
 483 /* Generic costs for vector insn classes.  */
 484 static const struct cpu_vector_cost cortexa57_vector_cost =
 485 {
 486   1, /* scalar_int_stmt_cost  */
 487   1, /* scalar_fp_stmt_cost  */
 488   4, /* scalar_load_cost  */
 489   1, /* scalar_store_cost  */
 490   2, /* vec_int_stmt_cost  */
 491   2, /* vec_fp_stmt_cost  */
 492   3, /* vec_permute_cost  */
 493   8, /* vec_to_scalar_cost  */
 494   8, /* scalar_to_vec_cost  */
 495   4, /* vec_align_load_cost  */
 496   4, /* vec_unalign_load_cost  */
 497   1, /* vec_unalign_store_cost  */
 498   1, /* vec_store_cost  */
 499   1, /* cond_taken_branch_cost  */
 500   1 /* cond_not_taken_branch_cost  */
 501 };
 502
 503 static const struct cpu_vector_cost exynosm1_vector_cost =
 504 {
 505   1, /* scalar_int_stmt_cost  */
 506   1, /* scalar_fp_stmt_cost  */
 507   5, /* scalar_load_cost  */
 508   1, /* scalar_store_cost  */
 509   3, /* vec_int_stmt_cost  */
 510   3, /* vec_fp_stmt_cost  */
 511   3, /* vec_permute_cost  */
 512   3, /* vec_to_scalar_cost  */
 513   3, /* scalar_to_vec_cost  */
 514   5, /* vec_align_load_cost  */
 515   5, /* vec_unalign_load_cost  */
 516   1, /* vec_unalign_store_cost  */
 517   1, /* vec_store_cost  */
 518   1, /* cond_taken_branch_cost  */
 519   1 /* cond_not_taken_branch_cost  */
 520 };
 521
 522 /* Generic costs for vector insn classes.  */
 523 static const struct cpu_vector_cost xgene1_vector_cost =
 524 {
 525   1, /* scalar_int_stmt_cost  */
 526   1, /* scalar_fp_stmt_cost  */
 527   5, /* scalar_load_cost  */
 528   1, /* scalar_store_cost  */
 529   2, /* vec_int_stmt_cost  */
 530   2, /* vec_fp_stmt_cost  */
 531   2, /* vec_permute_cost  */
 532   4, /* vec_to_scalar_cost  */
 533   4, /* scalar_to_vec_cost  */
 534   10, /* vec_align_load_cost  */
 535   10, /* vec_unalign_load_cost  */
 536   2, /* vec_unalign_store_cost  */
 537   2, /* vec_store_cost  */
 538   2, /* cond_taken_branch_cost  */
 539   1 /* cond_not_taken_branch_cost  */
 540 };
 541
 542 /* Costs for vector insn classes for Vulcan.  */
 543 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 544 {
 545   1, /* scalar_int_stmt_cost  */
 546   6, /* scalar_fp_stmt_cost  */
 547   4, /* scalar_load_cost  */
 548   1, /* scalar_store_cost  */
 549   5, /* vec_int_stmt_cost  */
 550   6, /* vec_fp_stmt_cost  */
 551   3, /* vec_permute_cost  */
 552   6, /* vec_to_scalar_cost  */
 553   5, /* scalar_to_vec_cost  */
 554   8, /* vec_align_load_cost  */
 555   8, /* vec_unalign_load_cost  */
 556   4, /* vec_unalign_store_cost  */
 557   4, /* vec_store_cost  */
 558   2, /* cond_taken_branch_cost  */
 559   1  /* cond_not_taken_branch_cost  */
 560 };
 561
 562 /* Generic costs for branch instructions.  */
 563 static const struct cpu_branch_cost generic_branch_cost =
 564 {
 565   1,  /* Predictable.  */
 566   3   /* Unpredictable.  */
 567 };
 568
 569 /* Generic approximation modes.  */
 570 static const cpu_approx_modes generic_approx_modes =
 571 {
 572   AARCH64_APPROX_NONE,  /* division  */
 573   AARCH64_APPROX_NONE,  /* sqrt  */
 574   AARCH64_APPROX_NONE   /* recip_sqrt  */
 575 };
 576
 577 /* Approximation modes for Exynos M1.  */
 578 static const cpu_approx_modes exynosm1_approx_modes =
 579 {
 580   AARCH64_APPROX_NONE,  /* division  */
 581   AARCH64_APPROX_ALL,   /* sqrt  */
 582   AARCH64_APPROX_ALL    /* recip_sqrt  */
 583 };
 584
 585 /* Approximation modes for X-Gene 1.  */
 586 static const cpu_approx_modes xgene1_approx_modes =
 587 {
 588   AARCH64_APPROX_NONE,  /* division  */
 589   AARCH64_APPROX_NONE,  /* sqrt  */
 590   AARCH64_APPROX_ALL    /* recip_sqrt  */
 591 };
 592
 593 /* Generic prefetch settings (which disable prefetch).  */
 594 static const cpu_prefetch_tune generic_prefetch_tune =
 595 {
 596   0,                    /* num_slots  */
 597   -1,                   /* l1_cache_size  */
 598   -1,                   /* l1_cache_line_size  */
 599   -1,                   /* l2_cache_size  */
 600   true,                 /* prefetch_dynamic_strides */
 601   -1,                   /* minimum_stride */
 602   -1                    /* default_opt_level  */
 603 };
 604
 605 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 606 {
 607   0,                    /* num_slots  */
 608   -1,                   /* l1_cache_size  */
 609   64,                   /* l1_cache_line_size  */
 610   -1,                   /* l2_cache_size  */
 611   true,                 /* prefetch_dynamic_strides */
 612   -1,                   /* minimum_stride */
 613   -1                    /* default_opt_level  */
 614 };
 615
 616 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 617 {
 618   4,                    /* num_slots  */
 619   32,                   /* l1_cache_size  */
 620   64,                   /* l1_cache_line_size  */
 621   512,                  /* l2_cache_size  */
 622   false,                /* prefetch_dynamic_strides */
 623   2048,                 /* minimum_stride */
 624   3                     /* default_opt_level  */
 625 };
 626
 627 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 628 {
 629   8,                    /* num_slots  */
 630   32,                   /* l1_cache_size  */
 631   128,                  /* l1_cache_line_size  */
 632   16*1024,              /* l2_cache_size  */
 633   true,                 /* prefetch_dynamic_strides */
 634   -1,                   /* minimum_stride */
 635   3                     /* default_opt_level  */
 636 };
 637
 638 static const cpu_prefetch_tune thunderx_prefetch_tune =
 639 {
 640   8,                    /* num_slots  */
 641   32,                   /* l1_cache_size  */
 642   128,                  /* l1_cache_line_size  */
 643   -1,                   /* l2_cache_size  */
 644   true,                 /* prefetch_dynamic_strides */
 645   -1,                   /* minimum_stride */
 646   -1                    /* default_opt_level  */
 647 };
 648
 649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 650 {
 651   8,                    /* num_slots  */
 652   32,                   /* l1_cache_size  */
 653   64,                   /* l1_cache_line_size  */
 654   256,                  /* l2_cache_size  */
 655   true,                 /* prefetch_dynamic_strides */
 656   -1,                   /* minimum_stride */
 657   -1                    /* default_opt_level  */
 658 };
 659
 660 static const cpu_prefetch_tune tsv110_prefetch_tune =
 661 {
 662   0,                    /* num_slots  */
 663   64,                   /* l1_cache_size  */
 664   64,                   /* l1_cache_line_size  */
 665   512,                  /* l2_cache_size  */
 666   true,                 /* prefetch_dynamic_strides */
 667   -1,                   /* minimum_stride */
 668   -1                    /* default_opt_level  */
 669 };
 670
 671 static const cpu_prefetch_tune xgene1_prefetch_tune =
 672 {
 673   8,                    /* num_slots  */
 674   32,                   /* l1_cache_size  */
 675   64,                   /* l1_cache_line_size  */
 676   256,                  /* l2_cache_size  */
 677   true,                 /* prefetch_dynamic_strides */
 678   -1,                   /* minimum_stride */
 679   -1                    /* default_opt_level  */
 680 };
 681
 682 static const struct tune_params generic_tunings =
 683 {
 684   &cortexa57_extra_costs,
 685   &generic_addrcost_table,
 686   &generic_regmove_cost,
 687   &generic_vector_cost,
 688   &generic_branch_cost,
 689   &generic_approx_modes,
 690   SVE_NOT_IMPLEMENTED, /* sve_width  */
 691   4, /* memmov_cost  */
 692   2, /* issue_rate  */
 693   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 694   "8",  /* function_align.  */
 695   "4",  /* jump_align.  */
 696   "8",  /* loop_align.  */
 697   2,    /* int_reassoc_width.  */
 698   4,    /* fp_reassoc_width.  */
 699   1,    /* vec_reassoc_width.  */
 700   2,    /* min_div_recip_mul_sf.  */
 701   2,    /* min_div_recip_mul_df.  */
 702   0,    /* max_case_values.  */
 703   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 704   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 705   &generic_prefetch_tune
 706 };
 707
 708 static const struct tune_params cortexa35_tunings =
 709 {
 710   &cortexa53_extra_costs,
 711   &generic_addrcost_table,
 712   &cortexa53_regmove_cost,
 713   &generic_vector_cost,
 714   &generic_branch_cost,
 715   &generic_approx_modes,
 716   SVE_NOT_IMPLEMENTED, /* sve_width  */
 717   4, /* memmov_cost  */
 718   1, /* issue_rate  */
 719   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 720    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 721   "16", /* function_align.  */
 722   "4",  /* jump_align.  */
 723   "8",  /* loop_align.  */
 724   2,    /* int_reassoc_width.  */
 725   4,    /* fp_reassoc_width.  */
 726   1,    /* vec_reassoc_width.  */
 727   2,    /* min_div_recip_mul_sf.  */
 728   2,    /* min_div_recip_mul_df.  */
 729   0,    /* max_case_values.  */
 730   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 731   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 732   &generic_prefetch_tune
 733 };
 734
 735 static const struct tune_params cortexa53_tunings =
 736 {
 737   &cortexa53_extra_costs,
 738   &generic_addrcost_table,
 739   &cortexa53_regmove_cost,
 740   &generic_vector_cost,
 741   &generic_branch_cost,
 742   &generic_approx_modes,
 743   SVE_NOT_IMPLEMENTED, /* sve_width  */
 744   4, /* memmov_cost  */
 745   2, /* issue_rate  */
 746   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 747    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 748   "16", /* function_align.  */
 749   "4",  /* jump_align.  */
 750   "8",  /* loop_align.  */
 751   2,    /* int_reassoc_width.  */
 752   4,    /* fp_reassoc_width.  */
 753   1,    /* vec_reassoc_width.  */
 754   2,    /* min_div_recip_mul_sf.  */
 755   2,    /* min_div_recip_mul_df.  */
 756   0,    /* max_case_values.  */
 757   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 758   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 759   &generic_prefetch_tune
 760 };
 761
 762 static const struct tune_params cortexa57_tunings =
 763 {
 764   &cortexa57_extra_costs,
 765   &generic_addrcost_table,
 766   &cortexa57_regmove_cost,
 767   &cortexa57_vector_cost,
 768   &generic_branch_cost,
 769   &generic_approx_modes,
 770   SVE_NOT_IMPLEMENTED, /* sve_width  */
 771   4, /* memmov_cost  */
 772   3, /* issue_rate  */
 773   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 774    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 775   "16", /* function_align.  */
 776   "4",  /* jump_align.  */
 777   "8",  /* loop_align.  */
 778   2,    /* int_reassoc_width.  */
 779   4,    /* fp_reassoc_width.  */
 780   1,    /* vec_reassoc_width.  */
 781   2,    /* min_div_recip_mul_sf.  */
 782   2,    /* min_div_recip_mul_df.  */
 783   0,    /* max_case_values.  */
 784   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 785   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 786   &generic_prefetch_tune
 787 };
 788
 789 static const struct tune_params cortexa72_tunings =
 790 {
 791   &cortexa57_extra_costs,
 792   &generic_addrcost_table,
 793   &cortexa57_regmove_cost,
 794   &cortexa57_vector_cost,
 795   &generic_branch_cost,
 796   &generic_approx_modes,
 797   SVE_NOT_IMPLEMENTED, /* sve_width  */
 798   4, /* memmov_cost  */
 799   3, /* issue_rate  */
 800   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 801    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 802   "16", /* function_align.  */
 803   "4",  /* jump_align.  */
 804   "8",  /* loop_align.  */
 805   2,    /* int_reassoc_width.  */
 806   4,    /* fp_reassoc_width.  */
 807   1,    /* vec_reassoc_width.  */
 808   2,    /* min_div_recip_mul_sf.  */
 809   2,    /* min_div_recip_mul_df.  */
 810   0,    /* max_case_values.  */
 811   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 812   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 813   &generic_prefetch_tune
 814 };
 815
 816 static const struct tune_params cortexa73_tunings =
 817 {
 818   &cortexa57_extra_costs,
 819   &generic_addrcost_table,
 820   &cortexa57_regmove_cost,
 821   &cortexa57_vector_cost,
 822   &generic_branch_cost,
 823   &generic_approx_modes,
 824   SVE_NOT_IMPLEMENTED, /* sve_width  */
 825   4, /* memmov_cost.  */
 826   2, /* issue_rate.  */
 827   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 828    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 829   "16", /* function_align.  */
 830   "4",  /* jump_align.  */
 831   "8",  /* loop_align.  */
 832   2,    /* int_reassoc_width.  */
 833   4,    /* fp_reassoc_width.  */
 834   1,    /* vec_reassoc_width.  */
 835   2,    /* min_div_recip_mul_sf.  */
 836   2,    /* min_div_recip_mul_df.  */
 837   0,    /* max_case_values.  */
 838   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 839   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 840   &generic_prefetch_tune
 841 };
 842
 843
 844
 845 static const struct tune_params exynosm1_tunings =
 846 {
 847   &exynosm1_extra_costs,
 848   &exynosm1_addrcost_table,
 849   &exynosm1_regmove_cost,
 850   &exynosm1_vector_cost,
 851   &generic_branch_cost,
 852   &exynosm1_approx_modes,
 853   SVE_NOT_IMPLEMENTED, /* sve_width  */
 854   4,    /* memmov_cost  */
 855   3,    /* issue_rate  */
 856   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 857   "4",  /* function_align.  */
 858   "4",  /* jump_align.  */
 859   "4",  /* loop_align.  */
 860   2,    /* int_reassoc_width.  */
 861   4,    /* fp_reassoc_width.  */
 862   1,    /* vec_reassoc_width.  */
 863   2,    /* min_div_recip_mul_sf.  */
 864   2,    /* min_div_recip_mul_df.  */
 865   48,   /* max_case_values.  */
 866   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 867   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 868   &exynosm1_prefetch_tune
 869 };
 870
 871 static const struct tune_params thunderxt88_tunings =
 872 {
 873   &thunderx_extra_costs,
 874   &generic_addrcost_table,
 875   &thunderx_regmove_cost,
 876   &thunderx_vector_cost,
 877   &generic_branch_cost,
 878   &generic_approx_modes,
 879   SVE_NOT_IMPLEMENTED, /* sve_width  */
 880   6, /* memmov_cost  */
 881   2, /* issue_rate  */
 882   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 883   "8",  /* function_align.  */
 884   "8",  /* jump_align.  */
 885   "8",  /* loop_align.  */
 886   2,    /* int_reassoc_width.  */
 887   4,    /* fp_reassoc_width.  */
 888   1,    /* vec_reassoc_width.  */
 889   2,    /* min_div_recip_mul_sf.  */
 890   2,    /* min_div_recip_mul_df.  */
 891   0,    /* max_case_values.  */
 892   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 893   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 894   &thunderxt88_prefetch_tune
 895 };
 896
 897 static const struct tune_params thunderx_tunings =
 898 {
 899   &thunderx_extra_costs,
 900   &generic_addrcost_table,
 901   &thunderx_regmove_cost,
 902   &thunderx_vector_cost,
 903   &generic_branch_cost,
 904   &generic_approx_modes,
 905   SVE_NOT_IMPLEMENTED, /* sve_width  */
 906   6, /* memmov_cost  */
 907   2, /* issue_rate  */
 908   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 909   "8",  /* function_align.  */
 910   "8",  /* jump_align.  */
 911   "8",  /* loop_align.  */
 912   2,    /* int_reassoc_width.  */
 913   4,    /* fp_reassoc_width.  */
 914   1,    /* vec_reassoc_width.  */
 915   2,    /* min_div_recip_mul_sf.  */
 916   2,    /* min_div_recip_mul_df.  */
 917   0,    /* max_case_values.  */
 918   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 919   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 920    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 921   &thunderx_prefetch_tune
 922 };
 923
 924 static const struct tune_params tsv110_tunings =
 925 {
 926   &tsv110_extra_costs,
 927   &tsv110_addrcost_table,
 928   &tsv110_regmove_cost,
 929   &tsv110_vector_cost,
 930   &generic_branch_cost,
 931   &generic_approx_modes,
 932   SVE_NOT_IMPLEMENTED, /* sve_width  */
 933   4,    /* memmov_cost  */
 934   4,    /* issue_rate  */
 935   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
 936    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 937   "16", /* function_align.  */
 938   "4",  /* jump_align.  */
 939   "8",  /* loop_align.  */
 940   2,    /* int_reassoc_width.  */
 941   4,    /* fp_reassoc_width.  */
 942   1,    /* vec_reassoc_width.  */
 943   2,    /* min_div_recip_mul_sf.  */
 944   2,    /* min_div_recip_mul_df.  */
 945   0,    /* max_case_values.  */
 946   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 947   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
 948   &tsv110_prefetch_tune
 949 };
 950
 951 static const struct tune_params xgene1_tunings =
 952 {
 953   &xgene1_extra_costs,
 954   &xgene1_addrcost_table,
 955   &xgene1_regmove_cost,
 956   &xgene1_vector_cost,
 957   &generic_branch_cost,
 958   &xgene1_approx_modes,
 959   SVE_NOT_IMPLEMENTED, /* sve_width  */
 960   6, /* memmov_cost  */
 961   4, /* issue_rate  */
 962   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 963   "16", /* function_align.  */
 964   "16", /* jump_align.  */
 965   "16", /* loop_align.  */
 966   2,    /* int_reassoc_width.  */
 967   4,    /* fp_reassoc_width.  */
 968   1,    /* vec_reassoc_width.  */
 969   2,    /* min_div_recip_mul_sf.  */
 970   2,    /* min_div_recip_mul_df.  */
 971   17,   /* max_case_values.  */
 972   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 973   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
 974   &xgene1_prefetch_tune
 975 };
 976
 977 static const struct tune_params emag_tunings =
 978 {
 979   &xgene1_extra_costs,
 980   &xgene1_addrcost_table,
 981   &xgene1_regmove_cost,
 982   &xgene1_vector_cost,
 983   &generic_branch_cost,
 984   &xgene1_approx_modes,
 985   SVE_NOT_IMPLEMENTED,
 986   6, /* memmov_cost  */
 987   4, /* issue_rate  */
 988   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 989   "16", /* function_align.  */
 990   "16", /* jump_align.  */
 991   "16", /* loop_align.  */
 992   2,    /* int_reassoc_width.  */
 993   4,    /* fp_reassoc_width.  */
 994   1,    /* vec_reassoc_width.  */
 995   2,    /* min_div_recip_mul_sf.  */
 996   2,    /* min_div_recip_mul_df.  */
 997   17,   /* max_case_values.  */
 998   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 999   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
1000   &xgene1_prefetch_tune
1001 };
1002
1003 static const struct tune_params qdf24xx_tunings =
1004 {
1005   &qdf24xx_extra_costs,
1006   &qdf24xx_addrcost_table,
1007   &qdf24xx_regmove_cost,
1008   &qdf24xx_vector_cost,
1009   &generic_branch_cost,
1010   &generic_approx_modes,
1011   SVE_NOT_IMPLEMENTED, /* sve_width  */
1012   4, /* memmov_cost  */
1013   4, /* issue_rate  */
1014   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1015    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1016   "16", /* function_align.  */
1017   "8",  /* jump_align.  */
1018   "16", /* loop_align.  */
1019   2,    /* int_reassoc_width.  */
1020   4,    /* fp_reassoc_width.  */
1021   1,    /* vec_reassoc_width.  */
1022   2,    /* min_div_recip_mul_sf.  */
1023   2,    /* min_div_recip_mul_df.  */
1024   0,    /* max_case_values.  */
1025   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1026   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1027   &qdf24xx_prefetch_tune
1028 };
1029
1030 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
1031    for now.  */
1032 static const struct tune_params saphira_tunings =
1033 {
1034   &generic_extra_costs,
1035   &generic_addrcost_table,
1036   &generic_regmove_cost,
1037   &generic_vector_cost,
1038   &generic_branch_cost,
1039   &generic_approx_modes,
1040   SVE_NOT_IMPLEMENTED, /* sve_width  */
1041   4, /* memmov_cost  */
1042   4, /* issue_rate  */
1043   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1045   "16", /* function_align.  */
1046   "8",  /* jump_align.  */
1047   "16", /* loop_align.  */
1048   2,    /* int_reassoc_width.  */
1049   4,    /* fp_reassoc_width.  */
1050   1,    /* vec_reassoc_width.  */
1051   2,    /* min_div_recip_mul_sf.  */
1052   2,    /* min_div_recip_mul_df.  */
1053   0,    /* max_case_values.  */
1054   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1055   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
1056   &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params thunderx2t99_tunings =
1060 {
1061   &thunderx2t99_extra_costs,
1062   &thunderx2t99_addrcost_table,
1063   &thunderx2t99_regmove_cost,
1064   &thunderx2t99_vector_cost,
1065   &generic_branch_cost,
1066   &generic_approx_modes,
1067   SVE_NOT_IMPLEMENTED, /* sve_width  */
1068   4, /* memmov_cost.  */
1069   4, /* issue_rate.  */
1070   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1071    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
1072   "16", /* function_align.  */
1073   "8",  /* jump_align.  */
1074   "16", /* loop_align.  */
1075   3,    /* int_reassoc_width.  */
1076   2,    /* fp_reassoc_width.  */
1077   2,    /* vec_reassoc_width.  */
1078   2,    /* min_div_recip_mul_sf.  */
1079   2,    /* min_div_recip_mul_df.  */
1080   0,    /* max_case_values.  */
1081   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1082   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1083   &thunderx2t99_prefetch_tune
1084 };
1085
1086 /* Support for fine-grained override of the tuning structures.  */
1087 struct aarch64_tuning_override_function
1088 {
1089   const char* name;
1090   void (*parse_override)(const char*, struct tune_params*);
1091 };
1092
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1096
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions[] =
1099 {
1100   { "fuse", aarch64_parse_fuse_string },
1101   { "tune", aarch64_parse_tune_string },
1102   { "sve_width", aarch64_parse_sve_width_string },
1103   { NULL, NULL }
1104 };
1105
1106 /* A processor implementing AArch64.  */
1107 struct processor
1108 {
1109   const char *const name;
1110   enum aarch64_processor ident;
1111   enum aarch64_processor sched_core;
1112   enum aarch64_arch arch;
1113   unsigned architecture_version;
1114   const unsigned long flags;
1115   const struct tune_params *const tune;
1116 };
1117
1118 /* Architectures implementing AArch64.  */
1119 static const struct processor all_architectures[] =
1120 {
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1125 };
1126
1127 /* Processor cores implementing AArch64.  */
1128 static const struct processor all_cores[] =
1129 {
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
1132   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
1133   FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1136     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1137   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1138 };
1139
1140
1141 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1142    handling code or by target attributes.  */
1143 static const struct processor *selected_arch;
1144 static const struct processor *selected_cpu;
1145 static const struct processor *selected_tune;
1146
1147 /* The current tuning set.  */
1148 struct tune_params aarch64_tune_params = generic_tunings;
1149
1150 /* Table of machine attributes.  */
1151 static const struct attribute_spec aarch64_attribute_table[] =
1152 {
1153   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154        affects_type_identity, handler, exclude } */
1155   { "aarch64_vector_pcs", 0, 0, false, true,  true,  false, NULL, NULL },
1156   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1157 };
1158
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1160
1161 /* An ISA extension in the co-processor and main instruction set space.  */
1162 struct aarch64_option_extension
1163 {
1164   const char *const name;
1165   const unsigned long flags_on;
1166   const unsigned long flags_off;
1167 };
1168
1169 typedef enum aarch64_cond_code
1170 {
1171   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1172   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1173   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1174 }
1175 aarch64_cc;
1176
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1178
1179 struct aarch64_branch_protect_type
1180 {
1181   /* The type's name that the user passes to the branch-protection option
1182     string.  */
1183   const char* name;
1184   /* Function to handle the protection type and set global variables.
1185     First argument is the string token corresponding with this type and the
1186     second argument is the next token in the option string.
1187     Return values:
1188     * AARCH64_PARSE_OK: Handling was sucessful.
1189     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1190       should print an error.
1191     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1192       own error.  */
1193   enum aarch64_parse_opt_result (*handler)(char*, char*);
1194   /* A list of types that can follow this type in the option string.  */
1195   const aarch64_branch_protect_type* subtypes;
1196   unsigned int num_subtypes;
1197 };
1198
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str, char* rest)
1201 {
1202   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1203   aarch64_enable_bti = 0;
1204   if (rest)
1205     {
1206       error ("unexpected %<%s%> after %<%s%>", rest, str);
1207       return AARCH64_PARSE_INVALID_FEATURE;
1208     }
1209   return AARCH64_PARSE_OK;
1210 }
1211
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str, char* rest)
1214 {
1215   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1216   aarch64_enable_bti = 1;
1217   if (rest)
1218     {
1219       error ("unexpected %<%s%> after %<%s%>", rest, str);
1220       return AARCH64_PARSE_INVALID_FEATURE;
1221     }
1222   return AARCH64_PARSE_OK;
1223 }
1224
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1227                                     char* rest ATTRIBUTE_UNUSED)
1228 {
1229   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1230   return AARCH64_PARSE_OK;
1231 }
1232
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1235                               char* rest ATTRIBUTE_UNUSED)
1236 {
1237   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1238   return AARCH64_PARSE_OK;
1239 }
1240
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1243                                     char* rest ATTRIBUTE_UNUSED)
1244 {
1245   aarch64_enable_bti = 1;
1246   return AARCH64_PARSE_OK;
1247 }
1248
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1250   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1251   { NULL, NULL, NULL, 0 }
1252 };
1253
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1255   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1256   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1257   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1258     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1259   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1260   { NULL, NULL, NULL, 0 }
1261 };
1262
1263 /* The condition codes of the processor, and the inverse function.  */
1264 static const char * const aarch64_condition_codes[] =
1265 {
1266   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1267   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1268 };
1269
1270 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1271 const char *
1272 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1273                         const char * branch_format)
1274 {
1275     rtx_code_label * tmp_label = gen_label_rtx ();
1276     char label_buf[256];
1277     char buffer[128];
1278     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1279                                  CODE_LABEL_NUMBER (tmp_label));
1280     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1281     rtx dest_label = operands[pos_label];
1282     operands[pos_label] = tmp_label;
1283
1284     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1285     output_asm_insn (buffer, operands);
1286
1287     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1288     operands[pos_label] = dest_label;
1289     output_asm_insn (buffer, operands);
1290     return "";
1291 }
1292
1293 void
1294 aarch64_err_no_fpadvsimd (machine_mode mode)
1295 {
1296   if (TARGET_GENERAL_REGS_ONLY)
1297     if (FLOAT_MODE_P (mode))
1298       error ("%qs is incompatible with the use of floating-point types",
1299              "-mgeneral-regs-only");
1300     else
1301       error ("%qs is incompatible with the use of vector types",
1302              "-mgeneral-regs-only");
1303   else
1304     if (FLOAT_MODE_P (mode))
1305       error ("%qs feature modifier is incompatible with the use of"
1306              " floating-point types", "+nofp");
1307     else
1308       error ("%qs feature modifier is incompatible with the use of"
1309              " vector types", "+nofp");
1310 }
1311
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316    and GENERAL_REGS is lower than the memory cost (in this case the best class
1317    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
1318    cost results in bad allocations with many redundant int<->FP moves which
1319    are expensive on various cores.
1320    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1322    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1323    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1324    The result of this is that it is no longer inefficient to have a higher
1325    memory move cost than the register move cost.
1326 */
1327
1328 static reg_class_t
1329 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1330                                          reg_class_t best_class)
1331 {
1332   machine_mode mode;
1333
1334   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1335       || !reg_class_subset_p (FP_REGS, allocno_class))
1336     return allocno_class;
1337
1338   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1339       || !reg_class_subset_p (FP_REGS, best_class))
1340     return best_class;
1341
1342   mode = PSEUDO_REGNO_MODE (regno);
1343   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1344 }
1345
1346 static unsigned int
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1348 {
1349   if (GET_MODE_UNIT_SIZE (mode) == 4)
1350     return aarch64_tune_params.min_div_recip_mul_sf;
1351   return aarch64_tune_params.min_div_recip_mul_df;
1352 }
1353
1354 /* Return the reassociation width of treeop OPC with mode MODE.  */
1355 static int
1356 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1357 {
1358   if (VECTOR_MODE_P (mode))
1359     return aarch64_tune_params.vec_reassoc_width;
1360   if (INTEGRAL_MODE_P (mode))
1361     return aarch64_tune_params.int_reassoc_width;
1362   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1363   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1364     return aarch64_tune_params.fp_reassoc_width;
1365   return 1;
1366 }
1367
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1369 unsigned
1370 aarch64_dbx_register_number (unsigned regno)
1371 {
1372    if (GP_REGNUM_P (regno))
1373      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1374    else if (regno == SP_REGNUM)
1375      return AARCH64_DWARF_SP;
1376    else if (FP_REGNUM_P (regno))
1377      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1378    else if (PR_REGNUM_P (regno))
1379      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1380    else if (regno == VG_REGNUM)
1381      return AARCH64_DWARF_VG;
1382
1383    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384       equivalent DWARF register.  */
1385    return DWARF_FRAME_REGISTERS;
1386 }
1387
1388 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1389 static bool
1390 aarch64_advsimd_struct_mode_p (machine_mode mode)
1391 {
1392   return (TARGET_SIMD
1393           && (mode == OImode || mode == CImode || mode == XImode));
1394 }
1395
1396 /* Return true if MODE is an SVE predicate mode.  */
1397 static bool
1398 aarch64_sve_pred_mode_p (machine_mode mode)
1399 {
1400   return (TARGET_SVE
1401           && (mode == VNx16BImode
1402               || mode == VNx8BImode
1403               || mode == VNx4BImode
1404               || mode == VNx2BImode));
1405 }
1406
1407 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1408 const unsigned int VEC_ADVSIMD  = 1;
1409 const unsigned int VEC_SVE_DATA = 2;
1410 const unsigned int VEC_SVE_PRED = 4;
1411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1412    a structure of 2, 3 or 4 vectors.  */
1413 const unsigned int VEC_STRUCT   = 8;
1414 /* Useful combinations of the above.  */
1415 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1416 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1417
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419    Ignore modes that are not supported by the current target.  */
1420 static unsigned int
1421 aarch64_classify_vector_mode (machine_mode mode)
1422 {
1423   if (aarch64_advsimd_struct_mode_p (mode))
1424     return VEC_ADVSIMD | VEC_STRUCT;
1425
1426   if (aarch64_sve_pred_mode_p (mode))
1427     return VEC_SVE_PRED;
1428
1429   scalar_mode inner = GET_MODE_INNER (mode);
1430   if (VECTOR_MODE_P (mode)
1431       && (inner == QImode
1432           || inner == HImode
1433           || inner == HFmode
1434           || inner == SImode
1435           || inner == SFmode
1436           || inner == DImode
1437           || inner == DFmode))
1438     {
1439       if (TARGET_SVE)
1440         {
1441           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1442             return VEC_SVE_DATA;
1443           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1444               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1445               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1446             return VEC_SVE_DATA | VEC_STRUCT;
1447         }
1448
1449       /* This includes V1DF but not V1DI (which doesn't exist).  */
1450       if (TARGET_SIMD
1451           && (known_eq (GET_MODE_BITSIZE (mode), 64)
1452               || known_eq (GET_MODE_BITSIZE (mode), 128)))
1453         return VEC_ADVSIMD;
1454     }
1455
1456   return 0;
1457 }
1458
1459 /* Return true if MODE is any of the data vector modes, including
1460    structure modes.  */
1461 static bool
1462 aarch64_vector_data_mode_p (machine_mode mode)
1463 {
1464   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1465 }
1466
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468    or a structure of vectors.  */
1469 static bool
1470 aarch64_sve_data_mode_p (machine_mode mode)
1471 {
1472   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1473 }
1474
1475 /* Implement target hook TARGET_ARRAY_MODE.  */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1478 {
1479   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1480       && IN_RANGE (nelems, 2, 4))
1481     return mode_for_vector (GET_MODE_INNER (mode),
1482                             GET_MODE_NUNITS (mode) * nelems);
1483
1484   return opt_machine_mode ();
1485 }
1486
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1488 static bool
1489 aarch64_array_mode_supported_p (machine_mode mode,
1490                                 unsigned HOST_WIDE_INT nelems)
1491 {
1492   if (TARGET_SIMD
1493       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1494           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1495       && (nelems >= 2 && nelems <= 4))
1496     return true;
1497
1498   return false;
1499 }
1500
1501 /* Return the SVE predicate mode to use for elements that have
1502    ELEM_NBYTES bytes, if such a mode exists.  */
1503
1504 opt_machine_mode
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1506 {
1507   if (TARGET_SVE)
1508     {
1509       if (elem_nbytes == 1)
1510         return VNx16BImode;
1511       if (elem_nbytes == 2)
1512         return VNx8BImode;
1513       if (elem_nbytes == 4)
1514         return VNx4BImode;
1515       if (elem_nbytes == 8)
1516         return VNx2BImode;
1517     }
1518   return opt_machine_mode ();
1519 }
1520
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1522
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1525 {
1526   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1527     {
1528       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1529       machine_mode pred_mode;
1530       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1531         return pred_mode;
1532     }
1533
1534   return default_get_mask_mode (nunits, nbytes);
1535 }
1536
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
1538    prefer to use the first arithmetic operand as the else value if
1539    the else value doesn't matter, since that exactly matches the SVE
1540    destructive merging form.  For ternary operations we could either
1541    pick the first operand and use FMAD-like instructions or the last
1542    operand and use FMLA-like instructions; the latter seems more
1543    natural.  */
1544
1545 static tree
1546 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1547 {
1548   return nops == 3 ? ops[2] : ops[0];
1549 }
1550
1551 /* Implement TARGET_HARD_REGNO_NREGS.  */
1552
1553 static unsigned int
1554 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1555 {
1556   /* ??? Logically we should only need to provide a value when
1557      HARD_REGNO_MODE_OK says that the combination is valid,
1558      but at the moment we need to handle all modes.  Just ignore
1559      any runtime parts for registers that can't store them.  */
1560   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1561   switch (aarch64_regno_regclass (regno))
1562     {
1563     case FP_REGS:
1564     case FP_LO_REGS:
1565       if (aarch64_sve_data_mode_p (mode))
1566         return exact_div (GET_MODE_SIZE (mode),
1567                           BYTES_PER_SVE_VECTOR).to_constant ();
1568       return CEIL (lowest_size, UNITS_PER_VREG);
1569     case PR_REGS:
1570     case PR_LO_REGS:
1571     case PR_HI_REGS:
1572       return 1;
1573     default:
1574       return CEIL (lowest_size, UNITS_PER_WORD);
1575     }
1576   gcc_unreachable ();
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1580
1581 static bool
1582 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1583 {
1584   if (GET_MODE_CLASS (mode) == MODE_CC)
1585     return regno == CC_REGNUM;
1586
1587   if (regno == VG_REGNUM)
1588     /* This must have the same size as _Unwind_Word.  */
1589     return mode == DImode;
1590
1591   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1592   if (vec_flags & VEC_SVE_PRED)
1593     return PR_REGNUM_P (regno);
1594
1595   if (PR_REGNUM_P (regno))
1596     return 0;
1597
1598   if (regno == SP_REGNUM)
1599     /* The purpose of comparing with ptr_mode is to support the
1600        global register variable associated with the stack pointer
1601        register via the syntax of asm ("wsp") in ILP32.  */
1602     return mode == Pmode || mode == ptr_mode;
1603
1604   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1605     return mode == Pmode;
1606
1607   if (GP_REGNUM_P (regno))
1608     {
1609       if (known_le (GET_MODE_SIZE (mode), 8))
1610         return true;
1611       else if (known_le (GET_MODE_SIZE (mode), 16))
1612         return (regno & 1) == 0;
1613     }
1614   else if (FP_REGNUM_P (regno))
1615     {
1616       if (vec_flags & VEC_STRUCT)
1617         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1618       else
1619         return !VECTOR_MODE_P (mode) || vec_flags != 0;
1620     }
1621
1622   return false;
1623 }
1624
1625 /* Return true if this is a definition of a vectorized simd function.  */
1626
1627 static bool
1628 aarch64_simd_decl_p (tree fndecl)
1629 {
1630   tree fntype;
1631
1632   if (fndecl == NULL)
1633     return false;
1634   fntype = TREE_TYPE (fndecl);
1635   if (fntype == NULL)
1636     return false;
1637
1638   /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
1639   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1640     return true;
1641
1642   return false;
1643 }
1644
1645 /* Return the mode a register save/restore should use.  DImode for integer
1646    registers, DFmode for FP registers in non-SIMD functions (they only save
1647    the bottom half of a 128 bit register), or TFmode for FP registers in
1648    SIMD functions.  */
1649
1650 static machine_mode
1651 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1652 {
1653   return GP_REGNUM_P (regno)
1654            ? E_DImode
1655            : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1656 }
1657
1658 /* Return true if the instruction is a call to a SIMD function, false
1659    if it is not a SIMD function or if we do not know anything about
1660    the function.  */
1661
1662 static bool
1663 aarch64_simd_call_p (rtx_insn *insn)
1664 {
1665   rtx symbol;
1666   rtx call;
1667   tree fndecl;
1668
1669   gcc_assert (CALL_P (insn));
1670   call = get_call_rtx_from (insn);
1671   symbol = XEXP (XEXP (call, 0), 0);
1672   if (GET_CODE (symbol) != SYMBOL_REF)
1673     return false;
1674   fndecl = SYMBOL_REF_DECL (symbol);
1675   if (!fndecl)
1676     return false;
1677
1678   return aarch64_simd_decl_p (fndecl);
1679 }
1680
1681 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS.  If INSN calls
1682    a function that uses the SIMD ABI, take advantage of the extra
1683    call-preserved registers that the ABI provides.  */
1684
1685 void
1686 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1687                                           HARD_REG_SET *return_set)
1688 {
1689   if (aarch64_simd_call_p (insn))
1690     {
1691       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1692         if (FP_SIMD_SAVED_REGNUM_P (regno))
1693           CLEAR_HARD_REG_BIT (*return_set, regno);
1694     }
1695 }
1696
1697 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1698    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1699    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1700
1701 static bool
1702 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1703 {
1704   return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1705 }
1706
1707 /* Implement REGMODE_NATURAL_SIZE.  */
1708 poly_uint64
1709 aarch64_regmode_natural_size (machine_mode mode)
1710 {
1711   /* The natural size for SVE data modes is one SVE data vector,
1712      and similarly for predicates.  We can't independently modify
1713      anything smaller than that.  */
1714   /* ??? For now, only do this for variable-width SVE registers.
1715      Doing it for constant-sized registers breaks lower-subreg.c.  */
1716   /* ??? And once that's fixed, we should probably have similar
1717      code for Advanced SIMD.  */
1718   if (!aarch64_sve_vg.is_constant ())
1719     {
1720       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1721       if (vec_flags & VEC_SVE_PRED)
1722         return BYTES_PER_SVE_PRED;
1723       if (vec_flags & VEC_SVE_DATA)
1724         return BYTES_PER_SVE_VECTOR;
1725     }
1726   return UNITS_PER_WORD;
1727 }
1728
1729 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1730 machine_mode
1731 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1732                                      machine_mode mode)
1733 {
1734   /* The predicate mode determines which bits are significant and
1735      which are "don't care".  Decreasing the number of lanes would
1736      lose data while increasing the number of lanes would make bits
1737      unnecessarily significant.  */
1738   if (PR_REGNUM_P (regno))
1739     return mode;
1740   if (known_ge (GET_MODE_SIZE (mode), 4))
1741     return mode;
1742   else
1743     return SImode;
1744 }
1745
1746 /* Return true if I's bits are consecutive ones from the MSB.  */
1747 bool
1748 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1749 {
1750   return exact_log2 (-i) != HOST_WIDE_INT_M1;
1751 }
1752
1753 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1754    that strcpy from constants will be faster.  */
1755
1756 static HOST_WIDE_INT
1757 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1758 {
1759   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1760     return MAX (align, BITS_PER_WORD);
1761   return align;
1762 }
1763
1764 /* Return true if calls to DECL should be treated as
1765    long-calls (ie called via a register).  */
1766 static bool
1767 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1768 {
1769   return false;
1770 }
1771
1772 /* Return true if calls to symbol-ref SYM should be treated as
1773    long-calls (ie called via a register).  */
1774 bool
1775 aarch64_is_long_call_p (rtx sym)
1776 {
1777   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1778 }
1779
1780 /* Return true if calls to symbol-ref SYM should not go through
1781    plt stubs.  */
1782
1783 bool
1784 aarch64_is_noplt_call_p (rtx sym)
1785 {
1786   const_tree decl = SYMBOL_REF_DECL (sym);
1787
1788   if (flag_pic
1789       && decl
1790       && (!flag_plt
1791           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1792       && !targetm.binds_local_p (decl))
1793     return true;
1794
1795   return false;
1796 }
1797
1798 /* Return true if the offsets to a zero/sign-extract operation
1799    represent an expression that matches an extend operation.  The
1800    operands represent the paramters from
1801
1802    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1803 bool
1804 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1805                                 rtx extract_imm)
1806 {
1807   HOST_WIDE_INT mult_val, extract_val;
1808
1809   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1810     return false;
1811
1812   mult_val = INTVAL (mult_imm);
1813   extract_val = INTVAL (extract_imm);
1814
1815   if (extract_val > 8
1816       && extract_val < GET_MODE_BITSIZE (mode)
1817       && exact_log2 (extract_val & ~7) > 0
1818       && (extract_val & 7) <= 4
1819       && mult_val == (1 << (extract_val & 7)))
1820     return true;
1821
1822   return false;
1823 }
1824
1825 /* Emit an insn that's a simple single-set.  Both the operands must be
1826    known to be valid.  */
1827 inline static rtx_insn *
1828 emit_set_insn (rtx x, rtx y)
1829 {
1830   return emit_insn (gen_rtx_SET (x, y));
1831 }
1832
1833 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1834    return the rtx for register 0 in the proper mode.  */
1835 rtx
1836 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1837 {
1838   machine_mode mode = SELECT_CC_MODE (code, x, y);
1839   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1840
1841   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1842   return cc_reg;
1843 }
1844
1845 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
1846
1847 static rtx
1848 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1849                                   machine_mode y_mode)
1850 {
1851   if (y_mode == E_QImode || y_mode == E_HImode)
1852     {
1853       if (CONST_INT_P (y))
1854         y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1855       else
1856         {
1857           rtx t, cc_reg;
1858           machine_mode cc_mode;
1859
1860           t = gen_rtx_ZERO_EXTEND (SImode, y);
1861           t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1862           cc_mode = CC_SWPmode;
1863           cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1864           emit_set_insn (cc_reg, t);
1865           return cc_reg;
1866         }
1867     }
1868
1869   return aarch64_gen_compare_reg (code, x, y);
1870 }
1871
1872 /* Build the SYMBOL_REF for __tls_get_addr.  */
1873
1874 static GTY(()) rtx tls_get_addr_libfunc;
1875
1876 rtx
1877 aarch64_tls_get_addr (void)
1878 {
1879   if (!tls_get_addr_libfunc)
1880     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1881   return tls_get_addr_libfunc;
1882 }
1883
1884 /* Return the TLS model to use for ADDR.  */
1885
1886 static enum tls_model
1887 tls_symbolic_operand_type (rtx addr)
1888 {
1889   enum tls_model tls_kind = TLS_MODEL_NONE;
1890   if (GET_CODE (addr) == CONST)
1891     {
1892       poly_int64 addend;
1893       rtx sym = strip_offset (addr, &addend);
1894       if (GET_CODE (sym) == SYMBOL_REF)
1895         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1896     }
1897   else if (GET_CODE (addr) == SYMBOL_REF)
1898     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1899
1900   return tls_kind;
1901 }
1902
1903 /* We'll allow lo_sum's in addresses in our legitimate addresses
1904    so that combine would take care of combining addresses where
1905    necessary, but for generation purposes, we'll generate the address
1906    as :
1907    RTL                               Absolute
1908    tmp = hi (symbol_ref);            adrp  x1, foo
1909    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1910                                      nop
1911
1912    PIC                               TLS
1913    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1914    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1915                                      bl   __tls_get_addr
1916                                      nop
1917
1918    Load TLS symbol, depending on TLS mechanism and TLS access model.
1919
1920    Global Dynamic - Traditional TLS:
1921    adrp tmp, :tlsgd:imm
1922    add  dest, tmp, #:tlsgd_lo12:imm
1923    bl   __tls_get_addr
1924
1925    Global Dynamic - TLS Descriptors:
1926    adrp dest, :tlsdesc:imm
1927    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1928    add  dest, dest, #:tlsdesc_lo12:imm
1929    blr  tmp
1930    mrs  tp, tpidr_el0
1931    add  dest, dest, tp
1932
1933    Initial Exec:
1934    mrs  tp, tpidr_el0
1935    adrp tmp, :gottprel:imm
1936    ldr  dest, [tmp, #:gottprel_lo12:imm]
1937    add  dest, dest, tp
1938
1939    Local Exec:
1940    mrs  tp, tpidr_el0
1941    add  t0, tp, #:tprel_hi12:imm, lsl #12
1942    add  t0, t0, #:tprel_lo12_nc:imm
1943 */
1944
1945 static void
1946 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1947                                    enum aarch64_symbol_type type)
1948 {
1949   switch (type)
1950     {
1951     case SYMBOL_SMALL_ABSOLUTE:
1952       {
1953         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1954         rtx tmp_reg = dest;
1955         machine_mode mode = GET_MODE (dest);
1956
1957         gcc_assert (mode == Pmode || mode == ptr_mode);
1958
1959         if (can_create_pseudo_p ())
1960           tmp_reg = gen_reg_rtx (mode);
1961
1962         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1963         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1964         return;
1965       }
1966
1967     case SYMBOL_TINY_ABSOLUTE:
1968       emit_insn (gen_rtx_SET (dest, imm));
1969       return;
1970
1971     case SYMBOL_SMALL_GOT_28K:
1972       {
1973         machine_mode mode = GET_MODE (dest);
1974         rtx gp_rtx = pic_offset_table_rtx;
1975         rtx insn;
1976         rtx mem;
1977
1978         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1979            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1980            decide rtx costs, in which case pic_offset_table_rtx is not
1981            initialized.  For that case no need to generate the first adrp
1982            instruction as the final cost for global variable access is
1983            one instruction.  */
1984         if (gp_rtx != NULL)
1985           {
1986             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1987                using the page base as GOT base, the first page may be wasted,
1988                in the worst scenario, there is only 28K space for GOT).
1989
1990                The generate instruction sequence for accessing global variable
1991                is:
1992
1993                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1994
1995                Only one instruction needed. But we must initialize
1996                pic_offset_table_rtx properly.  We generate initialize insn for
1997                every global access, and allow CSE to remove all redundant.
1998
1999                The final instruction sequences will look like the following
2000                for multiply global variables access.
2001
2002                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2003
2004                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2005                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2006                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2007                  ...  */
2008
2009             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2010             crtl->uses_pic_offset_table = 1;
2011             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2012
2013             if (mode != GET_MODE (gp_rtx))
2014              gp_rtx = gen_lowpart (mode, gp_rtx);
2015
2016           }
2017
2018         if (mode == ptr_mode)
2019           {
2020             if (mode == DImode)
2021               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2022             else
2023               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2024
2025             mem = XVECEXP (SET_SRC (insn), 0, 0);
2026           }
2027         else
2028           {
2029             gcc_assert (mode == Pmode);
2030
2031             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2032             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2033           }
2034
2035         /* The operand is expected to be MEM.  Whenever the related insn
2036            pattern changed, above code which calculate mem should be
2037            updated.  */
2038         gcc_assert (GET_CODE (mem) == MEM);
2039         MEM_READONLY_P (mem) = 1;
2040         MEM_NOTRAP_P (mem) = 1;
2041         emit_insn (insn);
2042         return;
2043       }
2044
2045     case SYMBOL_SMALL_GOT_4G:
2046       {
2047         /* In ILP32, the mode of dest can be either SImode or DImode,
2048            while the got entry is always of SImode size.  The mode of
2049            dest depends on how dest is used: if dest is assigned to a
2050            pointer (e.g. in the memory), it has SImode; it may have
2051            DImode if dest is dereferenced to access the memeory.
2052            This is why we have to handle three different ldr_got_small
2053            patterns here (two patterns for ILP32).  */
2054
2055         rtx insn;
2056         rtx mem;
2057         rtx tmp_reg = dest;
2058         machine_mode mode = GET_MODE (dest);
2059
2060         if (can_create_pseudo_p ())
2061           tmp_reg = gen_reg_rtx (mode);
2062
2063         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2064         if (mode == ptr_mode)
2065           {
2066             if (mode == DImode)
2067               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2068             else
2069               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2070
2071             mem = XVECEXP (SET_SRC (insn), 0, 0);
2072           }
2073         else
2074           {
2075             gcc_assert (mode == Pmode);
2076
2077             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2078             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2079           }
2080
2081         gcc_assert (GET_CODE (mem) == MEM);
2082         MEM_READONLY_P (mem) = 1;
2083         MEM_NOTRAP_P (mem) = 1;
2084         emit_insn (insn);
2085         return;
2086       }
2087
2088     case SYMBOL_SMALL_TLSGD:
2089       {
2090         rtx_insn *insns;
2091         machine_mode mode = GET_MODE (dest);
2092         rtx result = gen_rtx_REG (mode, R0_REGNUM);
2093
2094         start_sequence ();
2095         if (TARGET_ILP32)
2096           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2097         else
2098           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2099         insns = get_insns ();
2100         end_sequence ();
2101
2102         RTL_CONST_CALL_P (insns) = 1;
2103         emit_libcall_block (insns, dest, result, imm);
2104         return;
2105       }
2106
2107     case SYMBOL_SMALL_TLSDESC:
2108       {
2109         machine_mode mode = GET_MODE (dest);
2110         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2111         rtx tp;
2112
2113         gcc_assert (mode == Pmode || mode == ptr_mode);
2114
2115         /* In ILP32, the got entry is always of SImode size.  Unlike
2116            small GOT, the dest is fixed at reg 0.  */
2117         if (TARGET_ILP32)
2118           emit_insn (gen_tlsdesc_small_si (imm));
2119         else
2120           emit_insn (gen_tlsdesc_small_di (imm));
2121         tp = aarch64_load_tp (NULL);
2122
2123         if (mode != Pmode)
2124           tp = gen_lowpart (mode, tp);
2125
2126         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2127         if (REG_P (dest))
2128           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2129         return;
2130       }
2131
2132     case SYMBOL_SMALL_TLSIE:
2133       {
2134         /* In ILP32, the mode of dest can be either SImode or DImode,
2135            while the got entry is always of SImode size.  The mode of
2136            dest depends on how dest is used: if dest is assigned to a
2137            pointer (e.g. in the memory), it has SImode; it may have
2138            DImode if dest is dereferenced to access the memeory.
2139            This is why we have to handle three different tlsie_small
2140            patterns here (two patterns for ILP32).  */
2141         machine_mode mode = GET_MODE (dest);
2142         rtx tmp_reg = gen_reg_rtx (mode);
2143         rtx tp = aarch64_load_tp (NULL);
2144
2145         if (mode == ptr_mode)
2146           {
2147             if (mode == DImode)
2148               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2149             else
2150               {
2151                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2152                 tp = gen_lowpart (mode, tp);
2153               }
2154           }
2155         else
2156           {
2157             gcc_assert (mode == Pmode);
2158             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2159           }
2160
2161         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2162         if (REG_P (dest))
2163           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2164         return;
2165       }
2166
2167     case SYMBOL_TLSLE12:
2168     case SYMBOL_TLSLE24:
2169     case SYMBOL_TLSLE32:
2170     case SYMBOL_TLSLE48:
2171       {
2172         machine_mode mode = GET_MODE (dest);
2173         rtx tp = aarch64_load_tp (NULL);
2174
2175         if (mode != Pmode)
2176           tp = gen_lowpart (mode, tp);
2177
2178         switch (type)
2179           {
2180           case SYMBOL_TLSLE12:
2181             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2182                         (dest, tp, imm));
2183             break;
2184           case SYMBOL_TLSLE24:
2185             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2186                         (dest, tp, imm));
2187           break;
2188           case SYMBOL_TLSLE32:
2189             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2190                         (dest, imm));
2191             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2192                         (dest, dest, tp));
2193           break;
2194           case SYMBOL_TLSLE48:
2195             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2196                         (dest, imm));
2197             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2198                         (dest, dest, tp));
2199             break;
2200           default:
2201             gcc_unreachable ();
2202           }
2203
2204         if (REG_P (dest))
2205           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2206         return;
2207       }
2208
2209     case SYMBOL_TINY_GOT:
2210       emit_insn (gen_ldr_got_tiny (dest, imm));
2211       return;
2212
2213     case SYMBOL_TINY_TLSIE:
2214       {
2215         machine_mode mode = GET_MODE (dest);
2216         rtx tp = aarch64_load_tp (NULL);
2217
2218         if (mode == ptr_mode)
2219           {
2220             if (mode == DImode)
2221               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2222             else
2223               {
2224                 tp = gen_lowpart (mode, tp);
2225                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2226               }
2227           }
2228         else
2229           {
2230             gcc_assert (mode == Pmode);
2231             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2232           }
2233
2234         if (REG_P (dest))
2235           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2236         return;
2237       }
2238
2239     default:
2240       gcc_unreachable ();
2241     }
2242 }
2243
2244 /* Emit a move from SRC to DEST.  Assume that the move expanders can
2245    handle all moves if !can_create_pseudo_p ().  The distinction is
2246    important because, unlike emit_move_insn, the move expanders know
2247    how to force Pmode objects into the constant pool even when the
2248    constant pool address is not itself legitimate.  */
2249 static rtx
2250 aarch64_emit_move (rtx dest, rtx src)
2251 {
2252   return (can_create_pseudo_p ()
2253           ? emit_move_insn (dest, src)
2254           : emit_move_insn_1 (dest, src));
2255 }
2256
2257 /* Apply UNOPTAB to OP and store the result in DEST.  */
2258
2259 static void
2260 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2261 {
2262   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2263   if (dest != tmp)
2264     emit_move_insn (dest, tmp);
2265 }
2266
2267 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
2268
2269 static void
2270 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2271 {
2272   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2273                           OPTAB_DIRECT);
2274   if (dest != tmp)
2275     emit_move_insn (dest, tmp);
2276 }
2277
2278 /* Split a 128-bit move operation into two 64-bit move operations,
2279    taking care to handle partial overlap of register to register
2280    copies.  Special cases are needed when moving between GP regs and
2281    FP regs.  SRC can be a register, constant or memory; DST a register
2282    or memory.  If either operand is memory it must not have any side
2283    effects.  */
2284 void
2285 aarch64_split_128bit_move (rtx dst, rtx src)
2286 {
2287   rtx dst_lo, dst_hi;
2288   rtx src_lo, src_hi;
2289
2290   machine_mode mode = GET_MODE (dst);
2291
2292   gcc_assert (mode == TImode || mode == TFmode);
2293   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2294   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2295
2296   if (REG_P (dst) && REG_P (src))
2297     {
2298       int src_regno = REGNO (src);
2299       int dst_regno = REGNO (dst);
2300
2301       /* Handle FP <-> GP regs.  */
2302       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2303         {
2304           src_lo = gen_lowpart (word_mode, src);
2305           src_hi = gen_highpart (word_mode, src);
2306
2307           emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2308           emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2309           return;
2310         }
2311       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2312         {
2313           dst_lo = gen_lowpart (word_mode, dst);
2314           dst_hi = gen_highpart (word_mode, dst);
2315
2316           emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2317           emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2318           return;
2319         }
2320     }
2321
2322   dst_lo = gen_lowpart (word_mode, dst);
2323   dst_hi = gen_highpart (word_mode, dst);
2324   src_lo = gen_lowpart (word_mode, src);
2325   src_hi = gen_highpart_mode (word_mode, mode, src);
2326
2327   /* At most one pairing may overlap.  */
2328   if (reg_overlap_mentioned_p (dst_lo, src_hi))
2329     {
2330       aarch64_emit_move (dst_hi, src_hi);
2331       aarch64_emit_move (dst_lo, src_lo);
2332     }
2333   else
2334     {
2335       aarch64_emit_move (dst_lo, src_lo);
2336       aarch64_emit_move (dst_hi, src_hi);
2337     }
2338 }
2339
2340 bool
2341 aarch64_split_128bit_move_p (rtx dst, rtx src)
2342 {
2343   return (! REG_P (src)
2344           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2345 }
2346
2347 /* Split a complex SIMD combine.  */
2348
2349 void
2350 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2351 {
2352   machine_mode src_mode = GET_MODE (src1);
2353   machine_mode dst_mode = GET_MODE (dst);
2354
2355   gcc_assert (VECTOR_MODE_P (dst_mode));
2356   gcc_assert (register_operand (dst, dst_mode)
2357               && register_operand (src1, src_mode)
2358               && register_operand (src2, src_mode));
2359
2360   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2361   return;
2362 }
2363
2364 /* Split a complex SIMD move.  */
2365
2366 void
2367 aarch64_split_simd_move (rtx dst, rtx src)
2368 {
2369   machine_mode src_mode = GET_MODE (src);
2370   machine_mode dst_mode = GET_MODE (dst);
2371
2372   gcc_assert (VECTOR_MODE_P (dst_mode));
2373
2374   if (REG_P (dst) && REG_P (src))
2375     {
2376       gcc_assert (VECTOR_MODE_P (src_mode));
2377       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2378     }
2379 }
2380
2381 bool
2382 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2383                               machine_mode ymode, rtx y)
2384 {
2385   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2386   gcc_assert (r != NULL);
2387   return rtx_equal_p (x, r);
2388 }
2389
2390
2391 static rtx
2392 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2393 {
2394   if (can_create_pseudo_p ())
2395     return force_reg (mode, value);
2396   else
2397     {
2398       gcc_assert (x);
2399       aarch64_emit_move (x, value);
2400       return x;
2401     }
2402 }
2403
2404 /* Return true if we can move VALUE into a register using a single
2405    CNT[BHWD] instruction.  */
2406
2407 static bool
2408 aarch64_sve_cnt_immediate_p (poly_int64 value)
2409 {
2410   HOST_WIDE_INT factor = value.coeffs[0];
2411   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2412   return (value.coeffs[1] == factor
2413           && IN_RANGE (factor, 2, 16 * 16)
2414           && (factor & 1) == 0
2415           && factor <= 16 * (factor & -factor));
2416 }
2417
2418 /* Likewise for rtx X.  */
2419
2420 bool
2421 aarch64_sve_cnt_immediate_p (rtx x)
2422 {
2423   poly_int64 value;
2424   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2425 }
2426
2427 /* Return the asm string for an instruction with a CNT-like vector size
2428    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2429    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2430    first part of the operands template (the part that comes before the
2431    vector size itself).  FACTOR is the number of quadwords.
2432    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2433    If it is zero, we can use any element size.  */
2434
2435 static char *
2436 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2437                                   unsigned int factor,
2438                                   unsigned int nelts_per_vq)
2439 {
2440   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2441
2442   if (nelts_per_vq == 0)
2443     /* There is some overlap in the ranges of the four CNT instructions.
2444        Here we always use the smallest possible element size, so that the
2445        multiplier is 1 whereever possible.  */
2446     nelts_per_vq = factor & -factor;
2447   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2448   gcc_assert (IN_RANGE (shift, 1, 4));
2449   char suffix = "dwhb"[shift - 1];
2450
2451   factor >>= shift;
2452   unsigned int written;
2453   if (factor == 1)
2454     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2455                         prefix, suffix, operands);
2456   else
2457     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2458                         prefix, suffix, operands, factor);
2459   gcc_assert (written < sizeof (buffer));
2460   return buffer;
2461 }
2462
2463 /* Return the asm string for an instruction with a CNT-like vector size
2464    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2465    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2466    first part of the operands template (the part that comes before the
2467    vector size itself).  X is the value of the vector size operand,
2468    as a polynomial integer rtx.  */
2469
2470 char *
2471 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2472                                   rtx x)
2473 {
2474   poly_int64 value = rtx_to_poly_int64 (x);
2475   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2476   return aarch64_output_sve_cnt_immediate (prefix, operands,
2477                                            value.coeffs[1], 0);
2478 }
2479
2480 /* Return true if we can add VALUE to a register using a single ADDVL
2481    or ADDPL instruction.  */
2482
2483 static bool
2484 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2485 {
2486   HOST_WIDE_INT factor = value.coeffs[0];
2487   if (factor == 0 || value.coeffs[1] != factor)
2488     return false;
2489   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2490      and a value of 16 is one vector width.  */
2491   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2492           || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2493 }
2494
2495 /* Likewise for rtx X.  */
2496
2497 bool
2498 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2499 {
2500   poly_int64 value;
2501   return (poly_int_rtx_p (x, &value)
2502           && aarch64_sve_addvl_addpl_immediate_p (value));
2503 }
2504
2505 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2506    and storing the result in operand 0.  */
2507
2508 char *
2509 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2510 {
2511   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2512   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2513   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2514
2515   /* Use INC or DEC if possible.  */
2516   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2517     {
2518       if (aarch64_sve_cnt_immediate_p (offset_value))
2519         return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2520                                                  offset_value.coeffs[1], 0);
2521       if (aarch64_sve_cnt_immediate_p (-offset_value))
2522         return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2523                                                  -offset_value.coeffs[1], 0);
2524     }
2525
2526   int factor = offset_value.coeffs[1];
2527   if ((factor & 15) == 0)
2528     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2529   else
2530     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2531   return buffer;
2532 }
2533
2534 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2535    instruction.  If it is, store the number of elements in each vector
2536    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2537    factor in *FACTOR_OUT (if nonnull).  */
2538
2539 bool
2540 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2541                                  unsigned int *nelts_per_vq_out)
2542 {
2543   rtx elt;
2544   poly_int64 value;
2545
2546   if (!const_vec_duplicate_p (x, &elt)
2547       || !poly_int_rtx_p (elt, &value))
2548     return false;
2549
2550   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2551   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2552     /* There's no vector INCB.  */
2553     return false;
2554
2555   HOST_WIDE_INT factor = value.coeffs[0];
2556   if (value.coeffs[1] != factor)
2557     return false;
2558
2559   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2560   if ((factor % nelts_per_vq) != 0
2561       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2562     return false;
2563
2564   if (factor_out)
2565     *factor_out = factor;
2566   if (nelts_per_vq_out)
2567     *nelts_per_vq_out = nelts_per_vq;
2568   return true;
2569 }
2570
2571 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2572    instruction.  */
2573
2574 bool
2575 aarch64_sve_inc_dec_immediate_p (rtx x)
2576 {
2577   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2578 }
2579
2580 /* Return the asm template for an SVE vector INC or DEC instruction.
2581    OPERANDS gives the operands before the vector count and X is the
2582    value of the vector count operand itself.  */
2583
2584 char *
2585 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2586 {
2587   int factor;
2588   unsigned int nelts_per_vq;
2589   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2590     gcc_unreachable ();
2591   if (factor < 0)
2592     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2593                                              nelts_per_vq);
2594   else
2595     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2596                                              nelts_per_vq);
2597 }
2598
2599 static int
2600 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2601                                 scalar_int_mode mode)
2602 {
2603   int i;
2604   unsigned HOST_WIDE_INT val, val2, mask;
2605   int one_match, zero_match;
2606   int num_insns;
2607
2608   val = INTVAL (imm);
2609
2610   if (aarch64_move_imm (val, mode))
2611     {
2612       if (generate)
2613         emit_insn (gen_rtx_SET (dest, imm));
2614       return 1;
2615     }
2616
2617   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2618      (with XXXX non-zero). In that case check to see if the move can be done in
2619      a smaller mode.  */
2620   val2 = val & 0xffffffff;
2621   if (mode == DImode
2622       && aarch64_move_imm (val2, SImode)
2623       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2624     {
2625       if (generate)
2626         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2627
2628       /* Check if we have to emit a second instruction by checking to see
2629          if any of the upper 32 bits of the original DI mode value is set.  */
2630       if (val == val2)
2631         return 1;
2632
2633       i = (val >> 48) ? 48 : 32;
2634
2635       if (generate)
2636          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2637                                     GEN_INT ((val >> i) & 0xffff)));
2638
2639       return 2;
2640     }
2641
2642   if ((val >> 32) == 0 || mode == SImode)
2643     {
2644       if (generate)
2645         {
2646           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2647           if (mode == SImode)
2648             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2649                                        GEN_INT ((val >> 16) & 0xffff)));
2650           else
2651             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2652                                        GEN_INT ((val >> 16) & 0xffff)));
2653         }
2654       return 2;
2655     }
2656
2657   /* Remaining cases are all for DImode.  */
2658
2659   mask = 0xffff;
2660   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2661     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2662   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2663     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2664
2665   if (zero_match != 2 && one_match != 2)
2666     {
2667       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2668          For a 64-bit bitmask try whether changing 16 bits to all ones or
2669          zeroes creates a valid bitmask.  To check any repeated bitmask,
2670          try using 16 bits from the other 32-bit half of val.  */
2671
2672       for (i = 0; i < 64; i += 16, mask <<= 16)
2673         {
2674           val2 = val & ~mask;
2675           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2676             break;
2677           val2 = val | mask;
2678           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2679             break;
2680           val2 = val2 & ~mask;
2681           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2682           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2683             break;
2684         }
2685       if (i != 64)
2686         {
2687           if (generate)
2688             {
2689               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2690               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2691                                          GEN_INT ((val >> i) & 0xffff)));
2692             }
2693           return 2;
2694         }
2695     }
2696
2697   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2698      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2699      otherwise skip zero bits.  */
2700
2701   num_insns = 1;
2702   mask = 0xffff;
2703   val2 = one_match > zero_match ? ~val : val;
2704   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2705
2706   if (generate)
2707     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2708                                            ? (val | ~(mask << i))
2709                                            : (val & (mask << i)))));
2710   for (i += 16; i < 64; i += 16)
2711     {
2712       if ((val2 & (mask << i)) == 0)
2713         continue;
2714       if (generate)
2715         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2716                                    GEN_INT ((val >> i) & 0xffff)));
2717       num_insns ++;
2718     }
2719
2720   return num_insns;
2721 }
2722
2723 /* Return whether imm is a 128-bit immediate which is simple enough to
2724    expand inline.  */
2725 bool
2726 aarch64_mov128_immediate (rtx imm)
2727 {
2728   if (GET_CODE (imm) == CONST_INT)
2729     return true;
2730
2731   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2732
2733   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2734   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2735
2736   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2737          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2738 }
2739
2740
2741 /* Return the number of temporary registers that aarch64_add_offset_1
2742    would need to add OFFSET to a register.  */
2743
2744 static unsigned int
2745 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2746 {
2747   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2748 }
2749
2750 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2751    a non-polynomial OFFSET.  MODE is the mode of the addition.
2752    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2753    be set and CFA adjustments added to the generated instructions.
2754
2755    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2756    temporary if register allocation is already complete.  This temporary
2757    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2758    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2759    the immediate again.
2760
2761    Since this function may be used to adjust the stack pointer, we must
2762    ensure that it cannot cause transient stack deallocation (for example
2763    by first incrementing SP and then decrementing when adjusting by a
2764    large immediate).  */
2765
2766 static void
2767 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2768                       rtx src, HOST_WIDE_INT offset, rtx temp1,
2769                       bool frame_related_p, bool emit_move_imm)
2770 {
2771   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2772   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2773
2774   HOST_WIDE_INT moffset = abs_hwi (offset);
2775   rtx_insn *insn;
2776
2777   if (!moffset)
2778     {
2779       if (!rtx_equal_p (dest, src))
2780         {
2781           insn = emit_insn (gen_rtx_SET (dest, src));
2782           RTX_FRAME_RELATED_P (insn) = frame_related_p;
2783         }
2784       return;
2785     }
2786
2787   /* Single instruction adjustment.  */
2788   if (aarch64_uimm12_shift (moffset))
2789     {
2790       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2791       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2792       return;
2793     }
2794
2795   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2796      and either:
2797
2798      a) the offset cannot be loaded by a 16-bit move or
2799      b) there is no spare register into which we can move it.  */
2800   if (moffset < 0x1000000
2801       && ((!temp1 && !can_create_pseudo_p ())
2802           || !aarch64_move_imm (moffset, mode)))
2803     {
2804       HOST_WIDE_INT low_off = moffset & 0xfff;
2805
2806       low_off = offset < 0 ? -low_off : low_off;
2807       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2808       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2809       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2810       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2811       return;
2812     }
2813
2814   /* Emit a move immediate if required and an addition/subtraction.  */
2815   if (emit_move_imm)
2816     {
2817       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2818       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2819     }
2820   insn = emit_insn (offset < 0
2821                     ? gen_sub3_insn (dest, src, temp1)
2822                     : gen_add3_insn (dest, src, temp1));
2823   if (frame_related_p)
2824     {
2825       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2826       rtx adj = plus_constant (mode, src, offset);
2827       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2828     }
2829 }
2830
2831 /* Return the number of temporary registers that aarch64_add_offset
2832    would need to move OFFSET into a register or add OFFSET to a register;
2833    ADD_P is true if we want the latter rather than the former.  */
2834
2835 static unsigned int
2836 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2837 {
2838   /* This follows the same structure as aarch64_add_offset.  */
2839   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2840     return 0;
2841
2842   unsigned int count = 0;
2843   HOST_WIDE_INT factor = offset.coeffs[1];
2844   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2845   poly_int64 poly_offset (factor, factor);
2846   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2847     /* Need one register for the ADDVL/ADDPL result.  */
2848     count += 1;
2849   else if (factor != 0)
2850     {
2851       factor = abs (factor);
2852       if (factor > 16 * (factor & -factor))
2853         /* Need one register for the CNT result and one for the multiplication
2854            factor.  If necessary, the second temporary can be reused for the
2855            constant part of the offset.  */
2856         return 2;
2857       /* Need one register for the CNT result (which might then
2858          be shifted).  */
2859       count += 1;
2860     }
2861   return count + aarch64_add_offset_1_temporaries (constant);
2862 }
2863
2864 /* If X can be represented as a poly_int64, return the number
2865    of temporaries that are required to add it to a register.
2866    Return -1 otherwise.  */
2867
2868 int
2869 aarch64_add_offset_temporaries (rtx x)
2870 {
2871   poly_int64 offset;
2872   if (!poly_int_rtx_p (x, &offset))
2873     return -1;
2874   return aarch64_offset_temporaries (true, offset);
2875 }
2876
2877 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2878    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2879    be set and CFA adjustments added to the generated instructions.
2880
2881    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2882    temporary if register allocation is already complete.  This temporary
2883    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2884    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2885    false to avoid emitting the immediate again.
2886
2887    TEMP2, if nonnull, is a second temporary register that doesn't
2888    overlap either DEST or REG.
2889
2890    Since this function may be used to adjust the stack pointer, we must
2891    ensure that it cannot cause transient stack deallocation (for example
2892    by first incrementing SP and then decrementing when adjusting by a
2893    large immediate).  */
2894
2895 static void
2896 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2897                     poly_int64 offset, rtx temp1, rtx temp2,
2898                     bool frame_related_p, bool emit_move_imm = true)
2899 {
2900   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2901   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2902   gcc_assert (temp1 == NULL_RTX
2903               || !frame_related_p
2904               || !reg_overlap_mentioned_p (temp1, dest));
2905   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2906
2907   /* Try using ADDVL or ADDPL to add the whole value.  */
2908   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2909     {
2910       rtx offset_rtx = gen_int_mode (offset, mode);
2911       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2912       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2913       return;
2914     }
2915
2916   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2917      SVE vector register, over and above the minimum size of 128 bits.
2918      This is equivalent to half the value returned by CNTD with a
2919      vector shape of ALL.  */
2920   HOST_WIDE_INT factor = offset.coeffs[1];
2921   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2922
2923   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2924   poly_int64 poly_offset (factor, factor);
2925   if (src != const0_rtx
2926       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2927     {
2928       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2929       if (frame_related_p)
2930         {
2931           rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2932           RTX_FRAME_RELATED_P (insn) = true;
2933           src = dest;
2934         }
2935       else
2936         {
2937           rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2938           src = aarch64_force_temporary (mode, temp1, addr);
2939           temp1 = temp2;
2940           temp2 = NULL_RTX;
2941         }
2942     }
2943   /* Otherwise use a CNT-based sequence.  */
2944   else if (factor != 0)
2945     {
2946       /* Use a subtraction if we have a negative factor.  */
2947       rtx_code code = PLUS;
2948       if (factor < 0)
2949         {
2950           factor = -factor;
2951           code = MINUS;
2952         }
2953
2954       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
2955          into the multiplication.  */
2956       rtx val;
2957       int shift = 0;
2958       if (factor & 1)
2959         /* Use a right shift by 1.  */
2960         shift = -1;
2961       else
2962         factor /= 2;
2963       HOST_WIDE_INT low_bit = factor & -factor;
2964       if (factor <= 16 * low_bit)
2965         {
2966           if (factor > 16 * 8)
2967             {
2968               /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2969                  the value with the minimum multiplier and shift it into
2970                  position.  */
2971               int extra_shift = exact_log2 (low_bit);
2972               shift += extra_shift;
2973               factor >>= extra_shift;
2974             }
2975           val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2976         }
2977       else
2978         {
2979           /* Use CNTD, then multiply it by FACTOR.  */
2980           val = gen_int_mode (poly_int64 (2, 2), mode);
2981           val = aarch64_force_temporary (mode, temp1, val);
2982
2983           /* Go back to using a negative multiplication factor if we have
2984              no register from which to subtract.  */
2985           if (code == MINUS && src == const0_rtx)
2986             {
2987               factor = -factor;
2988               code = PLUS;
2989             }
2990           rtx coeff1 = gen_int_mode (factor, mode);
2991           coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2992           val = gen_rtx_MULT (mode, val, coeff1);
2993         }
2994
2995       if (shift > 0)
2996         {
2997           /* Multiply by 1 << SHIFT.  */
2998           val = aarch64_force_temporary (mode, temp1, val);
2999           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3000         }
3001       else if (shift == -1)
3002         {
3003           /* Divide by 2.  */
3004           val = aarch64_force_temporary (mode, temp1, val);
3005           val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3006         }
3007
3008       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
3009       if (src != const0_rtx)
3010         {
3011           val = aarch64_force_temporary (mode, temp1, val);
3012           val = gen_rtx_fmt_ee (code, mode, src, val);
3013         }
3014       else if (code == MINUS)
3015         {
3016           val = aarch64_force_temporary (mode, temp1, val);
3017           val = gen_rtx_NEG (mode, val);
3018         }
3019
3020       if (constant == 0 || frame_related_p)
3021         {
3022           rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3023           if (frame_related_p)
3024             {
3025               RTX_FRAME_RELATED_P (insn) = true;
3026               add_reg_note (insn, REG_CFA_ADJUST_CFA,
3027                             gen_rtx_SET (dest, plus_constant (Pmode, src,
3028                                                               poly_offset)));
3029             }
3030           src = dest;
3031           if (constant == 0)
3032             return;
3033         }
3034       else
3035         {
3036           src = aarch64_force_temporary (mode, temp1, val);
3037           temp1 = temp2;
3038           temp2 = NULL_RTX;
3039         }
3040
3041       emit_move_imm = true;
3042     }
3043
3044   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3045                         frame_related_p, emit_move_imm);
3046 }
3047
3048 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3049    than a poly_int64.  */
3050
3051 void
3052 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3053                           rtx offset_rtx, rtx temp1, rtx temp2)
3054 {
3055   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3056                       temp1, temp2, false);
3057 }
3058
3059 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3060    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
3061    if TEMP1 already contains abs (DELTA).  */
3062
3063 static inline void
3064 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3065 {
3066   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3067                       temp1, temp2, true, emit_move_imm);
3068 }
3069
3070 /* Subtract DELTA from the stack pointer, marking the instructions
3071    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
3072    if nonnull.  */
3073
3074 static inline void
3075 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3076                 bool emit_move_imm = true)
3077 {
3078   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3079                       temp1, temp2, frame_related_p, emit_move_imm);
3080 }
3081
3082 /* Set DEST to (vec_series BASE STEP).  */
3083
3084 static void
3085 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3086 {
3087   machine_mode mode = GET_MODE (dest);
3088   scalar_mode inner = GET_MODE_INNER (mode);
3089
3090   /* Each operand can be a register or an immediate in the range [-16, 15].  */
3091   if (!aarch64_sve_index_immediate_p (base))
3092     base = force_reg (inner, base);
3093   if (!aarch64_sve_index_immediate_p (step))
3094     step = force_reg (inner, step);
3095
3096   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3097 }
3098
3099 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3100    integer of mode INT_MODE.  Return true on success.  */
3101
3102 static bool
3103 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3104                                       rtx src)
3105 {
3106   /* If the constant is smaller than 128 bits, we can do the move
3107      using a vector of SRC_MODEs.  */
3108   if (src_mode != TImode)
3109     {
3110       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3111                                      GET_MODE_SIZE (src_mode));
3112       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3113       emit_move_insn (gen_lowpart (dup_mode, dest),
3114                       gen_const_vec_duplicate (dup_mode, src));
3115       return true;
3116     }
3117
3118   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
3119   src = force_const_mem (src_mode, src);
3120   if (!src)
3121     return false;
3122
3123   /* Make sure that the address is legitimate.  */
3124   if (!aarch64_sve_ld1r_operand_p (src))
3125     {
3126       rtx addr = force_reg (Pmode, XEXP (src, 0));
3127       src = replace_equiv_address (src, addr);
3128     }
3129
3130   machine_mode mode = GET_MODE (dest);
3131   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3132   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3133   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3134   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3135   emit_insn (gen_rtx_SET (dest, src));
3136   return true;
3137 }
3138
3139 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3140    isn't a simple duplicate or series.  */
3141
3142 static void
3143 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3144 {
3145   machine_mode mode = GET_MODE (src);
3146   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3147   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3148   gcc_assert (npatterns > 1);
3149
3150   if (nelts_per_pattern == 1)
3151     {
3152       /* The constant is a repeating seqeuence of at least two elements,
3153          where the repeating elements occupy no more than 128 bits.
3154          Get an integer representation of the replicated value.  */
3155       scalar_int_mode int_mode;
3156       if (BYTES_BIG_ENDIAN)
3157         /* For now, always use LD1RQ to load the value on big-endian
3158            targets, since the handling of smaller integers includes a
3159            subreg that is semantically an element reverse.  */
3160         int_mode = TImode;
3161       else
3162         {
3163           unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3164           gcc_assert (int_bits <= 128);
3165           int_mode = int_mode_for_size (int_bits, 0).require ();
3166         }
3167       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3168       if (int_value
3169           && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3170         return;
3171     }
3172
3173   /* Expand each pattern individually.  */
3174   rtx_vector_builder builder;
3175   auto_vec<rtx, 16> vectors (npatterns);
3176   for (unsigned int i = 0; i < npatterns; ++i)
3177     {
3178       builder.new_vector (mode, 1, nelts_per_pattern);
3179       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3180         builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3181       vectors.quick_push (force_reg (mode, builder.build ()));
3182     }
3183
3184   /* Use permutes to interleave the separate vectors.  */
3185   while (npatterns > 1)
3186     {
3187       npatterns /= 2;
3188       for (unsigned int i = 0; i < npatterns; ++i)
3189         {
3190           rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3191           rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3192           emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3193           vectors[i] = tmp;
3194         }
3195     }
3196   gcc_assert (vectors[0] == dest);
3197 }
3198
3199 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
3200    is a pattern that can be used to set DEST to a replicated scalar
3201    element.  */
3202
3203 void
3204 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3205                               rtx (*gen_vec_duplicate) (rtx, rtx))
3206 {
3207   machine_mode mode = GET_MODE (dest);
3208
3209   /* Check on what type of symbol it is.  */
3210   scalar_int_mode int_mode;
3211   if ((GET_CODE (imm) == SYMBOL_REF
3212        || GET_CODE (imm) == LABEL_REF
3213        || GET_CODE (imm) == CONST
3214        || GET_CODE (imm) == CONST_POLY_INT)
3215       && is_a <scalar_int_mode> (mode, &int_mode))
3216     {
3217       rtx mem;
3218       poly_int64 offset;
3219       HOST_WIDE_INT const_offset;
3220       enum aarch64_symbol_type sty;
3221
3222       /* If we have (const (plus symbol offset)), separate out the offset
3223          before we start classifying the symbol.  */
3224       rtx base = strip_offset (imm, &offset);
3225
3226       /* We must always add an offset involving VL separately, rather than
3227          folding it into the relocation.  */
3228       if (!offset.is_constant (&const_offset))
3229         {
3230           if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3231             emit_insn (gen_rtx_SET (dest, imm));
3232           else
3233             {
3234               /* Do arithmetic on 32-bit values if the result is smaller
3235                  than that.  */
3236               if (partial_subreg_p (int_mode, SImode))
3237                 {
3238                   /* It is invalid to do symbol calculations in modes
3239                      narrower than SImode.  */
3240                   gcc_assert (base == const0_rtx);
3241                   dest = gen_lowpart (SImode, dest);
3242                   int_mode = SImode;
3243                 }
3244               if (base != const0_rtx)
3245                 {
3246                   base = aarch64_force_temporary (int_mode, dest, base);
3247                   aarch64_add_offset (int_mode, dest, base, offset,
3248                                       NULL_RTX, NULL_RTX, false);
3249                 }
3250               else
3251                 aarch64_add_offset (int_mode, dest, base, offset,
3252                                     dest, NULL_RTX, false);
3253             }
3254           return;
3255         }
3256
3257       sty = aarch64_classify_symbol (base, const_offset);
3258       switch (sty)
3259         {
3260         case SYMBOL_FORCE_TO_MEM:
3261           if (const_offset != 0
3262               && targetm.cannot_force_const_mem (int_mode, imm))
3263             {
3264               gcc_assert (can_create_pseudo_p ());
3265               base = aarch64_force_temporary (int_mode, dest, base);
3266               aarch64_add_offset (int_mode, dest, base, const_offset,
3267                                   NULL_RTX, NULL_RTX, false);
3268               return;
3269             }
3270
3271           mem = force_const_mem (ptr_mode, imm);
3272           gcc_assert (mem);
3273
3274           /* If we aren't generating PC relative literals, then
3275              we need to expand the literal pool access carefully.
3276              This is something that needs to be done in a number
3277              of places, so could well live as a separate function.  */
3278           if (!aarch64_pcrelative_literal_loads)
3279             {
3280               gcc_assert (can_create_pseudo_p ());
3281               base = gen_reg_rtx (ptr_mode);
3282               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3283               if (ptr_mode != Pmode)
3284                 base = convert_memory_address (Pmode, base);
3285               mem = gen_rtx_MEM (ptr_mode, base);
3286             }
3287
3288           if (int_mode != ptr_mode)
3289             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3290
3291           emit_insn (gen_rtx_SET (dest, mem));
3292
3293           return;
3294
3295         case SYMBOL_SMALL_TLSGD:
3296         case SYMBOL_SMALL_TLSDESC:
3297         case SYMBOL_SMALL_TLSIE:
3298         case SYMBOL_SMALL_GOT_28K:
3299         case SYMBOL_SMALL_GOT_4G:
3300         case SYMBOL_TINY_GOT:
3301         case SYMBOL_TINY_TLSIE:
3302           if (const_offset != 0)
3303             {
3304               gcc_assert(can_create_pseudo_p ());
3305               base = aarch64_force_temporary (int_mode, dest, base);
3306               aarch64_add_offset (int_mode, dest, base, const_offset,
3307                                   NULL_RTX, NULL_RTX, false);
3308               return;
3309             }
3310           /* FALLTHRU */
3311
3312         case SYMBOL_SMALL_ABSOLUTE:
3313         case SYMBOL_TINY_ABSOLUTE:
3314         case SYMBOL_TLSLE12:
3315         case SYMBOL_TLSLE24:
3316         case SYMBOL_TLSLE32:
3317         case SYMBOL_TLSLE48:
3318           aarch64_load_symref_appropriately (dest, imm, sty);
3319           return;
3320
3321         default:
3322           gcc_unreachable ();
3323         }
3324     }
3325
3326   if (!CONST_INT_P (imm))
3327     {
3328       rtx base, step, value;
3329       if (GET_CODE (imm) == HIGH
3330           || aarch64_simd_valid_immediate (imm, NULL))
3331         emit_insn (gen_rtx_SET (dest, imm));
3332       else if (const_vec_series_p (imm, &base, &step))
3333         aarch64_expand_vec_series (dest, base, step);
3334       else if (const_vec_duplicate_p (imm, &value))
3335         {
3336           /* If the constant is out of range of an SVE vector move,
3337              load it from memory if we can, otherwise move it into
3338              a register and use a DUP.  */
3339           scalar_mode inner_mode = GET_MODE_INNER (mode);
3340           rtx op = force_const_mem (inner_mode, value);
3341           if (!op)
3342             op = force_reg (inner_mode, value);
3343           else if (!aarch64_sve_ld1r_operand_p (op))
3344             {
3345               rtx addr = force_reg (Pmode, XEXP (op, 0));
3346               op = replace_equiv_address (op, addr);
3347             }
3348           emit_insn (gen_vec_duplicate (dest, op));
3349         }
3350       else if (GET_CODE (imm) == CONST_VECTOR
3351                && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3352         aarch64_expand_sve_const_vector (dest, imm);
3353       else
3354         {
3355           rtx mem = force_const_mem (mode, imm);
3356           gcc_assert (mem);
3357           emit_move_insn (dest, mem);
3358         }
3359
3360       return;
3361     }
3362
3363   aarch64_internal_mov_immediate (dest, imm, true,
3364                                   as_a <scalar_int_mode> (mode));
3365 }
3366
3367 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3368    that is known to contain PTRUE.  */
3369
3370 void
3371 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3372 {
3373   emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3374                                                 gen_rtvec (2, pred, src),
3375                                                 UNSPEC_MERGE_PTRUE)));
3376 }
3377
3378 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3379    operand is in memory.  In this case we need to use the predicated LD1
3380    and ST1 instead of LDR and STR, both for correctness on big-endian
3381    targets and because LD1 and ST1 support a wider range of addressing modes.
3382    PRED_MODE is the mode of the predicate.
3383
3384    See the comment at the head of aarch64-sve.md for details about the
3385    big-endian handling.  */
3386
3387 void
3388 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3389 {
3390   machine_mode mode = GET_MODE (dest);
3391   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3392   if (!register_operand (src, mode)
3393       && !register_operand (dest, mode))
3394     {
3395       rtx tmp = gen_reg_rtx (mode);
3396       if (MEM_P (src))
3397         aarch64_emit_sve_pred_move (tmp, ptrue, src);
3398       else
3399         emit_move_insn (tmp, src);
3400       src = tmp;
3401     }
3402   aarch64_emit_sve_pred_move (dest, ptrue, src);
3403 }
3404
3405 /* Called only on big-endian targets.  See whether an SVE vector move
3406    from SRC to DEST is effectively a REV[BHW] instruction, because at
3407    least one operand is a subreg of an SVE vector that has wider or
3408    narrower elements.  Return true and emit the instruction if so.
3409
3410    For example:
3411
3412      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3413
3414    represents a VIEW_CONVERT between the following vectors, viewed
3415    in memory order:
3416
3417      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3418      R1: { [0],      [1],      [2],      [3],     ... }
3419
3420    The high part of lane X in R2 should therefore correspond to lane X*2
3421    of R1, but the register representations are:
3422
3423          msb                                      lsb
3424      R2: ...... [1].high  [1].low   [0].high  [0].low
3425      R1: ...... [3]       [2]       [1]       [0]
3426
3427    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3428    We therefore need a reverse operation to swap the high and low values
3429    around.
3430
3431    This is purely an optimization.  Without it we would spill the
3432    subreg operand to the stack in one mode and reload it in the
3433    other mode, which has the same effect as the REV.  */
3434
3435 bool
3436 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3437 {
3438   gcc_assert (BYTES_BIG_ENDIAN);
3439   if (GET_CODE (dest) == SUBREG)
3440     dest = SUBREG_REG (dest);
3441   if (GET_CODE (src) == SUBREG)
3442     src = SUBREG_REG (src);
3443
3444   /* The optimization handles two single SVE REGs with different element
3445      sizes.  */
3446   if (!REG_P (dest)
3447       || !REG_P (src)
3448       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3449       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3450       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3451           == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3452     return false;
3453
3454   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3455   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3456   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3457                                UNSPEC_REV_SUBREG);
3458   emit_insn (gen_rtx_SET (dest, unspec));
3459   return true;
3460 }
3461
3462 /* Return a copy of X with mode MODE, without changing its other
3463    attributes.  Unlike gen_lowpart, this doesn't care whether the
3464    mode change is valid.  */
3465
3466 static rtx
3467 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3468 {
3469   if (GET_MODE (x) == mode)
3470     return x;
3471
3472   x = shallow_copy_rtx (x);
3473   set_mode_and_regno (x, mode, REGNO (x));
3474   return x;
3475 }
3476
3477 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3478    operands.  */
3479
3480 void
3481 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3482 {
3483   /* Decide which REV operation we need.  The mode with narrower elements
3484      determines the mode of the operands and the mode with the wider
3485      elements determines the reverse width.  */
3486   machine_mode mode_with_wider_elts = GET_MODE (dest);
3487   machine_mode mode_with_narrower_elts = GET_MODE (src);
3488   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3489       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3490     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3491
3492   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3493   unsigned int unspec;
3494   if (wider_bytes == 8)
3495     unspec = UNSPEC_REV64;
3496   else if (wider_bytes == 4)
3497     unspec = UNSPEC_REV32;
3498   else if (wider_bytes == 2)
3499     unspec = UNSPEC_REV16;
3500   else
3501     gcc_unreachable ();
3502   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3503
3504   /* Emit:
3505
3506        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3507                          UNSPEC_MERGE_PTRUE))
3508
3509      with the appropriate modes.  */
3510   ptrue = gen_lowpart (pred_mode, ptrue);
3511   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3512   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3513   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3514   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3515                         UNSPEC_MERGE_PTRUE);
3516   emit_insn (gen_rtx_SET (dest, src));
3517 }
3518
3519 static bool
3520 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3521                                  tree exp ATTRIBUTE_UNUSED)
3522 {
3523   if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3524     return false;
3525
3526   return true;
3527 }
3528
3529 /* Implement TARGET_PASS_BY_REFERENCE.  */
3530
3531 static bool
3532 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3533                            machine_mode mode,
3534                            const_tree type,
3535                            bool named ATTRIBUTE_UNUSED)
3536 {
3537   HOST_WIDE_INT size;
3538   machine_mode dummymode;
3539   int nregs;
3540
3541   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3542   if (mode == BLKmode && type)
3543     size = int_size_in_bytes (type);
3544   else
3545     /* No frontends can create types with variable-sized modes, so we
3546        shouldn't be asked to pass or return them.  */
3547     size = GET_MODE_SIZE (mode).to_constant ();
3548
3549   /* Aggregates are passed by reference based on their size.  */
3550   if (type && AGGREGATE_TYPE_P (type))
3551     {
3552       size = int_size_in_bytes (type);
3553     }
3554
3555   /* Variable sized arguments are always returned by reference.  */
3556   if (size < 0)
3557     return true;
3558
3559   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3560   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3561                                                &dummymode, &nregs,
3562                                                NULL))
3563     return false;
3564
3565   /* Arguments which are variable sized or larger than 2 registers are
3566      passed by reference unless they are a homogenous floating point
3567      aggregate.  */
3568   return size > 2 * UNITS_PER_WORD;
3569 }
3570
3571 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3572 static bool
3573 aarch64_return_in_msb (const_tree valtype)
3574 {
3575   machine_mode dummy_mode;
3576   int dummy_int;
3577
3578   /* Never happens in little-endian mode.  */
3579   if (!BYTES_BIG_ENDIAN)
3580     return false;
3581
3582   /* Only composite types smaller than or equal to 16 bytes can
3583      be potentially returned in registers.  */
3584   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3585       || int_size_in_bytes (valtype) <= 0
3586       || int_size_in_bytes (valtype) > 16)
3587     return false;
3588
3589   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3590      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3591      is always passed/returned in the least significant bits of fp/simd
3592      register(s).  */
3593   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3594                                                &dummy_mode, &dummy_int, NULL))
3595     return false;
3596
3597   return true;
3598 }
3599
3600 /* Implement TARGET_FUNCTION_VALUE.
3601    Define how to find the value returned by a function.  */
3602
3603 static rtx
3604 aarch64_function_value (const_tree type, const_tree func,
3605                         bool outgoing ATTRIBUTE_UNUSED)
3606 {
3607   machine_mode mode;
3608   int unsignedp;
3609   int count;
3610   machine_mode ag_mode;
3611
3612   mode = TYPE_MODE (type);
3613   if (INTEGRAL_TYPE_P (type))
3614     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3615
3616   if (aarch64_return_in_msb (type))
3617     {
3618       HOST_WIDE_INT size = int_size_in_bytes (type);
3619
3620       if (size % UNITS_PER_WORD != 0)
3621         {
3622           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3623           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3624         }
3625     }
3626
3627   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3628                                                &ag_mode, &count, NULL))
3629     {
3630       if (!aarch64_composite_type_p (type, mode))
3631         {
3632           gcc_assert (count == 1 && mode == ag_mode);
3633           return gen_rtx_REG (mode, V0_REGNUM);
3634         }
3635       else
3636         {
3637           int i;
3638           rtx par;
3639
3640           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3641           for (i = 0; i < count; i++)
3642             {
3643               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3644               rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3645               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3646               XVECEXP (par, 0, i) = tmp;
3647             }
3648           return par;
3649         }
3650     }
3651   else
3652     return gen_rtx_REG (mode, R0_REGNUM);
3653 }
3654
3655 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3656    Return true if REGNO is the number of a hard register in which the values
3657    of called function may come back.  */
3658
3659 static bool
3660 aarch64_function_value_regno_p (const unsigned int regno)
3661 {
3662   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3663      of 16-byte return values are: 128-bit integers and 16-byte small
3664      structures (excluding homogeneous floating-point aggregates).  */
3665   if (regno == R0_REGNUM || regno == R1_REGNUM)
3666     return true;
3667
3668   /* Up to four fp/simd registers can return a function value, e.g. a
3669      homogeneous floating-point aggregate having four members.  */
3670   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3671     return TARGET_FLOAT;
3672
3673   return false;
3674 }
3675
3676 /* Implement TARGET_RETURN_IN_MEMORY.
3677
3678    If the type T of the result of a function is such that
3679      void func (T arg)
3680    would require that arg be passed as a value in a register (or set of
3681    registers) according to the parameter passing rules, then the result
3682    is returned in the same registers as would be used for such an
3683    argument.  */
3684
3685 static bool
3686 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3687 {
3688   HOST_WIDE_INT size;
3689   machine_mode ag_mode;
3690   int count;
3691
3692   if (!AGGREGATE_TYPE_P (type)
3693       && TREE_CODE (type) != COMPLEX_TYPE
3694       && TREE_CODE (type) != VECTOR_TYPE)
3695     /* Simple scalar types always returned in registers.  */
3696     return false;
3697
3698   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3699                                                type,
3700                                                &ag_mode,
3701                                                &count,
3702                                                NULL))
3703     return false;
3704
3705   /* Types larger than 2 registers returned in memory.  */
3706   size = int_size_in_bytes (type);
3707   return (size < 0 || size > 2 * UNITS_PER_WORD);
3708 }
3709
3710 static bool
3711 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3712                                const_tree type, int *nregs)
3713 {
3714   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3715   return aarch64_vfp_is_call_or_return_candidate (mode,
3716                                                   type,
3717                                                   &pcum->aapcs_vfp_rmode,
3718                                                   nregs,
3719                                                   NULL);
3720 }
3721
3722 /* Given MODE and TYPE of a function argument, return the alignment in
3723    bits.  The idea is to suppress any stronger alignment requested by
3724    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3725    This is a helper function for local use only.  */
3726
3727 static unsigned int
3728 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3729 {
3730   if (!type)
3731     return GET_MODE_ALIGNMENT (mode);
3732
3733   if (integer_zerop (TYPE_SIZE (type)))
3734     return 0;
3735
3736   gcc_assert (TYPE_MODE (type) == mode);
3737
3738   if (!AGGREGATE_TYPE_P (type))
3739     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3740
3741   if (TREE_CODE (type) == ARRAY_TYPE)
3742     return TYPE_ALIGN (TREE_TYPE (type));
3743
3744   unsigned int alignment = 0;
3745   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3746     if (TREE_CODE (field) == FIELD_DECL)
3747       alignment = std::max (alignment, DECL_ALIGN (field));
3748
3749   return alignment;
3750 }
3751
3752 /* Layout a function argument according to the AAPCS64 rules.  The rule
3753    numbers refer to the rule numbers in the AAPCS64.  */
3754
3755 static void
3756 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3757                     const_tree type,
3758                     bool named ATTRIBUTE_UNUSED)
3759 {
3760   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3761   int ncrn, nvrn, nregs;
3762   bool allocate_ncrn, allocate_nvrn;
3763   HOST_WIDE_INT size;
3764
3765   /* We need to do this once per argument.  */
3766   if (pcum->aapcs_arg_processed)
3767     return;
3768
3769   pcum->aapcs_arg_processed = true;
3770
3771   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3772   if (type)
3773     size = int_size_in_bytes (type);
3774   else
3775     /* No frontends can create types with variable-sized modes, so we
3776        shouldn't be asked to pass or return them.  */
3777     size = GET_MODE_SIZE (mode).to_constant ();
3778   size = ROUND_UP (size, UNITS_PER_WORD);
3779
3780   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3781   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3782                                                  mode,
3783                                                  type,
3784                                                  &nregs);
3785
3786   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3787      The following code thus handles passing by SIMD/FP registers first.  */
3788
3789   nvrn = pcum->aapcs_nvrn;
3790
3791   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3792      and homogenous short-vector aggregates (HVA).  */
3793   if (allocate_nvrn)
3794     {
3795       if (!TARGET_FLOAT)
3796         aarch64_err_no_fpadvsimd (mode);
3797
3798       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3799         {
3800           pcum->aapcs_nextnvrn = nvrn + nregs;
3801           if (!aarch64_composite_type_p (type, mode))
3802             {
3803               gcc_assert (nregs == 1);
3804               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3805             }
3806           else
3807             {
3808               rtx par;
3809               int i;
3810               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3811               for (i = 0; i < nregs; i++)
3812                 {
3813                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3814                                          V0_REGNUM + nvrn + i);
3815                   rtx offset = gen_int_mode
3816                     (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3817                   tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3818                   XVECEXP (par, 0, i) = tmp;
3819                 }
3820               pcum->aapcs_reg = par;
3821             }
3822           return;
3823         }
3824       else
3825         {
3826           /* C.3 NSRN is set to 8.  */
3827           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3828           goto on_stack;
3829         }
3830     }
3831
3832   ncrn = pcum->aapcs_ncrn;
3833   nregs = size / UNITS_PER_WORD;
3834
3835   /* C6 - C9.  though the sign and zero extension semantics are
3836      handled elsewhere.  This is the case where the argument fits
3837      entirely general registers.  */
3838   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3839     {
3840
3841       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3842
3843       /* C.8 if the argument has an alignment of 16 then the NGRN is
3844          rounded up to the next even number.  */
3845       if (nregs == 2
3846           && ncrn % 2
3847           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3848              comparison is there because for > 16 * BITS_PER_UNIT
3849              alignment nregs should be > 2 and therefore it should be
3850              passed by reference rather than value.  */
3851           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3852         {
3853           ++ncrn;
3854           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3855         }
3856
3857       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3858          A reg is still generated for it, but the caller should be smart
3859          enough not to use it.  */
3860       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3861         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3862       else
3863         {
3864           rtx par;
3865           int i;
3866
3867           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3868           for (i = 0; i < nregs; i++)
3869             {
3870               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3871               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3872                                        GEN_INT (i * UNITS_PER_WORD));
3873               XVECEXP (par, 0, i) = tmp;
3874             }
3875           pcum->aapcs_reg = par;
3876         }
3877
3878       pcum->aapcs_nextncrn = ncrn + nregs;
3879       return;
3880     }
3881
3882   /* C.11  */
3883   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3884
3885   /* The argument is passed on stack; record the needed number of words for
3886      this argument and align the total size if necessary.  */
3887 on_stack:
3888   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3889
3890   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3891     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3892                                        16 / UNITS_PER_WORD);
3893   return;
3894 }
3895
3896 /* Implement TARGET_FUNCTION_ARG.  */
3897
3898 static rtx
3899 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3900                       const_tree type, bool named)
3901 {
3902   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3903   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3904
3905   if (mode == VOIDmode)
3906     return NULL_RTX;
3907
3908   aarch64_layout_arg (pcum_v, mode, type, named);
3909   return pcum->aapcs_reg;
3910 }
3911
3912 void
3913 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3914                            const_tree fntype ATTRIBUTE_UNUSED,
3915                            rtx libname ATTRIBUTE_UNUSED,
3916                            const_tree fndecl ATTRIBUTE_UNUSED,
3917                            unsigned n_named ATTRIBUTE_UNUSED)
3918 {
3919   pcum->aapcs_ncrn = 0;
3920   pcum->aapcs_nvrn = 0;
3921   pcum->aapcs_nextncrn = 0;
3922   pcum->aapcs_nextnvrn = 0;
3923   pcum->pcs_variant = ARM_PCS_AAPCS64;
3924   pcum->aapcs_reg = NULL_RTX;
3925   pcum->aapcs_arg_processed = false;
3926   pcum->aapcs_stack_words = 0;
3927   pcum->aapcs_stack_size = 0;
3928
3929   if (!TARGET_FLOAT
3930       && fndecl && TREE_PUBLIC (fndecl)
3931       && fntype && fntype != error_mark_node)
3932     {
3933       const_tree type = TREE_TYPE (fntype);
3934       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
3935       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
3936       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3937                                                    &mode, &nregs, NULL))
3938         aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3939     }
3940   return;
3941 }
3942
3943 static void
3944 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3945                               machine_mode mode,
3946                               const_tree type,
3947                               bool named)
3948 {
3949   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3950   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3951     {
3952       aarch64_layout_arg (pcum_v, mode, type, named);
3953       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3954                   != (pcum->aapcs_stack_words != 0));
3955       pcum->aapcs_arg_processed = false;
3956       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3957       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3958       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3959       pcum->aapcs_stack_words = 0;
3960       pcum->aapcs_reg = NULL_RTX;
3961     }
3962 }
3963
3964 bool
3965 aarch64_function_arg_regno_p (unsigned regno)
3966 {
3967   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3968           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3969 }
3970
3971 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3972    PARM_BOUNDARY bits of alignment, but will be given anything up
3973    to STACK_BOUNDARY bits if the type requires it.  This makes sure
3974    that both before and after the layout of each argument, the Next
3975    Stacked Argument Address (NSAA) will have a minimum alignment of
3976    8 bytes.  */
3977
3978 static unsigned int
3979 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3980 {
3981   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3982   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3983 }
3984
3985 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
3986
3987 static fixed_size_mode
3988 aarch64_get_reg_raw_mode (int regno)
3989 {
3990   if (TARGET_SVE && FP_REGNUM_P (regno))
3991     /* Don't use the SVE part of the register for __builtin_apply and
3992        __builtin_return.  The SVE registers aren't used by the normal PCS,
3993        so using them there would be a waste of time.  The PCS extensions
3994        for SVE types are fundamentally incompatible with the
3995        __builtin_return/__builtin_apply interface.  */
3996     return as_a <fixed_size_mode> (V16QImode);
3997   return default_get_reg_raw_mode (regno);
3998 }
3999
4000 /* Implement TARGET_FUNCTION_ARG_PADDING.
4001
4002    Small aggregate types are placed in the lowest memory address.
4003
4004    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
4005
4006 static pad_direction
4007 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4008 {
4009   /* On little-endian targets, the least significant byte of every stack
4010      argument is passed at the lowest byte address of the stack slot.  */
4011   if (!BYTES_BIG_ENDIAN)
4012     return PAD_UPWARD;
4013
4014   /* Otherwise, integral, floating-point and pointer types are padded downward:
4015      the least significant byte of a stack argument is passed at the highest
4016      byte address of the stack slot.  */
4017   if (type
4018       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4019          || POINTER_TYPE_P (type))
4020       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4021     return PAD_DOWNWARD;
4022
4023   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
4024   return PAD_UPWARD;
4025 }
4026
4027 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4028
4029    It specifies padding for the last (may also be the only)
4030    element of a block move between registers and memory.  If
4031    assuming the block is in the memory, padding upward means that
4032    the last element is padded after its highest significant byte,
4033    while in downward padding, the last element is padded at the
4034    its least significant byte side.
4035
4036    Small aggregates and small complex types are always padded
4037    upwards.
4038
4039    We don't need to worry about homogeneous floating-point or
4040    short-vector aggregates; their move is not affected by the
4041    padding direction determined here.  Regardless of endianness,
4042    each element of such an aggregate is put in the least
4043    significant bits of a fp/simd register.
4044
4045    Return !BYTES_BIG_ENDIAN if the least significant byte of the
4046    register has useful data, and return the opposite if the most
4047    significant byte does.  */
4048
4049 bool
4050 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4051                      bool first ATTRIBUTE_UNUSED)
4052 {
4053
4054   /* Small composite types are always padded upward.  */
4055   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4056     {
4057       HOST_WIDE_INT size;
4058       if (type)
4059         size = int_size_in_bytes (type);
4060       else
4061         /* No frontends can create types with variable-sized modes, so we
4062            shouldn't be asked to pass or return them.  */
4063         size = GET_MODE_SIZE (mode).to_constant ();
4064       if (size < 2 * UNITS_PER_WORD)
4065         return true;
4066     }
4067
4068   /* Otherwise, use the default padding.  */
4069   return !BYTES_BIG_ENDIAN;
4070 }
4071
4072 static scalar_int_mode
4073 aarch64_libgcc_cmp_return_mode (void)
4074 {
4075   return SImode;
4076 }
4077
4078 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4079
4080 /* We use the 12-bit shifted immediate arithmetic instructions so values
4081    must be multiple of (1 << 12), i.e. 4096.  */
4082 #define ARITH_FACTOR 4096
4083
4084 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4085 #error Cannot use simple address calculation for stack probing
4086 #endif
4087
4088 /* The pair of scratch registers used for stack probing.  */
4089 #define PROBE_STACK_FIRST_REG  R9_REGNUM
4090 #define PROBE_STACK_SECOND_REG R10_REGNUM
4091
4092 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4093    inclusive.  These are offsets from the current stack pointer.  */
4094
4095 static void
4096 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4097 {
4098   HOST_WIDE_INT size;
4099   if (!poly_size.is_constant (&size))
4100     {
4101       sorry ("stack probes for SVE frames");
4102       return;
4103     }
4104
4105   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4106
4107   /* See the same assertion on PROBE_INTERVAL above.  */
4108   gcc_assert ((first % ARITH_FACTOR) == 0);
4109
4110   /* See if we have a constant small number of probes to generate.  If so,
4111      that's the easy case.  */
4112   if (size <= PROBE_INTERVAL)
4113     {
4114       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4115
4116       emit_set_insn (reg1,
4117                      plus_constant (Pmode,
4118                                     stack_pointer_rtx, -(first + base)));
4119       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4120     }
4121
4122   /* The run-time loop is made up of 8 insns in the generic case while the
4123      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
4124   else if (size <= 4 * PROBE_INTERVAL)
4125     {
4126       HOST_WIDE_INT i, rem;
4127
4128       emit_set_insn (reg1,
4129                      plus_constant (Pmode,
4130                                     stack_pointer_rtx,
4131                                     -(first + PROBE_INTERVAL)));
4132       emit_stack_probe (reg1);
4133
4134       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4135          it exceeds SIZE.  If only two probes are needed, this will not
4136          generate any code.  Then probe at FIRST + SIZE.  */
4137       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4138         {
4139           emit_set_insn (reg1,
4140                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4141           emit_stack_probe (reg1);
4142         }
4143
4144       rem = size - (i - PROBE_INTERVAL);
4145       if (rem > 256)
4146         {
4147           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4148
4149           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4150           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4151         }
4152       else
4153         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4154     }
4155
4156   /* Otherwise, do the same as above, but in a loop.  Note that we must be
4157      extra careful with variables wrapping around because we might be at
4158      the very top (or the very bottom) of the address space and we have
4159      to be able to handle this case properly; in particular, we use an
4160      equality test for the loop condition.  */
4161   else
4162     {
4163       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4164
4165       /* Step 1: round SIZE to the previous multiple of the interval.  */
4166
4167       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4168
4169
4170       /* Step 2: compute initial and final value of the loop counter.  */
4171
4172       /* TEST_ADDR = SP + FIRST.  */
4173       emit_set_insn (reg1,
4174                      plus_constant (Pmode, stack_pointer_rtx, -first));
4175
4176       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
4177       HOST_WIDE_INT adjustment = - (first + rounded_size);
4178       if (! aarch64_uimm12_shift (adjustment))
4179         {
4180           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4181                                           true, Pmode);
4182           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4183         }
4184       else
4185         emit_set_insn (reg2,
4186                        plus_constant (Pmode, stack_pointer_rtx, adjustment));
4187
4188       /* Step 3: the loop
4189
4190          do
4191            {
4192              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4193              probe at TEST_ADDR
4194            }
4195          while (TEST_ADDR != LAST_ADDR)
4196
4197          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4198          until it is equal to ROUNDED_SIZE.  */
4199
4200       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4201
4202
4203       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4204          that SIZE is equal to ROUNDED_SIZE.  */
4205
4206       if (size != rounded_size)
4207         {
4208           HOST_WIDE_INT rem = size - rounded_size;
4209
4210           if (rem > 256)
4211             {
4212               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4213
4214               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4215               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4216             }
4217           else
4218             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4219         }
4220     }
4221
4222   /* Make sure nothing is scheduled before we are done.  */
4223   emit_insn (gen_blockage ());
4224 }
4225
4226 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
4227    absolute addresses.  */
4228
4229 const char *
4230 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4231 {
4232   static int labelno = 0;
4233   char loop_lab[32];
4234   rtx xops[2];
4235
4236   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4237
4238   /* Loop.  */
4239   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4240
4241   HOST_WIDE_INT stack_clash_probe_interval
4242     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4243
4244   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
4245   xops[0] = reg1;
4246   HOST_WIDE_INT interval;
4247   if (flag_stack_clash_protection)
4248     interval = stack_clash_probe_interval;
4249   else
4250     interval = PROBE_INTERVAL;
4251
4252   gcc_assert (aarch64_uimm12_shift (interval));
4253   xops[1] = GEN_INT (interval);
4254
4255   output_asm_insn ("sub\t%0, %0, %1", xops);
4256
4257   /* If doing stack clash protection then we probe up by the ABI specified
4258      amount.  We do this because we're dropping full pages at a time in the
4259      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
4260   if (flag_stack_clash_protection)
4261     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4262   else
4263     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4264
4265   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
4266      by this amount for each iteration.  */
4267   output_asm_insn ("str\txzr, [%0, %1]", xops);
4268
4269   /* Test if TEST_ADDR == LAST_ADDR.  */
4270   xops[1] = reg2;
4271   output_asm_insn ("cmp\t%0, %1", xops);
4272
4273   /* Branch.  */
4274   fputs ("\tb.ne\t", asm_out_file);
4275   assemble_name_raw (asm_out_file, loop_lab);
4276   fputc ('\n', asm_out_file);
4277
4278   return "";
4279 }
4280
4281 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4282    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4283    of GUARD_SIZE.  When a probe is emitted it is done at most
4284    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4285    at most MIN_PROBE_THRESHOLD.  By the end of this function
4286    BASE = BASE - ADJUSTMENT.  */
4287
4288 const char *
4289 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4290                                       rtx min_probe_threshold, rtx guard_size)
4291 {
4292   /* This function is not allowed to use any instruction generation function
4293      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
4294      so instead emit the code you want using output_asm_insn.  */
4295   gcc_assert (flag_stack_clash_protection);
4296   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4297   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4298
4299   /* The minimum required allocation before the residual requires probing.  */
4300   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4301
4302   /* Clamp the value down to the nearest value that can be used with a cmp.  */
4303   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4304   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4305
4306   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4307   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4308
4309   static int labelno = 0;
4310   char loop_start_lab[32];
4311   char loop_end_lab[32];
4312   rtx xops[2];
4313
4314   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4315   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4316
4317   /* Emit loop start label.  */
4318   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4319
4320   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
4321   xops[0] = adjustment;
4322   xops[1] = probe_offset_value_rtx;
4323   output_asm_insn ("cmp\t%0, %1", xops);
4324
4325   /* Branch to end if not enough adjustment to probe.  */
4326   fputs ("\tb.lt\t", asm_out_file);
4327   assemble_name_raw (asm_out_file, loop_end_lab);
4328   fputc ('\n', asm_out_file);
4329
4330   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
4331   xops[0] = base;
4332   xops[1] = probe_offset_value_rtx;
4333   output_asm_insn ("sub\t%0, %0, %1", xops);
4334
4335   /* Probe at BASE.  */
4336   xops[1] = const0_rtx;
4337   output_asm_insn ("str\txzr, [%0, %1]", xops);
4338
4339   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
4340   xops[0] = adjustment;
4341   xops[1] = probe_offset_value_rtx;
4342   output_asm_insn ("sub\t%0, %0, %1", xops);
4343
4344   /* Branch to start if still more bytes to allocate.  */
4345   fputs ("\tb\t", asm_out_file);
4346   assemble_name_raw (asm_out_file, loop_start_lab);
4347   fputc ('\n', asm_out_file);
4348
4349   /* No probe leave.  */
4350   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4351
4352   /* BASE = BASE - ADJUSTMENT.  */
4353   xops[0] = base;
4354   xops[1] = adjustment;
4355   output_asm_insn ("sub\t%0, %0, %1", xops);
4356   return "";
4357 }
4358
4359 /* Determine whether a frame chain needs to be generated.  */
4360 static bool
4361 aarch64_needs_frame_chain (void)
4362 {
4363   /* Force a frame chain for EH returns so the return address is at FP+8.  */
4364   if (frame_pointer_needed || crtl->calls_eh_return)
4365     return true;
4366
4367   /* A leaf function cannot have calls or write LR.  */
4368   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4369
4370   /* Don't use a frame chain in leaf functions if leaf frame pointers
4371      are disabled.  */
4372   if (flag_omit_leaf_frame_pointer && is_leaf)
4373     return false;
4374
4375   return aarch64_use_frame_pointer;
4376 }
4377
4378 /* Mark the registers that need to be saved by the callee and calculate
4379    the size of the callee-saved registers area and frame record (both FP
4380    and LR may be omitted).  */
4381 static void
4382 aarch64_layout_frame (void)
4383 {
4384   HOST_WIDE_INT offset = 0;
4385   int regno, last_fp_reg = INVALID_REGNUM;
4386   bool simd_function = aarch64_simd_decl_p (cfun->decl);
4387
4388   cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4389
4390   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
4391      the mid-end is doing.  */
4392   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4393
4394 #define SLOT_NOT_REQUIRED (-2)
4395 #define SLOT_REQUIRED     (-1)
4396
4397   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4398   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4399
4400   /* If this is a non-leaf simd function with calls we assume that
4401      at least one of those calls is to a non-simd function and thus
4402      we must save V8 to V23 in the prologue.  */
4403
4404   if (simd_function && !crtl->is_leaf)
4405     {
4406       for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4407         if (FP_SIMD_SAVED_REGNUM_P (regno))
4408           df_set_regs_ever_live (regno, true);
4409     }
4410
4411   /* First mark all the registers that really need to be saved...  */
4412   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4413     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4414
4415   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4416     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4417
4418   /* ... that includes the eh data registers (if needed)...  */
4419   if (crtl->calls_eh_return)
4420     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4421       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4422         = SLOT_REQUIRED;
4423
4424   /* ... and any callee saved register that dataflow says is live.  */
4425   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4426     if (df_regs_ever_live_p (regno)
4427         && (regno == R30_REGNUM
4428             || !call_used_regs[regno]))
4429       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4430
4431   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4432     if (df_regs_ever_live_p (regno)
4433         && (!call_used_regs[regno]
4434             || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4435       {
4436         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4437         last_fp_reg = regno;
4438       }
4439
4440   if (cfun->machine->frame.emit_frame_chain)
4441     {
4442       /* FP and LR are placed in the linkage record.  */
4443       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4444       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4445       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4446       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4447       offset = 2 * UNITS_PER_WORD;
4448     }
4449
4450   /* With stack-clash, LR must be saved in non-leaf functions.  */
4451   gcc_assert (crtl->is_leaf
4452               || (cfun->machine->frame.reg_offset[R30_REGNUM]
4453                   != SLOT_NOT_REQUIRED));
4454
4455   /* Now assign stack slots for them.  */
4456   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4457     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4458       {
4459         cfun->machine->frame.reg_offset[regno] = offset;
4460         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4461           cfun->machine->frame.wb_candidate1 = regno;
4462         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4463           cfun->machine->frame.wb_candidate2 = regno;
4464         offset += UNITS_PER_WORD;
4465       }
4466
4467   HOST_WIDE_INT max_int_offset = offset;
4468   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4469   bool has_align_gap = offset != max_int_offset;
4470
4471   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4472     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4473       {
4474         /* If there is an alignment gap between integer and fp callee-saves,
4475            allocate the last fp register to it if possible.  */
4476         if (regno == last_fp_reg
4477             && has_align_gap
4478             && !simd_function
4479             && (offset & 8) == 0)
4480           {
4481             cfun->machine->frame.reg_offset[regno] = max_int_offset;
4482             break;
4483           }
4484
4485         cfun->machine->frame.reg_offset[regno] = offset;
4486         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4487           cfun->machine->frame.wb_candidate1 = regno;
4488         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4489                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4490           cfun->machine->frame.wb_candidate2 = regno;
4491         offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4492       }
4493
4494   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4495
4496   cfun->machine->frame.saved_regs_size = offset;
4497
4498   HOST_WIDE_INT varargs_and_saved_regs_size
4499     = offset + cfun->machine->frame.saved_varargs_size;
4500
4501   cfun->machine->frame.hard_fp_offset
4502     = aligned_upper_bound (varargs_and_saved_regs_size
4503                            + get_frame_size (),
4504                            STACK_BOUNDARY / BITS_PER_UNIT);
4505
4506   /* Both these values are already aligned.  */
4507   gcc_assert (multiple_p (crtl->outgoing_args_size,
4508                           STACK_BOUNDARY / BITS_PER_UNIT));
4509   cfun->machine->frame.frame_size
4510     = (cfun->machine->frame.hard_fp_offset
4511        + crtl->outgoing_args_size);
4512
4513   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4514
4515   cfun->machine->frame.initial_adjust = 0;
4516   cfun->machine->frame.final_adjust = 0;
4517   cfun->machine->frame.callee_adjust = 0;
4518   cfun->machine->frame.callee_offset = 0;
4519
4520   HOST_WIDE_INT max_push_offset = 0;
4521   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4522     max_push_offset = 512;
4523   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4524     max_push_offset = 256;
4525
4526   HOST_WIDE_INT const_size, const_fp_offset;
4527   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4528       && const_size < max_push_offset
4529       && known_eq (crtl->outgoing_args_size, 0))
4530     {
4531       /* Simple, small frame with no outgoing arguments:
4532          stp reg1, reg2, [sp, -frame_size]!
4533          stp reg3, reg4, [sp, 16]  */
4534       cfun->machine->frame.callee_adjust = const_size;
4535     }
4536   else if (known_lt (crtl->outgoing_args_size
4537                      + cfun->machine->frame.saved_regs_size, 512)
4538            && !(cfun->calls_alloca
4539                 && known_lt (cfun->machine->frame.hard_fp_offset,
4540                              max_push_offset)))
4541     {
4542       /* Frame with small outgoing arguments:
4543          sub sp, sp, frame_size
4544          stp reg1, reg2, [sp, outgoing_args_size]
4545          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4546       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4547       cfun->machine->frame.callee_offset
4548         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4549     }
4550   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4551            && const_fp_offset < max_push_offset)
4552     {
4553       /* Frame with large outgoing arguments but a small local area:
4554          stp reg1, reg2, [sp, -hard_fp_offset]!
4555          stp reg3, reg4, [sp, 16]
4556          sub sp, sp, outgoing_args_size  */
4557       cfun->machine->frame.callee_adjust = const_fp_offset;
4558       cfun->machine->frame.final_adjust
4559         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4560     }
4561   else
4562     {
4563       /* Frame with large local area and outgoing arguments using frame pointer:
4564          sub sp, sp, hard_fp_offset
4565          stp x29, x30, [sp, 0]
4566          add x29, sp, 0
4567          stp reg3, reg4, [sp, 16]
4568          sub sp, sp, outgoing_args_size  */
4569       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4570       cfun->machine->frame.final_adjust
4571         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4572     }
4573
4574   cfun->machine->frame.laid_out = true;
4575 }
4576
4577 /* Return true if the register REGNO is saved on entry to
4578    the current function.  */
4579
4580 static bool
4581 aarch64_register_saved_on_entry (int regno)
4582 {
4583   return cfun->machine->frame.reg_offset[regno] >= 0;
4584 }
4585
4586 /* Return the next register up from REGNO up to LIMIT for the callee
4587    to save.  */
4588
4589 static unsigned
4590 aarch64_next_callee_save (unsigned regno, unsigned limit)
4591 {
4592   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4593     regno ++;
4594   return regno;
4595 }
4596
4597 /* Push the register number REGNO of mode MODE to the stack with write-back
4598    adjusting the stack by ADJUSTMENT.  */
4599
4600 static void
4601 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4602                            HOST_WIDE_INT adjustment)
4603  {
4604   rtx base_rtx = stack_pointer_rtx;
4605   rtx insn, reg, mem;
4606
4607   reg = gen_rtx_REG (mode, regno);
4608   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4609                             plus_constant (Pmode, base_rtx, -adjustment));
4610   mem = gen_frame_mem (mode, mem);
4611
4612   insn = emit_move_insn (mem, reg);
4613   RTX_FRAME_RELATED_P (insn) = 1;
4614 }
4615
4616 /* Generate and return an instruction to store the pair of registers
4617    REG and REG2 of mode MODE to location BASE with write-back adjusting
4618    the stack location BASE by ADJUSTMENT.  */
4619
4620 static rtx
4621 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4622                           HOST_WIDE_INT adjustment)
4623 {
4624   switch (mode)
4625     {
4626     case E_DImode:
4627       return gen_storewb_pairdi_di (base, base, reg, reg2,
4628                                     GEN_INT (-adjustment),
4629                                     GEN_INT (UNITS_PER_WORD - adjustment));
4630     case E_DFmode:
4631       return gen_storewb_pairdf_di (base, base, reg, reg2,
4632                                     GEN_INT (-adjustment),
4633                                     GEN_INT (UNITS_PER_WORD - adjustment));
4634     case E_TFmode:
4635       return gen_storewb_pairtf_di (base, base, reg, reg2,
4636                                     GEN_INT (-adjustment),
4637                                     GEN_INT (UNITS_PER_VREG - adjustment));
4638     default:
4639       gcc_unreachable ();
4640     }
4641 }
4642
4643 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4644    stack pointer by ADJUSTMENT.  */
4645
4646 static void
4647 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4648 {
4649   rtx_insn *insn;
4650   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4651
4652   if (regno2 == INVALID_REGNUM)
4653     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4654
4655   rtx reg1 = gen_rtx_REG (mode, regno1);
4656   rtx reg2 = gen_rtx_REG (mode, regno2);
4657
4658   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4659                                               reg2, adjustment));
4660   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4661   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4662   RTX_FRAME_RELATED_P (insn) = 1;
4663 }
4664
4665 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4666    adjusting it by ADJUSTMENT afterwards.  */
4667
4668 static rtx
4669 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4670                          HOST_WIDE_INT adjustment)
4671 {
4672   switch (mode)
4673     {
4674     case E_DImode:
4675       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4676                                    GEN_INT (UNITS_PER_WORD));
4677     case E_DFmode:
4678       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4679                                    GEN_INT (UNITS_PER_WORD));
4680     case E_TFmode:
4681       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4682                                    GEN_INT (UNITS_PER_VREG));
4683     default:
4684       gcc_unreachable ();
4685     }
4686 }
4687
4688 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4689    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4690    into CFI_OPS.  */
4691
4692 static void
4693 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4694                   rtx *cfi_ops)
4695 {
4696   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4697   rtx reg1 = gen_rtx_REG (mode, regno1);
4698
4699   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4700
4701   if (regno2 == INVALID_REGNUM)
4702     {
4703       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4704       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4705       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4706     }
4707   else
4708     {
4709       rtx reg2 = gen_rtx_REG (mode, regno2);
4710       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4711       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4712                                           reg2, adjustment));
4713     }
4714 }
4715
4716 /* Generate and return a store pair instruction of mode MODE to store
4717    register REG1 to MEM1 and register REG2 to MEM2.  */
4718
4719 static rtx
4720 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4721                         rtx reg2)
4722 {
4723   switch (mode)
4724     {
4725     case E_DImode:
4726       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4727
4728     case E_DFmode:
4729       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4730
4731     case E_TFmode:
4732       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4733
4734     default:
4735       gcc_unreachable ();
4736     }
4737 }
4738
4739 /* Generate and regurn a load pair isntruction of mode MODE to load register
4740    REG1 from MEM1 and register REG2 from MEM2.  */
4741
4742 static rtx
4743 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4744                        rtx mem2)
4745 {
4746   switch (mode)
4747     {
4748     case E_DImode:
4749       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4750
4751     case E_DFmode:
4752       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4753
4754     case E_TFmode:
4755       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4756
4757     default:
4758       gcc_unreachable ();
4759     }
4760 }
4761
4762 /* Return TRUE if return address signing should be enabled for the current
4763    function, otherwise return FALSE.  */
4764
4765 bool
4766 aarch64_return_address_signing_enabled (void)
4767 {
4768   /* This function should only be called after frame laid out.   */
4769   gcc_assert (cfun->machine->frame.laid_out);
4770
4771   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4772      if it's LR is pushed onto stack.  */
4773   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4774           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4775               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4776 }
4777
4778 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
4779 bool
4780 aarch64_bti_enabled (void)
4781 {
4782   return (aarch64_enable_bti == 1);
4783 }
4784
4785 /* Emit code to save the callee-saved registers from register number START
4786    to LIMIT to the stack at the location starting at offset START_OFFSET,
4787    skipping any write-back candidates if SKIP_WB is true.  */
4788
4789 static void
4790 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4791                            unsigned start, unsigned limit, bool skip_wb)
4792 {
4793   rtx_insn *insn;
4794   unsigned regno;
4795   unsigned regno2;
4796
4797   for (regno = aarch64_next_callee_save (start, limit);
4798        regno <= limit;
4799        regno = aarch64_next_callee_save (regno + 1, limit))
4800     {
4801       rtx reg, mem;
4802       poly_int64 offset;
4803       int offset_diff;
4804
4805       if (skip_wb
4806           && (regno == cfun->machine->frame.wb_candidate1
4807               || regno == cfun->machine->frame.wb_candidate2))
4808         continue;
4809
4810       if (cfun->machine->reg_is_wrapped_separately[regno])
4811        continue;
4812
4813       reg = gen_rtx_REG (mode, regno);
4814       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4815       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4816                                                 offset));
4817
4818       regno2 = aarch64_next_callee_save (regno + 1, limit);
4819       offset_diff = cfun->machine->frame.reg_offset[regno2]
4820                     - cfun->machine->frame.reg_offset[regno];
4821
4822       if (regno2 <= limit
4823           && !cfun->machine->reg_is_wrapped_separately[regno2]
4824           && known_eq (GET_MODE_SIZE (mode), offset_diff))
4825         {
4826           rtx reg2 = gen_rtx_REG (mode, regno2);
4827           rtx mem2;
4828
4829           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4830           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4831                                                      offset));
4832           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4833                                                     reg2));
4834
4835           /* The first part of a frame-related parallel insn is
4836              always assumed to be relevant to the frame
4837              calculations; subsequent parts, are only
4838              frame-related if explicitly marked.  */
4839           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4840           regno = regno2;
4841         }
4842       else
4843         insn = emit_move_insn (mem, reg);
4844
4845       RTX_FRAME_RELATED_P (insn) = 1;
4846     }
4847 }
4848
4849 /* Emit code to restore the callee registers of mode MODE from register
4850    number START up to and including LIMIT.  Restore from the stack offset
4851    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4852    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4853
4854 static void
4855 aarch64_restore_callee_saves (machine_mode mode,
4856                               poly_int64 start_offset, unsigned start,
4857                               unsigned limit, bool skip_wb, rtx *cfi_ops)
4858 {
4859   rtx base_rtx = stack_pointer_rtx;
4860   unsigned regno;
4861   unsigned regno2;
4862   poly_int64 offset;
4863
4864   for (regno = aarch64_next_callee_save (start, limit);
4865        regno <= limit;
4866        regno = aarch64_next_callee_save (regno + 1, limit))
4867     {
4868       if (cfun->machine->reg_is_wrapped_separately[regno])
4869        continue;
4870
4871       rtx reg, mem;
4872       int offset_diff;
4873
4874       if (skip_wb
4875           && (regno == cfun->machine->frame.wb_candidate1
4876               || regno == cfun->machine->frame.wb_candidate2))
4877         continue;
4878
4879       reg = gen_rtx_REG (mode, regno);
4880       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4881       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4882
4883       regno2 = aarch64_next_callee_save (regno + 1, limit);
4884       offset_diff = cfun->machine->frame.reg_offset[regno2]
4885                     - cfun->machine->frame.reg_offset[regno];
4886
4887       if (regno2 <= limit
4888           && !cfun->machine->reg_is_wrapped_separately[regno2]
4889           && known_eq (GET_MODE_SIZE (mode), offset_diff))
4890         {
4891           rtx reg2 = gen_rtx_REG (mode, regno2);
4892           rtx mem2;
4893
4894           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4895           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4896           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4897
4898           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4899           regno = regno2;
4900         }
4901       else
4902         emit_move_insn (reg, mem);
4903       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4904     }
4905 }
4906
4907 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4908    of MODE.  */
4909
4910 static inline bool
4911 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4912 {
4913   HOST_WIDE_INT multiple;
4914   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4915           && IN_RANGE (multiple, -8, 7));
4916 }
4917
4918 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4919    of MODE.  */
4920
4921 static inline bool
4922 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4923 {
4924   HOST_WIDE_INT multiple;
4925   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4926           && IN_RANGE (multiple, 0, 63));
4927 }
4928
4929 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4930    of MODE.  */
4931
4932 bool
4933 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4934 {
4935   HOST_WIDE_INT multiple;
4936   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4937           && IN_RANGE (multiple, -64, 63));
4938 }
4939
4940 /* Return true if OFFSET is a signed 9-bit value.  */
4941
4942 bool
4943 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4944                                        poly_int64 offset)
4945 {
4946   HOST_WIDE_INT const_offset;
4947   return (offset.is_constant (&const_offset)
4948           && IN_RANGE (const_offset, -256, 255));
4949 }
4950
4951 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4952    of MODE.  */
4953
4954 static inline bool
4955 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4956 {
4957   HOST_WIDE_INT multiple;
4958   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4959           && IN_RANGE (multiple, -256, 255));
4960 }
4961
4962 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4963    of MODE.  */
4964
4965 static inline bool
4966 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4967 {
4968   HOST_WIDE_INT multiple;
4969   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4970           && IN_RANGE (multiple, 0, 4095));
4971 }
4972
4973 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
4974
4975 static sbitmap
4976 aarch64_get_separate_components (void)
4977 {
4978   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4979   bitmap_clear (components);
4980
4981   /* The registers we need saved to the frame.  */
4982   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4983     if (aarch64_register_saved_on_entry (regno))
4984       {
4985         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4986         if (!frame_pointer_needed)
4987           offset += cfun->machine->frame.frame_size
4988                     - cfun->machine->frame.hard_fp_offset;
4989         /* Check that we can access the stack slot of the register with one
4990            direct load with no adjustments needed.  */
4991         if (offset_12bit_unsigned_scaled_p (DImode, offset))
4992           bitmap_set_bit (components, regno);
4993       }
4994
4995   /* Don't mess with the hard frame pointer.  */
4996   if (frame_pointer_needed)
4997     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4998
4999   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5000   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5001   /* If registers have been chosen to be stored/restored with
5002      writeback don't interfere with them to avoid having to output explicit
5003      stack adjustment instructions.  */
5004   if (reg2 != INVALID_REGNUM)
5005     bitmap_clear_bit (components, reg2);
5006   if (reg1 != INVALID_REGNUM)
5007     bitmap_clear_bit (components, reg1);
5008
5009   bitmap_clear_bit (components, LR_REGNUM);
5010   bitmap_clear_bit (components, SP_REGNUM);
5011
5012   return components;
5013 }
5014
5015 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
5016
5017 static sbitmap
5018 aarch64_components_for_bb (basic_block bb)
5019 {
5020   bitmap in = DF_LIVE_IN (bb);
5021   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5022   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5023   bool simd_function = aarch64_simd_decl_p (cfun->decl);
5024
5025   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5026   bitmap_clear (components);
5027
5028   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
5029   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5030     if ((!call_used_regs[regno]
5031         || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5032        && (bitmap_bit_p (in, regno)
5033            || bitmap_bit_p (gen, regno)
5034            || bitmap_bit_p (kill, regno)))
5035       {
5036         unsigned regno2, offset, offset2;
5037         bitmap_set_bit (components, regno);
5038
5039         /* If there is a callee-save at an adjacent offset, add it too
5040            to increase the use of LDP/STP.  */
5041         offset = cfun->machine->frame.reg_offset[regno];
5042         regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5043
5044         if (regno2 <= LAST_SAVED_REGNUM)
5045           {
5046             offset2 = cfun->machine->frame.reg_offset[regno2];
5047             if ((offset & ~8) == (offset2 & ~8))
5048               bitmap_set_bit (components, regno2);
5049           }
5050       }
5051
5052   return components;
5053 }
5054
5055 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5056    Nothing to do for aarch64.  */
5057
5058 static void
5059 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5060 {
5061 }
5062
5063 /* Return the next set bit in BMP from START onwards.  Return the total number
5064    of bits in BMP if no set bit is found at or after START.  */
5065
5066 static unsigned int
5067 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5068 {
5069   unsigned int nbits = SBITMAP_SIZE (bmp);
5070   if (start == nbits)
5071     return start;
5072
5073   gcc_assert (start < nbits);
5074   for (unsigned int i = start; i < nbits; i++)
5075     if (bitmap_bit_p (bmp, i))
5076       return i;
5077
5078   return nbits;
5079 }
5080
5081 /* Do the work for aarch64_emit_prologue_components and
5082    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
5083    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5084    for these components or the epilogue sequence.  That is, it determines
5085    whether we should emit stores or loads and what kind of CFA notes to attach
5086    to the insns.  Otherwise the logic for the two sequences is very
5087    similar.  */
5088
5089 static void
5090 aarch64_process_components (sbitmap components, bool prologue_p)
5091 {
5092   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5093                              ? HARD_FRAME_POINTER_REGNUM
5094                              : STACK_POINTER_REGNUM);
5095
5096   unsigned last_regno = SBITMAP_SIZE (components);
5097   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5098   rtx_insn *insn = NULL;
5099
5100   while (regno != last_regno)
5101     {
5102       /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5103          so DFmode for the vector registers is enough.  For simd functions
5104          we want to save the low 128 bits.  */
5105       machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5106
5107       rtx reg = gen_rtx_REG (mode, regno);
5108       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5109       if (!frame_pointer_needed)
5110         offset += cfun->machine->frame.frame_size
5111                   - cfun->machine->frame.hard_fp_offset;
5112       rtx addr = plus_constant (Pmode, ptr_reg, offset);
5113       rtx mem = gen_frame_mem (mode, addr);
5114
5115       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5116       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5117       /* No more registers to handle after REGNO.
5118          Emit a single save/restore and exit.  */
5119       if (regno2 == last_regno)
5120         {
5121           insn = emit_insn (set);
5122           RTX_FRAME_RELATED_P (insn) = 1;
5123           if (prologue_p)
5124             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5125           else
5126             add_reg_note (insn, REG_CFA_RESTORE, reg);
5127           break;
5128         }
5129
5130       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5131       /* The next register is not of the same class or its offset is not
5132          mergeable with the current one into a pair.  */
5133       if (!satisfies_constraint_Ump (mem)
5134           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5135           || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5136           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5137                        GET_MODE_SIZE (mode)))
5138         {
5139           insn = emit_insn (set);
5140           RTX_FRAME_RELATED_P (insn) = 1;
5141           if (prologue_p)
5142             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5143           else
5144             add_reg_note (insn, REG_CFA_RESTORE, reg);
5145
5146           regno = regno2;
5147           continue;
5148         }
5149
5150       /* REGNO2 can be saved/restored in a pair with REGNO.  */
5151       rtx reg2 = gen_rtx_REG (mode, regno2);
5152       if (!frame_pointer_needed)
5153         offset2 += cfun->machine->frame.frame_size
5154                   - cfun->machine->frame.hard_fp_offset;
5155       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5156       rtx mem2 = gen_frame_mem (mode, addr2);
5157       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5158                              : gen_rtx_SET (reg2, mem2);
5159
5160       if (prologue_p)
5161         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5162       else
5163         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5164
5165       RTX_FRAME_RELATED_P (insn) = 1;
5166       if (prologue_p)
5167         {
5168           add_reg_note (insn, REG_CFA_OFFSET, set);
5169           add_reg_note (insn, REG_CFA_OFFSET, set2);
5170         }
5171       else
5172         {
5173           add_reg_note (insn, REG_CFA_RESTORE, reg);
5174           add_reg_note (insn, REG_CFA_RESTORE, reg2);
5175         }
5176
5177       regno = aarch64_get_next_set_bit (components, regno2 + 1);
5178     }
5179 }
5180
5181 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
5182
5183 static void
5184 aarch64_emit_prologue_components (sbitmap components)
5185 {
5186   aarch64_process_components (components, true);
5187 }
5188
5189 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
5190
5191 static void
5192 aarch64_emit_epilogue_components (sbitmap components)
5193 {
5194   aarch64_process_components (components, false);
5195 }
5196
5197 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
5198
5199 static void
5200 aarch64_set_handled_components (sbitmap components)
5201 {
5202   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5203     if (bitmap_bit_p (components, regno))
5204       cfun->machine->reg_is_wrapped_separately[regno] = true;
5205 }
5206
5207 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
5208    determining the probe offset for alloca.  */
5209
5210 static HOST_WIDE_INT
5211 aarch64_stack_clash_protection_alloca_probe_range (void)
5212 {
5213   return STACK_CLASH_CALLER_GUARD;
5214 }
5215
5216
5217 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5218    registers.  If POLY_SIZE is not large enough to require a probe this function
5219    will only adjust the stack.  When allocating the stack space
5220    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5221    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5222    arguments.  If we are then we ensure that any allocation larger than the ABI
5223    defined buffer needs a probe so that the invariant of having a 1KB buffer is
5224    maintained.
5225
5226    We emit barriers after each stack adjustment to prevent optimizations from
5227    breaking the invariant that we never drop the stack more than a page.  This
5228    invariant is needed to make it easier to correctly handle asynchronous
5229    events, e.g. if we were to allow the stack to be dropped by more than a page
5230    and then have multiple probes up and we take a signal somewhere in between
5231    then the signal handler doesn't know the state of the stack and can make no
5232    assumptions about which pages have been probed.  */
5233
5234 static void
5235 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5236                                         poly_int64 poly_size,
5237                                         bool frame_related_p,
5238                                         bool final_adjustment_p)
5239 {
5240   HOST_WIDE_INT guard_size
5241     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5242   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5243   /* When doing the final adjustment for the outgoing argument size we can't
5244      assume that LR was saved at position 0.  So subtract it's offset from the
5245      ABI safe buffer so that we don't accidentally allow an adjustment that
5246      would result in an allocation larger than the ABI buffer without
5247      probing.  */
5248   HOST_WIDE_INT min_probe_threshold
5249     = final_adjustment_p
5250       ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5251       : guard_size - guard_used_by_caller;
5252
5253   poly_int64 frame_size = cfun->machine->frame.frame_size;
5254
5255   /* We should always have a positive probe threshold.  */
5256   gcc_assert (min_probe_threshold > 0);
5257
5258   if (flag_stack_clash_protection && !final_adjustment_p)
5259     {
5260       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5261       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5262
5263       if (known_eq (frame_size, 0))
5264         {
5265           dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5266         }
5267       else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5268                && known_lt (final_adjust, guard_used_by_caller))
5269         {
5270           dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5271         }
5272     }
5273
5274   /* If SIZE is not large enough to require probing, just adjust the stack and
5275      exit.  */
5276   if (known_lt (poly_size, min_probe_threshold)
5277       || !flag_stack_clash_protection)
5278     {
5279       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5280       return;
5281     }
5282
5283   HOST_WIDE_INT size;
5284   /* Handle the SVE non-constant case first.  */
5285   if (!poly_size.is_constant (&size))
5286     {
5287      if (dump_file)
5288       {
5289         fprintf (dump_file, "Stack clash SVE prologue: ");
5290         print_dec (poly_size, dump_file);
5291         fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5292       }
5293
5294       /* First calculate the amount of bytes we're actually spilling.  */
5295       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5296                           poly_size, temp1, temp2, false, true);
5297
5298       rtx_insn *insn = get_last_insn ();
5299
5300       if (frame_related_p)
5301         {
5302           /* This is done to provide unwinding information for the stack
5303              adjustments we're about to do, however to prevent the optimizers
5304              from removing the R15 move and leaving the CFA note (which would be
5305              very wrong) we tie the old and new stack pointer together.
5306              The tie will expand to nothing but the optimizers will not touch
5307              the instruction.  */
5308           rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
5309           emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5310           emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5311
5312           /* We want the CFA independent of the stack pointer for the
5313              duration of the loop.  */
5314           add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5315           RTX_FRAME_RELATED_P (insn) = 1;
5316         }
5317
5318       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5319       rtx guard_const = gen_int_mode (guard_size, Pmode);
5320
5321       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5322                                                    stack_pointer_rtx, temp1,
5323                                                    probe_const, guard_const));
5324
5325       /* Now reset the CFA register if needed.  */
5326       if (frame_related_p)
5327         {
5328           add_reg_note (insn, REG_CFA_DEF_CFA,
5329                         gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5330                                       gen_int_mode (poly_size, Pmode)));
5331           RTX_FRAME_RELATED_P (insn) = 1;
5332         }
5333
5334       return;
5335     }
5336
5337   if (dump_file)
5338     fprintf (dump_file,
5339              "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5340              " bytes, probing will be required.\n", size);
5341
5342   /* Round size to the nearest multiple of guard_size, and calculate the
5343      residual as the difference between the original size and the rounded
5344      size.  */
5345   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5346   HOST_WIDE_INT residual = size - rounded_size;
5347
5348   /* We can handle a small number of allocations/probes inline.  Otherwise
5349      punt to a loop.  */
5350   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5351     {
5352       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5353         {
5354           aarch64_sub_sp (NULL, temp2, guard_size, true);
5355           emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5356                                            guard_used_by_caller));
5357           emit_insn (gen_blockage ());
5358         }
5359       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5360     }
5361   else
5362     {
5363       /* Compute the ending address.  */
5364       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5365                           temp1, NULL, false, true);
5366       rtx_insn *insn = get_last_insn ();
5367
5368       /* For the initial allocation, we don't have a frame pointer
5369          set up, so we always need CFI notes.  If we're doing the
5370          final allocation, then we may have a frame pointer, in which
5371          case it is the CFA, otherwise we need CFI notes.
5372
5373          We can determine which allocation we are doing by looking at
5374          the value of FRAME_RELATED_P since the final allocations are not
5375          frame related.  */
5376       if (frame_related_p)
5377         {
5378           /* We want the CFA independent of the stack pointer for the
5379              duration of the loop.  */
5380           add_reg_note (insn, REG_CFA_DEF_CFA,
5381                         plus_constant (Pmode, temp1, rounded_size));
5382           RTX_FRAME_RELATED_P (insn) = 1;
5383         }
5384
5385       /* This allocates and probes the stack.  Note that this re-uses some of
5386          the existing Ada stack protection code.  However we are guaranteed not
5387          to enter the non loop or residual branches of that code.
5388
5389          The non-loop part won't be entered because if our allocation amount
5390          doesn't require a loop, the case above would handle it.
5391
5392          The residual amount won't be entered because TEMP1 is a mutliple of
5393          the allocation size.  The residual will always be 0.  As such, the only
5394          part we are actually using from that code is the loop setup.  The
5395          actual probing is done in aarch64_output_probe_stack_range.  */
5396       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5397                                                stack_pointer_rtx, temp1));
5398
5399       /* Now reset the CFA register if needed.  */
5400       if (frame_related_p)
5401         {
5402           add_reg_note (insn, REG_CFA_DEF_CFA,
5403                         plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5404           RTX_FRAME_RELATED_P (insn) = 1;
5405         }
5406
5407       emit_insn (gen_blockage ());
5408       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5409     }
5410
5411   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
5412      be probed.  This maintains the requirement that each page is probed at
5413      least once.  For initial probing we probe only if the allocation is
5414      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5415      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
5416      GUARD_SIZE.  This works that for any allocation that is large enough to
5417      trigger a probe here, we'll have at least one, and if they're not large
5418      enough for this code to emit anything for them, The page would have been
5419      probed by the saving of FP/LR either by this function or any callees.  If
5420      we don't have any callees then we won't have more stack adjustments and so
5421      are still safe.  */
5422   if (residual)
5423     {
5424       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5425       /* If we're doing final adjustments, and we've done any full page
5426          allocations then any residual needs to be probed.  */
5427       if (final_adjustment_p && rounded_size != 0)
5428         min_probe_threshold = 0;
5429       /* If doing a small final adjustment, we always probe at offset 0.
5430          This is done to avoid issues when LR is not at position 0 or when
5431          the final adjustment is smaller than the probing offset.  */
5432       else if (final_adjustment_p && rounded_size == 0)
5433         residual_probe_offset = 0;
5434
5435       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5436       if (residual >= min_probe_threshold)
5437         {
5438           if (dump_file)
5439             fprintf (dump_file,
5440                      "Stack clash AArch64 prologue residuals: "
5441                      HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5442                      "\n", residual);
5443
5444             emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5445                                              residual_probe_offset));
5446           emit_insn (gen_blockage ());
5447         }
5448     }
5449 }
5450
5451 /* Return 1 if the register is used by the epilogue.  We need to say the
5452    return register is used, but only after epilogue generation is complete.
5453    Note that in the case of sibcalls, the values "used by the epilogue" are
5454    considered live at the start of the called function.
5455
5456    For SIMD functions we need to return 1 for FP registers that are saved and
5457    restored by a function but are not zero in call_used_regs.  If we do not do
5458    this optimizations may remove the restore of the register.  */
5459
5460 int
5461 aarch64_epilogue_uses (int regno)
5462 {
5463   if (epilogue_completed)
5464     {
5465       if (regno == LR_REGNUM)
5466         return 1;
5467       if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5468         return 1;
5469     }
5470   return 0;
5471 }
5472
5473 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5474    is saved at BASE + OFFSET.  */
5475
5476 static void
5477 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5478                             rtx base, poly_int64 offset)
5479 {
5480   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5481   add_reg_note (insn, REG_CFA_EXPRESSION,
5482                 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5483 }
5484
5485 /* AArch64 stack frames generated by this compiler look like:
5486
5487         +-------------------------------+
5488         |                               |
5489         |  incoming stack arguments     |
5490         |                               |
5491         +-------------------------------+
5492         |                               | <-- incoming stack pointer (aligned)
5493         |  callee-allocated save area   |
5494         |  for register varargs         |
5495         |                               |
5496         +-------------------------------+
5497         |  local variables              | <-- frame_pointer_rtx
5498         |                               |
5499         +-------------------------------+
5500         |  padding                      | \
5501         +-------------------------------+  |
5502         |  callee-saved registers       |  | frame.saved_regs_size
5503         +-------------------------------+  |
5504         |  LR'                          |  |
5505         +-------------------------------+  |
5506         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
5507         +-------------------------------+
5508         |  dynamic allocation           |
5509         +-------------------------------+
5510         |  padding                      |
5511         +-------------------------------+
5512         |  outgoing stack arguments     | <-- arg_pointer
5513         |                               |
5514         +-------------------------------+
5515         |                               | <-- stack_pointer_rtx (aligned)
5516
5517    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5518    but leave frame_pointer_rtx and hard_frame_pointer_rtx
5519    unchanged.
5520
5521    By default for stack-clash we assume the guard is at least 64KB, but this
5522    value is configurable to either 4KB or 64KB.  We also force the guard size to
5523    be the same as the probing interval and both values are kept in sync.
5524
5525    With those assumptions the callee can allocate up to 63KB (or 3KB depending
5526    on the guard size) of stack space without probing.
5527
5528    When probing is needed, we emit a probe at the start of the prologue
5529    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5530
5531    We have to track how much space has been allocated and the only stores
5532    to the stack we track as implicit probes are the FP/LR stores.
5533
5534    For outgoing arguments we probe if the size is larger than 1KB, such that
5535    the ABI specified buffer is maintained for the next callee.  */
5536
5537 /* Generate the prologue instructions for entry into a function.
5538    Establish the stack frame by decreasing the stack pointer with a
5539    properly calculated size and, if necessary, create a frame record
5540    filled with the values of LR and previous frame pointer.  The
5541    current FP is also set up if it is in use.  */
5542
5543 void
5544 aarch64_expand_prologue (void)
5545 {
5546   poly_int64 frame_size = cfun->machine->frame.frame_size;
5547   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5548   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5549   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5550   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5551   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5552   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5553   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5554   rtx_insn *insn;
5555
5556   /* Sign return address for functions.  */
5557   if (aarch64_return_address_signing_enabled ())
5558     {
5559       insn = emit_insn (gen_pacisp ());
5560       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5561       RTX_FRAME_RELATED_P (insn) = 1;
5562     }
5563
5564   if (flag_stack_usage_info)
5565     current_function_static_stack_size = constant_lower_bound (frame_size);
5566
5567   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5568     {
5569       if (crtl->is_leaf && !cfun->calls_alloca)
5570         {
5571           if (maybe_gt (frame_size, PROBE_INTERVAL)
5572               && maybe_gt (frame_size, get_stack_check_protect ()))
5573             aarch64_emit_probe_stack_range (get_stack_check_protect (),
5574                                             (frame_size
5575                                              - get_stack_check_protect ()));
5576         }
5577       else if (maybe_gt (frame_size, 0))
5578         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5579     }
5580
5581   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5582   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5583
5584   /* In theory we should never have both an initial adjustment
5585      and a callee save adjustment.  Verify that is the case since the
5586      code below does not handle it for -fstack-clash-protection.  */
5587   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5588
5589   /* Will only probe if the initial adjustment is larger than the guard
5590      less the amount of the guard reserved for use by the caller's
5591      outgoing args.  */
5592   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5593                                           true, false);
5594
5595   if (callee_adjust != 0)
5596     aarch64_push_regs (reg1, reg2, callee_adjust);
5597
5598   if (emit_frame_chain)
5599     {
5600       poly_int64 reg_offset = callee_adjust;
5601       if (callee_adjust == 0)
5602         {
5603           reg1 = R29_REGNUM;
5604           reg2 = R30_REGNUM;
5605           reg_offset = callee_offset;
5606           aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5607         }
5608       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5609                           stack_pointer_rtx, callee_offset,
5610                           tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5611       if (frame_pointer_needed && !frame_size.is_constant ())
5612         {
5613           /* Variable-sized frames need to describe the save slot
5614              address using DW_CFA_expression rather than DW_CFA_offset.
5615              This means that, without taking further action, the
5616              locations of the registers that we've already saved would
5617              remain based on the stack pointer even after we redefine
5618              the CFA based on the frame pointer.  We therefore need new
5619              DW_CFA_expressions to re-express the save slots with addresses
5620              based on the frame pointer.  */
5621           rtx_insn *insn = get_last_insn ();
5622           gcc_assert (RTX_FRAME_RELATED_P (insn));
5623
5624           /* Add an explicit CFA definition if this was previously
5625              implicit.  */
5626           if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5627             {
5628               rtx src = plus_constant (Pmode, stack_pointer_rtx,
5629                                        callee_offset);
5630               add_reg_note (insn, REG_CFA_ADJUST_CFA,
5631                             gen_rtx_SET (hard_frame_pointer_rtx, src));
5632             }
5633
5634           /* Change the save slot expressions for the registers that
5635              we've already saved.  */
5636           reg_offset -= callee_offset;
5637           aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5638                                       reg_offset + UNITS_PER_WORD);
5639           aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5640                                       reg_offset);
5641         }
5642       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5643     }
5644
5645   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5646                              callee_adjust != 0 || emit_frame_chain);
5647   if (aarch64_simd_decl_p (cfun->decl))
5648     aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5649                                callee_adjust != 0 || emit_frame_chain);
5650   else
5651     aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5652                                callee_adjust != 0 || emit_frame_chain);
5653
5654   /* We may need to probe the final adjustment if it is larger than the guard
5655      that is assumed by the called.  */
5656   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5657                                           !frame_pointer_needed, true);
5658 }
5659
5660 /* Return TRUE if we can use a simple_return insn.
5661
5662    This function checks whether the callee saved stack is empty, which
5663    means no restore actions are need. The pro_and_epilogue will use
5664    this to check whether shrink-wrapping opt is feasible.  */
5665
5666 bool
5667 aarch64_use_return_insn_p (void)
5668 {
5669   if (!reload_completed)
5670     return false;
5671
5672   if (crtl->profile)
5673     return false;
5674
5675   return known_eq (cfun->machine->frame.frame_size, 0);
5676 }
5677
5678 /* Return false for non-leaf SIMD functions in order to avoid
5679    shrink-wrapping them.  Doing this will lose the necessary
5680    save/restore of FP registers.  */
5681
5682 bool
5683 aarch64_use_simple_return_insn_p (void)
5684 {
5685   if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5686     return false;
5687
5688   return true;
5689 }
5690
5691 /* Generate the epilogue instructions for returning from a function.
5692    This is almost exactly the reverse of the prolog sequence, except
5693    that we need to insert barriers to avoid scheduling loads that read
5694    from a deallocated stack, and we optimize the unwind records by
5695    emitting them all together if possible.  */
5696 void
5697 aarch64_expand_epilogue (bool for_sibcall)
5698 {
5699   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5700   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5701   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5702   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5703   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5704   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5705   rtx cfi_ops = NULL;
5706   rtx_insn *insn;
5707   /* A stack clash protection prologue may not have left EP0_REGNUM or
5708      EP1_REGNUM in a usable state.  The same is true for allocations
5709      with an SVE component, since we then need both temporary registers
5710      for each allocation.  For stack clash we are in a usable state if
5711      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
5712   HOST_WIDE_INT guard_size
5713     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5714   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5715
5716   /* We can re-use the registers when the allocation amount is smaller than
5717      guard_size - guard_used_by_caller because we won't be doing any probes
5718      then.  In such situations the register should remain live with the correct
5719      value.  */
5720   bool can_inherit_p = (initial_adjust.is_constant ()
5721                         && final_adjust.is_constant ())
5722                         && (!flag_stack_clash_protection
5723                             || known_lt (initial_adjust,
5724                                          guard_size - guard_used_by_caller));
5725
5726   /* We need to add memory barrier to prevent read from deallocated stack.  */
5727   bool need_barrier_p
5728     = maybe_ne (get_frame_size ()
5729                 + cfun->machine->frame.saved_varargs_size, 0);
5730
5731   /* Emit a barrier to prevent loads from a deallocated stack.  */
5732   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5733       || cfun->calls_alloca
5734       || crtl->calls_eh_return)
5735     {
5736       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5737       need_barrier_p = false;
5738     }
5739
5740   /* Restore the stack pointer from the frame pointer if it may not
5741      be the same as the stack pointer.  */
5742   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5743   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5744   if (frame_pointer_needed
5745       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5746     /* If writeback is used when restoring callee-saves, the CFA
5747        is restored on the instruction doing the writeback.  */
5748     aarch64_add_offset (Pmode, stack_pointer_rtx,
5749                         hard_frame_pointer_rtx, -callee_offset,
5750                         tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5751   else
5752      /* The case where we need to re-use the register here is very rare, so
5753         avoid the complicated condition and just always emit a move if the
5754         immediate doesn't fit.  */
5755      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5756
5757   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5758                                 callee_adjust != 0, &cfi_ops);
5759   if (aarch64_simd_decl_p (cfun->decl))
5760     aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5761                                   callee_adjust != 0, &cfi_ops);
5762   else
5763     aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5764                                   callee_adjust != 0, &cfi_ops);
5765
5766   if (need_barrier_p)
5767     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5768
5769   if (callee_adjust != 0)
5770     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5771
5772   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5773     {
5774       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
5775       insn = get_last_insn ();
5776       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5777       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5778       RTX_FRAME_RELATED_P (insn) = 1;
5779       cfi_ops = NULL;
5780     }
5781
5782   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5783      add restriction on emit_move optimization to leaf functions.  */
5784   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5785                   (!can_inherit_p || !crtl->is_leaf
5786                    || df_regs_ever_live_p (EP0_REGNUM)));
5787
5788   if (cfi_ops)
5789     {
5790       /* Emit delayed restores and reset the CFA to be SP.  */
5791       insn = get_last_insn ();
5792       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5793       REG_NOTES (insn) = cfi_ops;
5794       RTX_FRAME_RELATED_P (insn) = 1;
5795     }
5796
5797   /* We prefer to emit the combined return/authenticate instruction RETAA,
5798      however there are three cases in which we must instead emit an explicit
5799      authentication instruction.
5800
5801         1) Sibcalls don't return in a normal way, so if we're about to call one
5802            we must authenticate.
5803
5804         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5805            generating code for !TARGET_ARMV8_3 we can't use it and must
5806            explicitly authenticate.
5807
5808         3) On an eh_return path we make extra stack adjustments to update the
5809            canonical frame address to be the exception handler's CFA.  We want
5810            to authenticate using the CFA of the function which calls eh_return.
5811     */
5812   if (aarch64_return_address_signing_enabled ()
5813       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5814     {
5815       insn = emit_insn (gen_autisp ());
5816       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5817       RTX_FRAME_RELATED_P (insn) = 1;
5818     }
5819
5820   /* Stack adjustment for exception handler.  */
5821   if (crtl->calls_eh_return)
5822     {
5823       /* We need to unwind the stack by the offset computed by
5824          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5825          to be SP; letting the CFA move during this adjustment
5826          is just as correct as retaining the CFA from the body
5827          of the function.  Therefore, do nothing special.  */
5828       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5829     }
5830
5831   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5832   if (!for_sibcall)
5833     emit_jump_insn (ret_rtx);
5834 }
5835
5836 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5837    normally or return to a previous frame after unwinding.
5838
5839    An EH return uses a single shared return sequence.  The epilogue is
5840    exactly like a normal epilogue except that it has an extra input
5841    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5842    that must be applied after the frame has been destroyed.  An extra label
5843    is inserted before the epilogue which initializes this register to zero,
5844    and this is the entry point for a normal return.
5845
5846    An actual EH return updates the return address, initializes the stack
5847    adjustment and jumps directly into the epilogue (bypassing the zeroing
5848    of the adjustment).  Since the return address is typically saved on the
5849    stack when a function makes a call, the saved LR must be updated outside
5850    the epilogue.
5851
5852    This poses problems as the store is generated well before the epilogue,
5853    so the offset of LR is not known yet.  Also optimizations will remove the
5854    store as it appears dead, even after the epilogue is generated (as the
5855    base or offset for loading LR is different in many cases).
5856
5857    To avoid these problems this implementation forces the frame pointer
5858    in eh_return functions so that the location of LR is fixed and known early.
5859    It also marks the store volatile, so no optimization is permitted to
5860    remove the store.  */
5861 rtx
5862 aarch64_eh_return_handler_rtx (void)
5863 {
5864   rtx tmp = gen_frame_mem (Pmode,
5865     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5866
5867   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5868   MEM_VOLATILE_P (tmp) = true;
5869   return tmp;
5870 }
5871
5872 /* Output code to add DELTA to the first argument, and then jump
5873    to FUNCTION.  Used for C++ multiple inheritance.  */
5874 static void
5875 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5876                          HOST_WIDE_INT delta,
5877                          HOST_WIDE_INT vcall_offset,
5878                          tree function)
5879 {
5880   /* The this pointer is always in x0.  Note that this differs from
5881      Arm where the this pointer maybe bumped to r1 if r0 is required
5882      to return a pointer to an aggregate.  On AArch64 a result value
5883      pointer will be in x8.  */
5884   int this_regno = R0_REGNUM;
5885   rtx this_rtx, temp0, temp1, addr, funexp;
5886   rtx_insn *insn;
5887
5888   reload_completed = 1;
5889   emit_note (NOTE_INSN_PROLOGUE_END);
5890
5891   this_rtx = gen_rtx_REG (Pmode, this_regno);
5892   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5893   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5894
5895   if (vcall_offset == 0)
5896     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5897   else
5898     {
5899       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5900
5901       addr = this_rtx;
5902       if (delta != 0)
5903         {
5904           if (delta >= -256 && delta < 256)
5905             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5906                                        plus_constant (Pmode, this_rtx, delta));
5907           else
5908             aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5909                                 temp1, temp0, false);
5910         }
5911
5912       if (Pmode == ptr_mode)
5913         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5914       else
5915         aarch64_emit_move (temp0,
5916                            gen_rtx_ZERO_EXTEND (Pmode,
5917                                                 gen_rtx_MEM (ptr_mode, addr)));
5918
5919       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5920           addr = plus_constant (Pmode, temp0, vcall_offset);
5921       else
5922         {
5923           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5924                                           Pmode);
5925           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5926         }
5927
5928       if (Pmode == ptr_mode)
5929         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5930       else
5931         aarch64_emit_move (temp1,
5932                            gen_rtx_SIGN_EXTEND (Pmode,
5933                                                 gen_rtx_MEM (ptr_mode, addr)));
5934
5935       emit_insn (gen_add2_insn (this_rtx, temp1));
5936     }
5937
5938   /* Generate a tail call to the target function.  */
5939   if (!TREE_USED (function))
5940     {
5941       assemble_external (function);
5942       TREE_USED (function) = 1;
5943     }
5944   funexp = XEXP (DECL_RTL (function), 0);
5945   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5946   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5947   SIBLING_CALL_P (insn) = 1;
5948
5949   insn = get_insns ();
5950   shorten_branches (insn);
5951   final_start_function (insn, file, 1);
5952   final (insn, file, 1);
5953   final_end_function ();
5954
5955   /* Stop pretending to be a post-reload pass.  */
5956   reload_completed = 0;
5957 }
5958
5959 static bool
5960 aarch64_tls_referenced_p (rtx x)
5961 {
5962   if (!TARGET_HAVE_TLS)
5963     return false;
5964   subrtx_iterator::array_type array;
5965   FOR_EACH_SUBRTX (iter, array, x, ALL)
5966     {
5967       const_rtx x = *iter;
5968       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5969         return true;
5970       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5971          TLS offsets, not real symbol references.  */
5972       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5973         iter.skip_subrtxes ();
5974     }
5975   return false;
5976 }
5977
5978
5979 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5980    a left shift of 0 or 12 bits.  */
5981 bool
5982 aarch64_uimm12_shift (HOST_WIDE_INT val)
5983 {
5984   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5985           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5986           );
5987 }
5988
5989 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5990    that can be created with a left shift of 0 or 12.  */
5991 static HOST_WIDE_INT
5992 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
5993 {
5994   /* Check to see if the value fits in 24 bits, as that is the maximum we can
5995      handle correctly.  */
5996   gcc_assert ((val & 0xffffff) == val);
5997
5998   if (((val & 0xfff) << 0) == val)
5999     return val;
6000
6001   return val & (0xfff << 12);
6002 }
6003
6004 /* Return true if val is an immediate that can be loaded into a
6005    register by a MOVZ instruction.  */
6006 static bool
6007 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6008 {
6009   if (GET_MODE_SIZE (mode) > 4)
6010     {
6011       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6012           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6013         return 1;
6014     }
6015   else
6016     {
6017       /* Ignore sign extension.  */
6018       val &= (HOST_WIDE_INT) 0xffffffff;
6019     }
6020   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6021           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6022 }
6023
6024 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
6025    64-bit (DImode) integer.  */
6026
6027 static unsigned HOST_WIDE_INT
6028 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6029 {
6030   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6031   while (size < 64)
6032     {
6033       val &= (HOST_WIDE_INT_1U << size) - 1;
6034       val |= val << size;
6035       size *= 2;
6036     }
6037   return val;
6038 }
6039
6040 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
6041
6042 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6043   {
6044     0x0000000100000001ull,
6045     0x0001000100010001ull,
6046     0x0101010101010101ull,
6047     0x1111111111111111ull,
6048     0x5555555555555555ull,
6049   };
6050
6051
6052 /* Return true if val is a valid bitmask immediate.  */
6053
6054 bool
6055 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6056 {
6057   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6058   int bits;
6059
6060   /* Check for a single sequence of one bits and return quickly if so.
6061      The special cases of all ones and all zeroes returns false.  */
6062   val = aarch64_replicate_bitmask_imm (val_in, mode);
6063   tmp = val + (val & -val);
6064
6065   if (tmp == (tmp & -tmp))
6066     return (val + 1) > 1;
6067
6068   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
6069   if (mode == SImode)
6070     val = (val << 32) | (val & 0xffffffff);
6071
6072   /* Invert if the immediate doesn't start with a zero bit - this means we
6073      only need to search for sequences of one bits.  */
6074   if (val & 1)
6075     val = ~val;
6076
6077   /* Find the first set bit and set tmp to val with the first sequence of one
6078      bits removed.  Return success if there is a single sequence of ones.  */
6079   first_one = val & -val;
6080   tmp = val & (val + first_one);
6081
6082   if (tmp == 0)
6083     return true;
6084
6085   /* Find the next set bit and compute the difference in bit position.  */
6086   next_one = tmp & -tmp;
6087   bits = clz_hwi (first_one) - clz_hwi (next_one);
6088   mask = val ^ tmp;
6089
6090   /* Check the bit position difference is a power of 2, and that the first
6091      sequence of one bits fits within 'bits' bits.  */
6092   if ((mask >> bits) != 0 || bits != (bits & -bits))
6093     return false;
6094
6095   /* Check the sequence of one bits is repeated 64/bits times.  */
6096   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6097 }
6098
6099 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6100    Assumed precondition: VAL_IN Is not zero.  */
6101
6102 unsigned HOST_WIDE_INT
6103 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6104 {
6105   int lowest_bit_set = ctz_hwi (val_in);
6106   int highest_bit_set = floor_log2 (val_in);
6107   gcc_assert (val_in != 0);
6108
6109   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6110           (HOST_WIDE_INT_1U << lowest_bit_set));
6111 }
6112
6113 /* Create constant where bits outside of lowest bit set to highest bit set
6114    are set to 1.  */
6115
6116 unsigned HOST_WIDE_INT
6117 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6118 {
6119   return val_in | ~aarch64_and_split_imm1 (val_in);
6120 }
6121
6122 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
6123
6124 bool
6125 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6126 {
6127   scalar_int_mode int_mode;
6128   if (!is_a <scalar_int_mode> (mode, &int_mode))
6129     return false;
6130
6131   if (aarch64_bitmask_imm (val_in, int_mode))
6132     return false;
6133
6134   if (aarch64_move_imm (val_in, int_mode))
6135     return false;
6136
6137   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6138
6139   return aarch64_bitmask_imm (imm2, int_mode);
6140 }
6141
6142 /* Return true if val is an immediate that can be loaded into a
6143    register in a single instruction.  */
6144 bool
6145 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6146 {
6147   scalar_int_mode int_mode;
6148   if (!is_a <scalar_int_mode> (mode, &int_mode))
6149     return false;
6150
6151   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6152     return 1;
6153   return aarch64_bitmask_imm (val, int_mode);
6154 }
6155
6156 static bool
6157 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6158 {
6159   rtx base, offset;
6160
6161   if (GET_CODE (x) == HIGH)
6162     return true;
6163
6164   /* There's no way to calculate VL-based values using relocations.  */
6165   subrtx_iterator::array_type array;
6166   FOR_EACH_SUBRTX (iter, array, x, ALL)
6167     if (GET_CODE (*iter) == CONST_POLY_INT)
6168       return true;
6169
6170   split_const (x, &base, &offset);
6171   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6172     {
6173       if (aarch64_classify_symbol (base, INTVAL (offset))
6174           != SYMBOL_FORCE_TO_MEM)
6175         return true;
6176       else
6177         /* Avoid generating a 64-bit relocation in ILP32; leave
6178            to aarch64_expand_mov_immediate to handle it properly.  */
6179         return mode != ptr_mode;
6180     }
6181
6182   return aarch64_tls_referenced_p (x);
6183 }
6184
6185 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6186    The expansion for a table switch is quite expensive due to the number
6187    of instructions, the table lookup and hard to predict indirect jump.
6188    When optimizing for speed, and -O3 enabled, use the per-core tuning if
6189    set, otherwise use tables for > 16 cases as a tradeoff between size and
6190    performance.  When optimizing for size, use the default setting.  */
6191
6192 static unsigned int
6193 aarch64_case_values_threshold (void)
6194 {
6195   /* Use the specified limit for the number of cases before using jump
6196      tables at higher optimization levels.  */
6197   if (optimize > 2
6198       && selected_cpu->tune->max_case_values != 0)
6199     return selected_cpu->tune->max_case_values;
6200   else
6201     return optimize_size ? default_case_values_threshold () : 17;
6202 }
6203
6204 /* Return true if register REGNO is a valid index register.
6205    STRICT_P is true if REG_OK_STRICT is in effect.  */
6206
6207 bool
6208 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6209 {
6210   if (!HARD_REGISTER_NUM_P (regno))
6211     {
6212       if (!strict_p)
6213         return true;
6214
6215       if (!reg_renumber)
6216         return false;
6217
6218       regno = reg_renumber[regno];
6219     }
6220   return GP_REGNUM_P (regno);
6221 }
6222
6223 /* Return true if register REGNO is a valid base register for mode MODE.
6224    STRICT_P is true if REG_OK_STRICT is in effect.  */
6225
6226 bool
6227 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6228 {
6229   if (!HARD_REGISTER_NUM_P (regno))
6230     {
6231       if (!strict_p)
6232         return true;
6233
6234       if (!reg_renumber)
6235         return false;
6236
6237       regno = reg_renumber[regno];
6238     }
6239
6240   /* The fake registers will be eliminated to either the stack or
6241      hard frame pointer, both of which are usually valid base registers.
6242      Reload deals with the cases where the eliminated form isn't valid.  */
6243   return (GP_REGNUM_P (regno)
6244           || regno == SP_REGNUM
6245           || regno == FRAME_POINTER_REGNUM
6246           || regno == ARG_POINTER_REGNUM);
6247 }
6248
6249 /* Return true if X is a valid base register for mode MODE.
6250    STRICT_P is true if REG_OK_STRICT is in effect.  */
6251
6252 static bool
6253 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6254 {
6255   if (!strict_p
6256       && GET_CODE (x) == SUBREG
6257       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6258     x = SUBREG_REG (x);
6259
6260   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6261 }
6262
6263 /* Return true if address offset is a valid index.  If it is, fill in INFO
6264    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6265
6266 static bool
6267 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6268                         machine_mode mode, bool strict_p)
6269 {
6270   enum aarch64_address_type type;
6271   rtx index;
6272   int shift;
6273
6274   /* (reg:P) */
6275   if ((REG_P (x) || GET_CODE (x) == SUBREG)
6276       && GET_MODE (x) == Pmode)
6277     {
6278       type = ADDRESS_REG_REG;
6279       index = x;
6280       shift = 0;
6281     }
6282   /* (sign_extend:DI (reg:SI)) */
6283   else if ((GET_CODE (x) == SIGN_EXTEND
6284             || GET_CODE (x) == ZERO_EXTEND)
6285            && GET_MODE (x) == DImode
6286            && GET_MODE (XEXP (x, 0)) == SImode)
6287     {
6288       type = (GET_CODE (x) == SIGN_EXTEND)
6289         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6290       index = XEXP (x, 0);
6291       shift = 0;
6292     }
6293   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6294   else if (GET_CODE (x) == MULT
6295            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6296                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6297            && GET_MODE (XEXP (x, 0)) == DImode
6298            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6299            && CONST_INT_P (XEXP (x, 1)))
6300     {
6301       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6302         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6303       index = XEXP (XEXP (x, 0), 0);
6304       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6305     }
6306   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6307   else if (GET_CODE (x) == ASHIFT
6308            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6309                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6310            && GET_MODE (XEXP (x, 0)) == DImode
6311            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6312            && CONST_INT_P (XEXP (x, 1)))
6313     {
6314       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6315         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6316       index = XEXP (XEXP (x, 0), 0);
6317       shift = INTVAL (XEXP (x, 1));
6318     }
6319   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6320   else if ((GET_CODE (x) == SIGN_EXTRACT
6321             || GET_CODE (x) == ZERO_EXTRACT)
6322            && GET_MODE (x) == DImode
6323            && GET_CODE (XEXP (x, 0)) == MULT
6324            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6325            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6326     {
6327       type = (GET_CODE (x) == SIGN_EXTRACT)
6328         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6329       index = XEXP (XEXP (x, 0), 0);
6330       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6331       if (INTVAL (XEXP (x, 1)) != 32 + shift
6332           || INTVAL (XEXP (x, 2)) != 0)
6333         shift = -1;
6334     }
6335   /* (and:DI (mult:DI (reg:DI) (const_int scale))
6336      (const_int 0xffffffff<<shift)) */
6337   else if (GET_CODE (x) == AND
6338            && GET_MODE (x) == DImode
6339            && GET_CODE (XEXP (x, 0)) == MULT
6340            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6341            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6342            && CONST_INT_P (XEXP (x, 1)))
6343     {
6344       type = ADDRESS_REG_UXTW;
6345       index = XEXP (XEXP (x, 0), 0);
6346       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6347       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6348         shift = -1;
6349     }
6350   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6351   else if ((GET_CODE (x) == SIGN_EXTRACT
6352             || GET_CODE (x) == ZERO_EXTRACT)
6353            && GET_MODE (x) == DImode
6354            && GET_CODE (XEXP (x, 0)) == ASHIFT
6355            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6356            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6357     {
6358       type = (GET_CODE (x) == SIGN_EXTRACT)
6359         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6360       index = XEXP (XEXP (x, 0), 0);
6361       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6362       if (INTVAL (XEXP (x, 1)) != 32 + shift
6363           || INTVAL (XEXP (x, 2)) != 0)
6364         shift = -1;
6365     }
6366   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6367      (const_int 0xffffffff<<shift)) */
6368   else if (GET_CODE (x) == AND
6369            && GET_MODE (x) == DImode
6370            && GET_CODE (XEXP (x, 0)) == ASHIFT
6371            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6372            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6373            && CONST_INT_P (XEXP (x, 1)))
6374     {
6375       type = ADDRESS_REG_UXTW;
6376       index = XEXP (XEXP (x, 0), 0);
6377       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6378       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6379         shift = -1;
6380     }
6381   /* (mult:P (reg:P) (const_int scale)) */
6382   else if (GET_CODE (x) == MULT
6383            && GET_MODE (x) == Pmode
6384            && GET_MODE (XEXP (x, 0)) == Pmode
6385            && CONST_INT_P (XEXP (x, 1)))
6386     {
6387       type = ADDRESS_REG_REG;
6388       index = XEXP (x, 0);
6389       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6390     }
6391   /* (ashift:P (reg:P) (const_int shift)) */
6392   else if (GET_CODE (x) == ASHIFT
6393            && GET_MODE (x) == Pmode
6394            && GET_MODE (XEXP (x, 0)) == Pmode
6395            && CONST_INT_P (XEXP (x, 1)))
6396     {
6397       type = ADDRESS_REG_REG;
6398       index = XEXP (x, 0);
6399       shift = INTVAL (XEXP (x, 1));
6400     }
6401   else
6402     return false;
6403
6404   if (!strict_p
6405       && GET_CODE (index) == SUBREG
6406       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6407     index = SUBREG_REG (index);
6408
6409   if (aarch64_sve_data_mode_p (mode))
6410     {
6411       if (type != ADDRESS_REG_REG
6412           || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6413         return false;
6414     }
6415   else
6416     {
6417       if (shift != 0
6418           && !(IN_RANGE (shift, 1, 3)
6419                && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6420         return false;
6421     }
6422
6423   if (REG_P (index)
6424       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6425     {
6426       info->type = type;
6427       info->offset = index;
6428       info->shift = shift;
6429       return true;
6430     }
6431
6432   return false;
6433 }
6434
6435 /* Return true if MODE is one of the modes for which we
6436    support LDP/STP operations.  */
6437
6438 static bool
6439 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6440 {
6441   return mode == SImode || mode == DImode
6442          || mode == SFmode || mode == DFmode
6443          || (aarch64_vector_mode_supported_p (mode)
6444              && (known_eq (GET_MODE_SIZE (mode), 8)
6445                  || (known_eq (GET_MODE_SIZE (mode), 16)
6446                     && (aarch64_tune_params.extra_tuning_flags
6447                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6448 }
6449
6450 /* Return true if REGNO is a virtual pointer register, or an eliminable
6451    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
6452    include stack_pointer or hard_frame_pointer.  */
6453 static bool
6454 virt_or_elim_regno_p (unsigned regno)
6455 {
6456   return ((regno >= FIRST_VIRTUAL_REGISTER
6457            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6458           || regno == FRAME_POINTER_REGNUM
6459           || regno == ARG_POINTER_REGNUM);
6460 }
6461
6462 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6463    If it is, fill in INFO appropriately.  STRICT_P is true if
6464    REG_OK_STRICT is in effect.  */
6465
6466 bool
6467 aarch64_classify_address (struct aarch64_address_info *info,
6468                           rtx x, machine_mode mode, bool strict_p,
6469                           aarch64_addr_query_type type)
6470 {
6471   enum rtx_code code = GET_CODE (x);
6472   rtx op0, op1;
6473   poly_int64 offset;
6474
6475   HOST_WIDE_INT const_size;
6476
6477   /* On BE, we use load/store pair for all large int mode load/stores.
6478      TI/TFmode may also use a load/store pair.  */
6479   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6480   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6481   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6482                             || type == ADDR_QUERY_LDP_STP_N
6483                             || mode == TImode
6484                             || mode == TFmode
6485                             || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6486
6487   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6488      corresponds to the actual size of the memory being loaded/stored and the
6489      mode of the corresponding addressing mode is half of that.  */
6490   if (type == ADDR_QUERY_LDP_STP_N
6491       && known_eq (GET_MODE_SIZE (mode), 16))
6492     mode = DFmode;
6493
6494   bool allow_reg_index_p = (!load_store_pair_p
6495                             && (known_lt (GET_MODE_SIZE (mode), 16)
6496                                 || vec_flags == VEC_ADVSIMD
6497                                 || vec_flags == VEC_SVE_DATA));
6498
6499   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6500      [Rn, #offset, MUL VL].  */
6501   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6502       && (code != REG && code != PLUS))
6503     return false;
6504
6505   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6506      REG addressing.  */
6507   if (advsimd_struct_p
6508       && !BYTES_BIG_ENDIAN
6509       && (code != POST_INC && code != REG))
6510     return false;
6511
6512   gcc_checking_assert (GET_MODE (x) == VOIDmode
6513                        || SCALAR_INT_MODE_P (GET_MODE (x)));
6514
6515   switch (code)
6516     {
6517     case REG:
6518     case SUBREG:
6519       info->type = ADDRESS_REG_IMM;
6520       info->base = x;
6521       info->offset = const0_rtx;
6522       info->const_offset = 0;
6523       return aarch64_base_register_rtx_p (x, strict_p);
6524
6525     case PLUS:
6526       op0 = XEXP (x, 0);
6527       op1 = XEXP (x, 1);
6528
6529       if (! strict_p
6530           && REG_P (op0)
6531           && virt_or_elim_regno_p (REGNO (op0))
6532           && poly_int_rtx_p (op1, &offset))
6533         {
6534           info->type = ADDRESS_REG_IMM;
6535           info->base = op0;
6536           info->offset = op1;
6537           info->const_offset = offset;
6538
6539           return true;
6540         }
6541
6542       if (maybe_ne (GET_MODE_SIZE (mode), 0)
6543           && aarch64_base_register_rtx_p (op0, strict_p)
6544           && poly_int_rtx_p (op1, &offset))
6545         {
6546           info->type = ADDRESS_REG_IMM;
6547           info->base = op0;
6548           info->offset = op1;
6549           info->const_offset = offset;
6550
6551           /* TImode and TFmode values are allowed in both pairs of X
6552              registers and individual Q registers.  The available
6553              address modes are:
6554              X,X: 7-bit signed scaled offset
6555              Q:   9-bit signed offset
6556              We conservatively require an offset representable in either mode.
6557              When performing the check for pairs of X registers i.e.  LDP/STP
6558              pass down DImode since that is the natural size of the LDP/STP
6559              instruction memory accesses.  */
6560           if (mode == TImode || mode == TFmode)
6561             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6562                     && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6563                         || offset_12bit_unsigned_scaled_p (mode, offset)));
6564
6565           /* A 7bit offset check because OImode will emit a ldp/stp
6566              instruction (only big endian will get here).
6567              For ldp/stp instructions, the offset is scaled for the size of a
6568              single element of the pair.  */
6569           if (mode == OImode)
6570             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6571
6572           /* Three 9/12 bit offsets checks because CImode will emit three
6573              ldr/str instructions (only big endian will get here).  */
6574           if (mode == CImode)
6575             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6576                     && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6577                                                                offset + 32)
6578                         || offset_12bit_unsigned_scaled_p (V16QImode,
6579                                                            offset + 32)));
6580
6581           /* Two 7bit offsets checks because XImode will emit two ldp/stp
6582              instructions (only big endian will get here).  */
6583           if (mode == XImode)
6584             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6585                     && aarch64_offset_7bit_signed_scaled_p (TImode,
6586                                                             offset + 32));
6587
6588           /* Make "m" use the LD1 offset range for SVE data modes, so
6589              that pre-RTL optimizers like ivopts will work to that
6590              instead of the wider LDR/STR range.  */
6591           if (vec_flags == VEC_SVE_DATA)
6592             return (type == ADDR_QUERY_M
6593                     ? offset_4bit_signed_scaled_p (mode, offset)
6594                     : offset_9bit_signed_scaled_p (mode, offset));
6595
6596           if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6597             {
6598               poly_int64 end_offset = (offset
6599                                        + GET_MODE_SIZE (mode)
6600                                        - BYTES_PER_SVE_VECTOR);
6601               return (type == ADDR_QUERY_M
6602                       ? offset_4bit_signed_scaled_p (mode, offset)
6603                       : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6604                          && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6605                                                          end_offset)));
6606             }
6607
6608           if (vec_flags == VEC_SVE_PRED)
6609             return offset_9bit_signed_scaled_p (mode, offset);
6610
6611           if (load_store_pair_p)
6612             return ((known_eq (GET_MODE_SIZE (mode), 4)
6613                      || known_eq (GET_MODE_SIZE (mode), 8)
6614                      || known_eq (GET_MODE_SIZE (mode), 16))
6615                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6616           else
6617             return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6618                     || offset_12bit_unsigned_scaled_p (mode, offset));
6619         }
6620
6621       if (allow_reg_index_p)
6622         {
6623           /* Look for base + (scaled/extended) index register.  */
6624           if (aarch64_base_register_rtx_p (op0, strict_p)
6625               && aarch64_classify_index (info, op1, mode, strict_p))
6626             {
6627               info->base = op0;
6628               return true;
6629             }
6630           if (aarch64_base_register_rtx_p (op1, strict_p)
6631               && aarch64_classify_index (info, op0, mode, strict_p))
6632             {
6633               info->base = op1;
6634               return true;
6635             }
6636         }
6637
6638       return false;
6639
6640     case POST_INC:
6641     case POST_DEC:
6642     case PRE_INC:
6643     case PRE_DEC:
6644       info->type = ADDRESS_REG_WB;
6645       info->base = XEXP (x, 0);
6646       info->offset = NULL_RTX;
6647       return aarch64_base_register_rtx_p (info->base, strict_p);
6648
6649     case POST_MODIFY:
6650     case PRE_MODIFY:
6651       info->type = ADDRESS_REG_WB;
6652       info->base = XEXP (x, 0);
6653       if (GET_CODE (XEXP (x, 1)) == PLUS
6654           && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6655           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6656           && aarch64_base_register_rtx_p (info->base, strict_p))
6657         {
6658           info->offset = XEXP (XEXP (x, 1), 1);
6659           info->const_offset = offset;
6660
6661           /* TImode and TFmode values are allowed in both pairs of X
6662              registers and individual Q registers.  The available
6663              address modes are:
6664              X,X: 7-bit signed scaled offset
6665              Q:   9-bit signed offset
6666              We conservatively require an offset representable in either mode.
6667            */
6668           if (mode == TImode || mode == TFmode)
6669             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6670                     && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6671
6672           if (load_store_pair_p)
6673             return ((known_eq (GET_MODE_SIZE (mode), 4)
6674                      || known_eq (GET_MODE_SIZE (mode), 8)
6675                      || known_eq (GET_MODE_SIZE (mode), 16))
6676                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6677           else
6678             return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6679         }
6680       return false;
6681
6682     case CONST:
6683     case SYMBOL_REF:
6684     case LABEL_REF:
6685       /* load literal: pc-relative constant pool entry.  Only supported
6686          for SI mode or larger.  */
6687       info->type = ADDRESS_SYMBOLIC;
6688
6689       if (!load_store_pair_p
6690           && GET_MODE_SIZE (mode).is_constant (&const_size)
6691           && const_size >= 4)
6692         {
6693           rtx sym, addend;
6694
6695           split_const (x, &sym, &addend);
6696           return ((GET_CODE (sym) == LABEL_REF
6697                    || (GET_CODE (sym) == SYMBOL_REF
6698                        && CONSTANT_POOL_ADDRESS_P (sym)
6699                        && aarch64_pcrelative_literal_loads)));
6700         }
6701       return false;
6702
6703     case LO_SUM:
6704       info->type = ADDRESS_LO_SUM;
6705       info->base = XEXP (x, 0);
6706       info->offset = XEXP (x, 1);
6707       if (allow_reg_index_p
6708           && aarch64_base_register_rtx_p (info->base, strict_p))
6709         {
6710           rtx sym, offs;
6711           split_const (info->offset, &sym, &offs);
6712           if (GET_CODE (sym) == SYMBOL_REF
6713               && (aarch64_classify_symbol (sym, INTVAL (offs))
6714                   == SYMBOL_SMALL_ABSOLUTE))
6715             {
6716               /* The symbol and offset must be aligned to the access size.  */
6717               unsigned int align;
6718
6719               if (CONSTANT_POOL_ADDRESS_P (sym))
6720                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6721               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6722                 {
6723                   tree exp = SYMBOL_REF_DECL (sym);
6724                   align = TYPE_ALIGN (TREE_TYPE (exp));
6725                   align = aarch64_constant_alignment (exp, align);
6726                 }
6727               else if (SYMBOL_REF_DECL (sym))
6728                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6729               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6730                        && SYMBOL_REF_BLOCK (sym) != NULL)
6731                 align = SYMBOL_REF_BLOCK (sym)->alignment;
6732               else
6733                 align = BITS_PER_UNIT;
6734
6735               poly_int64 ref_size = GET_MODE_SIZE (mode);
6736               if (known_eq (ref_size, 0))
6737                 ref_size = GET_MODE_SIZE (DImode);
6738
6739               return (multiple_p (INTVAL (offs), ref_size)
6740                       && multiple_p (align / BITS_PER_UNIT, ref_size));
6741             }
6742         }
6743       return false;
6744
6745     default:
6746       return false;
6747     }
6748 }
6749
6750 /* Return true if the address X is valid for a PRFM instruction.
6751    STRICT_P is true if we should do strict checking with
6752    aarch64_classify_address.  */
6753
6754 bool
6755 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6756 {
6757   struct aarch64_address_info addr;
6758
6759   /* PRFM accepts the same addresses as DImode...  */
6760   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6761   if (!res)
6762     return false;
6763
6764   /* ... except writeback forms.  */
6765   return addr.type != ADDRESS_REG_WB;
6766 }
6767
6768 bool
6769 aarch64_symbolic_address_p (rtx x)
6770 {
6771   rtx offset;
6772
6773   split_const (x, &x, &offset);
6774   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6775 }
6776
6777 /* Classify the base of symbolic expression X.  */
6778
6779 enum aarch64_symbol_type
6780 aarch64_classify_symbolic_expression (rtx x)
6781 {
6782   rtx offset;
6783
6784   split_const (x, &x, &offset);
6785   return aarch64_classify_symbol (x, INTVAL (offset));
6786 }
6787
6788
6789 /* Return TRUE if X is a legitimate address for accessing memory in
6790    mode MODE.  */
6791 static bool
6792 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6793 {
6794   struct aarch64_address_info addr;
6795
6796   return aarch64_classify_address (&addr, x, mode, strict_p);
6797 }
6798
6799 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6800    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6801 bool
6802 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6803                               aarch64_addr_query_type type)
6804 {
6805   struct aarch64_address_info addr;
6806
6807   return aarch64_classify_address (&addr, x, mode, strict_p, type);
6808 }
6809
6810 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
6811
6812 static bool
6813 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6814                                          poly_int64 orig_offset,
6815                                          machine_mode mode)
6816 {
6817   HOST_WIDE_INT size;
6818   if (GET_MODE_SIZE (mode).is_constant (&size))
6819     {
6820       HOST_WIDE_INT const_offset, second_offset;
6821
6822       /* A general SVE offset is A * VQ + B.  Remove the A component from
6823          coefficient 0 in order to get the constant B.  */
6824       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6825
6826       /* Split an out-of-range address displacement into a base and
6827          offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6828          range otherwise to increase opportunities for sharing the base
6829          address of different sizes.  Unaligned accesses use the signed
6830          9-bit range, TImode/TFmode use the intersection of signed
6831          scaled 7-bit and signed 9-bit offset.  */
6832       if (mode == TImode || mode == TFmode)
6833         second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6834       else if ((const_offset & (size - 1)) != 0)
6835         second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6836       else
6837         second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6838
6839       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6840         return false;
6841
6842       /* Split the offset into second_offset and the rest.  */
6843       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6844       *offset2 = gen_int_mode (second_offset, Pmode);
6845       return true;
6846     }
6847   else
6848     {
6849       /* Get the mode we should use as the basis of the range.  For structure
6850          modes this is the mode of one vector.  */
6851       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6852       machine_mode step_mode
6853         = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6854
6855       /* Get the "mul vl" multiplier we'd like to use.  */
6856       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6857       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6858       if (vec_flags & VEC_SVE_DATA)
6859         /* LDR supports a 9-bit range, but the move patterns for
6860            structure modes require all vectors to be in range of the
6861            same base.  The simplest way of accomodating that while still
6862            promoting reuse of anchor points between different modes is
6863            to use an 8-bit range unconditionally.  */
6864         vnum = ((vnum + 128) & 255) - 128;
6865       else
6866         /* Predicates are only handled singly, so we might as well use
6867            the full range.  */
6868         vnum = ((vnum + 256) & 511) - 256;
6869       if (vnum == 0)
6870         return false;
6871
6872       /* Convert the "mul vl" multiplier into a byte offset.  */
6873       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6874       if (known_eq (second_offset, orig_offset))
6875         return false;
6876
6877       /* Split the offset into second_offset and the rest.  */
6878       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6879       *offset2 = gen_int_mode (second_offset, Pmode);
6880       return true;
6881     }
6882 }
6883
6884 /* Return the binary representation of floating point constant VALUE in INTVAL.
6885    If the value cannot be converted, return false without setting INTVAL.
6886    The conversion is done in the given MODE.  */
6887 bool
6888 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6889 {
6890
6891   /* We make a general exception for 0.  */
6892   if (aarch64_float_const_zero_rtx_p (value))
6893     {
6894       *intval = 0;
6895       return true;
6896     }
6897
6898   scalar_float_mode mode;
6899   if (GET_CODE (value) != CONST_DOUBLE
6900       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6901       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6902       /* Only support up to DF mode.  */
6903       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6904     return false;
6905
6906   unsigned HOST_WIDE_INT ival = 0;
6907
6908   long res[2];
6909   real_to_target (res,
6910                   CONST_DOUBLE_REAL_VALUE (value),
6911                   REAL_MODE_FORMAT (mode));
6912
6913   if (mode == DFmode)
6914     {
6915       int order = BYTES_BIG_ENDIAN ? 1 : 0;
6916       ival = zext_hwi (res[order], 32);
6917       ival |= (zext_hwi (res[1 - order], 32) << 32);
6918     }
6919   else
6920       ival = zext_hwi (res[0], 32);
6921
6922   *intval = ival;
6923   return true;
6924 }
6925
6926 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6927    single MOV(+MOVK) followed by an FMOV.  */
6928 bool
6929 aarch64_float_const_rtx_p (rtx x)
6930 {
6931   machine_mode mode = GET_MODE (x);
6932   if (mode == VOIDmode)
6933     return false;
6934
6935   /* Determine whether it's cheaper to write float constants as
6936      mov/movk pairs over ldr/adrp pairs.  */
6937   unsigned HOST_WIDE_INT ival;
6938
6939   if (GET_CODE (x) == CONST_DOUBLE
6940       && SCALAR_FLOAT_MODE_P (mode)
6941       && aarch64_reinterpret_float_as_int (x, &ival))
6942     {
6943       scalar_int_mode imode = (mode == HFmode
6944                                ? SImode
6945                                : int_mode_for_mode (mode).require ());
6946       int num_instr = aarch64_internal_mov_immediate
6947                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6948       return num_instr < 3;
6949     }
6950
6951   return false;
6952 }
6953
6954 /* Return TRUE if rtx X is immediate constant 0.0 */
6955 bool
6956 aarch64_float_const_zero_rtx_p (rtx x)
6957 {
6958   if (GET_MODE (x) == VOIDmode)
6959     return false;
6960
6961   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6962     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6963   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6964 }
6965
6966 /* Return TRUE if rtx X is immediate constant that fits in a single
6967    MOVI immediate operation.  */
6968 bool
6969 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6970 {
6971   if (!TARGET_SIMD)
6972      return false;
6973
6974   machine_mode vmode;
6975   scalar_int_mode imode;
6976   unsigned HOST_WIDE_INT ival;
6977
6978   if (GET_CODE (x) == CONST_DOUBLE
6979       && SCALAR_FLOAT_MODE_P (mode))
6980     {
6981       if (!aarch64_reinterpret_float_as_int (x, &ival))
6982         return false;
6983
6984       /* We make a general exception for 0.  */
6985       if (aarch64_float_const_zero_rtx_p (x))
6986         return true;
6987
6988       imode = int_mode_for_mode (mode).require ();
6989     }
6990   else if (GET_CODE (x) == CONST_INT
6991            && is_a <scalar_int_mode> (mode, &imode))
6992     ival = INTVAL (x);
6993   else
6994     return false;
6995
6996    /* use a 64 bit mode for everything except for DI/DF mode, where we use
6997      a 128 bit vector mode.  */
6998   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6999
7000   vmode = aarch64_simd_container_mode (imode, width);
7001   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7002
7003   return aarch64_simd_valid_immediate (v_op, NULL);
7004 }
7005
7006
7007 /* Return the fixed registers used for condition codes.  */
7008
7009 static bool
7010 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7011 {
7012   *p1 = CC_REGNUM;
7013   *p2 = INVALID_REGNUM;
7014   return true;
7015 }
7016
7017 /* This function is used by the call expanders of the machine description.
7018    RESULT is the register in which the result is returned.  It's NULL for
7019    "call" and "sibcall".
7020    MEM is the location of the function call.
7021    SIBCALL indicates whether this function call is normal call or sibling call.
7022    It will generate different pattern accordingly.  */
7023
7024 void
7025 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7026 {
7027   rtx call, callee, tmp;
7028   rtvec vec;
7029   machine_mode mode;
7030
7031   gcc_assert (MEM_P (mem));
7032   callee = XEXP (mem, 0);
7033   mode = GET_MODE (callee);
7034   gcc_assert (mode == Pmode);
7035
7036   /* Decide if we should generate indirect calls by loading the
7037      address of the callee into a register before performing
7038      the branch-and-link.  */
7039   if (SYMBOL_REF_P (callee)
7040       ? (aarch64_is_long_call_p (callee)
7041          || aarch64_is_noplt_call_p (callee))
7042       : !REG_P (callee))
7043     XEXP (mem, 0) = force_reg (mode, callee);
7044
7045   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7046
7047   if (result != NULL_RTX)
7048     call = gen_rtx_SET (result, call);
7049
7050   if (sibcall)
7051     tmp = ret_rtx;
7052   else
7053     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7054
7055   vec = gen_rtvec (2, call, tmp);
7056   call = gen_rtx_PARALLEL (VOIDmode, vec);
7057
7058   aarch64_emit_call_insn (call);
7059 }
7060
7061 /* Emit call insn with PAT and do aarch64-specific handling.  */
7062
7063 void
7064 aarch64_emit_call_insn (rtx pat)
7065 {
7066   rtx insn = emit_call_insn (pat);
7067
7068   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7069   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7070   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7071 }
7072
7073 machine_mode
7074 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7075 {
7076   /* All floating point compares return CCFP if it is an equality
7077      comparison, and CCFPE otherwise.  */
7078   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
7079     {
7080       switch (code)
7081         {
7082         case EQ:
7083         case NE:
7084         case UNORDERED:
7085         case ORDERED:
7086         case UNLT:
7087         case UNLE:
7088         case UNGT:
7089         case UNGE:
7090         case UNEQ:
7091           return CCFPmode;
7092
7093         case LT:
7094         case LE:
7095         case GT:
7096         case GE:
7097         case LTGT:
7098           return CCFPEmode;
7099
7100         default:
7101           gcc_unreachable ();
7102         }
7103     }
7104
7105   /* Equality comparisons of short modes against zero can be performed
7106      using the TST instruction with the appropriate bitmask.  */
7107   if (y == const0_rtx && REG_P (x)
7108       && (code == EQ || code == NE)
7109       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
7110     return CC_NZmode;
7111
7112   /* Similarly, comparisons of zero_extends from shorter modes can
7113      be performed using an ANDS with an immediate mask.  */
7114   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
7115       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7116       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7117       && (code == EQ || code == NE))
7118     return CC_NZmode;
7119
7120   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7121       && y == const0_rtx
7122       && (code == EQ || code == NE || code == LT || code == GE)
7123       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
7124           || GET_CODE (x) == NEG
7125           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7126               && CONST_INT_P (XEXP (x, 2)))))
7127     return CC_NZmode;
7128
7129   /* A compare with a shifted operand.  Because of canonicalization,
7130      the comparison will have to be swapped when we emit the assembly
7131      code.  */
7132   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7133       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7134       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
7135           || GET_CODE (x) == LSHIFTRT
7136           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
7137     return CC_SWPmode;
7138
7139   /* Similarly for a negated operand, but we can only do this for
7140      equalities.  */
7141   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7142       && (REG_P (y) || GET_CODE (y) == SUBREG)
7143       && (code == EQ || code == NE)
7144       && GET_CODE (x) == NEG)
7145     return CC_Zmode;
7146
7147   /* A test for unsigned overflow.  */
7148   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7149       && code == NE
7150       && GET_CODE (x) == PLUS
7151       && GET_CODE (y) == ZERO_EXTEND)
7152     return CC_Cmode;
7153
7154   /* A test for signed overflow.  */
7155   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7156       && code == NE
7157       && GET_CODE (x) == PLUS
7158       && GET_CODE (y) == SIGN_EXTEND)
7159     return CC_Vmode;
7160
7161   /* For everything else, return CCmode.  */
7162   return CCmode;
7163 }
7164
7165 static int
7166 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7167
7168 int
7169 aarch64_get_condition_code (rtx x)
7170 {
7171   machine_mode mode = GET_MODE (XEXP (x, 0));
7172   enum rtx_code comp_code = GET_CODE (x);
7173
7174   if (GET_MODE_CLASS (mode) != MODE_CC)
7175     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7176   return aarch64_get_condition_code_1 (mode, comp_code);
7177 }
7178
7179 static int
7180 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7181 {
7182   switch (mode)
7183     {
7184     case E_CCFPmode:
7185     case E_CCFPEmode:
7186       switch (comp_code)
7187         {
7188         case GE: return AARCH64_GE;
7189         case GT: return AARCH64_GT;
7190         case LE: return AARCH64_LS;
7191         case LT: return AARCH64_MI;
7192         case NE: return AARCH64_NE;
7193         case EQ: return AARCH64_EQ;
7194         case ORDERED: return AARCH64_VC;
7195         case UNORDERED: return AARCH64_VS;
7196         case UNLT: return AARCH64_LT;
7197         case UNLE: return AARCH64_LE;
7198         case UNGT: return AARCH64_HI;
7199         case UNGE: return AARCH64_PL;
7200         default: return -1;
7201         }
7202       break;
7203
7204     case E_CCmode:
7205       switch (comp_code)
7206         {
7207         case NE: return AARCH64_NE;
7208         case EQ: return AARCH64_EQ;
7209         case GE: return AARCH64_GE;
7210         case GT: return AARCH64_GT;
7211         case LE: return AARCH64_LE;
7212         case LT: return AARCH64_LT;
7213         case GEU: return AARCH64_CS;
7214         case GTU: return AARCH64_HI;
7215         case LEU: return AARCH64_LS;
7216         case LTU: return AARCH64_CC;
7217         default: return -1;
7218         }
7219       break;
7220
7221     case E_CC_SWPmode:
7222       switch (comp_code)
7223         {
7224         case NE: return AARCH64_NE;
7225         case EQ: return AARCH64_EQ;
7226         case GE: return AARCH64_LE;
7227         case GT: return AARCH64_LT;
7228         case LE: return AARCH64_GE;
7229         case LT: return AARCH64_GT;
7230         case GEU: return AARCH64_LS;
7231         case GTU: return AARCH64_CC;
7232         case LEU: return AARCH64_CS;
7233         case LTU: return AARCH64_HI;
7234         default: return -1;
7235         }
7236       break;
7237
7238     case E_CC_NZmode:
7239       switch (comp_code)
7240         {
7241         case NE: return AARCH64_NE;
7242         case EQ: return AARCH64_EQ;
7243         case GE: return AARCH64_PL;
7244         case LT: return AARCH64_MI;
7245         default: return -1;
7246         }
7247       break;
7248
7249     case E_CC_Zmode:
7250       switch (comp_code)
7251         {
7252         case NE: return AARCH64_NE;
7253         case EQ: return AARCH64_EQ;
7254         default: return -1;
7255         }
7256       break;
7257
7258     case E_CC_Cmode:
7259       switch (comp_code)
7260         {
7261         case NE: return AARCH64_CS;
7262         case EQ: return AARCH64_CC;
7263         default: return -1;
7264         }
7265       break;
7266
7267     case E_CC_Vmode:
7268       switch (comp_code)
7269         {
7270         case NE: return AARCH64_VS;
7271         case EQ: return AARCH64_VC;
7272         default: return -1;
7273         }
7274       break;
7275
7276     default:
7277       return -1;
7278     }
7279
7280   return -1;
7281 }
7282
7283 bool
7284 aarch64_const_vec_all_same_in_range_p (rtx x,
7285                                        HOST_WIDE_INT minval,
7286                                        HOST_WIDE_INT maxval)
7287 {
7288   rtx elt;
7289   return (const_vec_duplicate_p (x, &elt)
7290           && CONST_INT_P (elt)
7291           && IN_RANGE (INTVAL (elt), minval, maxval));
7292 }
7293
7294 bool
7295 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7296 {
7297   return aarch64_const_vec_all_same_in_range_p (x, val, val);
7298 }
7299
7300 /* Return true if VEC is a constant in which every element is in the range
7301    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
7302
7303 static bool
7304 aarch64_const_vec_all_in_range_p (rtx vec,
7305                                   HOST_WIDE_INT minval,
7306                                   HOST_WIDE_INT maxval)
7307 {
7308   if (GET_CODE (vec) != CONST_VECTOR
7309       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7310     return false;
7311
7312   int nunits;
7313   if (!CONST_VECTOR_STEPPED_P (vec))
7314     nunits = const_vector_encoded_nelts (vec);
7315   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7316     return false;
7317
7318   for (int i = 0; i < nunits; i++)
7319     {
7320       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7321       if (!CONST_INT_P (vec_elem)
7322           || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7323         return false;
7324     }
7325   return true;
7326 }
7327
7328 /* N Z C V.  */
7329 #define AARCH64_CC_V 1
7330 #define AARCH64_CC_C (1 << 1)
7331 #define AARCH64_CC_Z (1 << 2)
7332 #define AARCH64_CC_N (1 << 3)
7333
7334 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
7335 static const int aarch64_nzcv_codes[] =
7336 {
7337   0,            /* EQ, Z == 1.  */
7338   AARCH64_CC_Z, /* NE, Z == 0.  */
7339   0,            /* CS, C == 1.  */
7340   AARCH64_CC_C, /* CC, C == 0.  */
7341   0,            /* MI, N == 1.  */
7342   AARCH64_CC_N, /* PL, N == 0.  */
7343   0,            /* VS, V == 1.  */
7344   AARCH64_CC_V, /* VC, V == 0.  */
7345   0,            /* HI, C ==1 && Z == 0.  */
7346   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
7347   AARCH64_CC_V, /* GE, N == V.  */
7348   0,            /* LT, N != V.  */
7349   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
7350   0,            /* LE, !(Z == 0 && N == V).  */
7351   0,            /* AL, Any.  */
7352   0             /* NV, Any.  */
7353 };
7354
7355 /* Print floating-point vector immediate operand X to F, negating it
7356    first if NEGATE is true.  Return true on success, false if it isn't
7357    a constant we can handle.  */
7358
7359 static bool
7360 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7361 {
7362   rtx elt;
7363
7364   if (!const_vec_duplicate_p (x, &elt))
7365     return false;
7366
7367   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7368   if (negate)
7369     r = real_value_negate (&r);
7370
7371   /* We only handle the SVE single-bit immediates here.  */
7372   if (real_equal (&r, &dconst0))
7373     asm_fprintf (f, "0.0");
7374   else if (real_equal (&r, &dconst1))
7375     asm_fprintf (f, "1.0");
7376   else if (real_equal (&r, &dconsthalf))
7377     asm_fprintf (f, "0.5");
7378   else
7379     return false;
7380
7381   return true;
7382 }
7383
7384 /* Return the equivalent letter for size.  */
7385 static char
7386 sizetochar (int size)
7387 {
7388   switch (size)
7389     {
7390     case 64: return 'd';
7391     case 32: return 's';
7392     case 16: return 'h';
7393     case 8 : return 'b';
7394     default: gcc_unreachable ();
7395     }
7396 }
7397
7398 /* Print operand X to file F in a target specific manner according to CODE.
7399    The acceptable formatting commands given by CODE are:
7400      'c':               An integer or symbol address without a preceding #
7401                         sign.
7402      'C':               Take the duplicated element in a vector constant
7403                         and print it in hex.
7404      'D':               Take the duplicated element in a vector constant
7405                         and print it as an unsigned integer, in decimal.
7406      'e':               Print the sign/zero-extend size as a character 8->b,
7407                         16->h, 32->w.
7408      'p':               Prints N such that 2^N == X (X must be power of 2 and
7409                         const int).
7410      'P':               Print the number of non-zero bits in X (a const_int).
7411      'H':               Print the higher numbered register of a pair (TImode)
7412                         of regs.
7413      'm':               Print a condition (eq, ne, etc).
7414      'M':               Same as 'm', but invert condition.
7415      'N':               Take the duplicated element in a vector constant
7416                         and print the negative of it in decimal.
7417      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
7418      'S/T/U/V':         Print a FP/SIMD register name for a register list.
7419                         The register printed is the FP/SIMD register name
7420                         of X + 0/1/2/3 for S/T/U/V.
7421      'R':               Print a scalar FP/SIMD register name + 1.
7422      'X':               Print bottom 16 bits of integer constant in hex.
7423      'w/x':             Print a general register name or the zero register
7424                         (32-bit or 64-bit).
7425      '0':               Print a normal operand, if it's a general register,
7426                         then we assume DImode.
7427      'k':               Print NZCV for conditional compare instructions.
7428      'A':               Output address constant representing the first
7429                         argument of X, specifying a relocation offset
7430                         if appropriate.
7431      'L':               Output constant address specified by X
7432                         with a relocation offset if appropriate.
7433      'G':               Prints address of X, specifying a PC relative
7434                         relocation mode if appropriate.
7435      'y':               Output address of LDP or STP - this is used for
7436                         some LDP/STPs which don't use a PARALLEL in their
7437                         pattern (so the mode needs to be adjusted).
7438      'z':               Output address of a typical LDP or STP.  */
7439
7440 static void
7441 aarch64_print_operand (FILE *f, rtx x, int code)
7442 {
7443   rtx elt;
7444   switch (code)
7445     {
7446     case 'c':
7447       switch (GET_CODE (x))
7448         {
7449         case CONST_INT:
7450           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7451           break;
7452
7453         case SYMBOL_REF:
7454           output_addr_const (f, x);
7455           break;
7456
7457         case CONST:
7458           if (GET_CODE (XEXP (x, 0)) == PLUS
7459               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7460             {
7461               output_addr_const (f, x);
7462               break;
7463             }
7464           /* Fall through.  */
7465
7466         default:
7467           output_operand_lossage ("unsupported operand for code '%c'", code);
7468         }
7469       break;
7470
7471     case 'e':
7472       {
7473         int n;
7474
7475         if (!CONST_INT_P (x)
7476             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7477           {
7478             output_operand_lossage ("invalid operand for '%%%c'", code);
7479             return;
7480           }
7481
7482         switch (n)
7483           {
7484           case 3:
7485             fputc ('b', f);
7486             break;
7487           case 4:
7488             fputc ('h', f);
7489             break;
7490           case 5:
7491             fputc ('w', f);
7492             break;
7493           default:
7494             output_operand_lossage ("invalid operand for '%%%c'", code);
7495             return;
7496           }
7497       }
7498       break;
7499
7500     case 'p':
7501       {
7502         int n;
7503
7504         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7505           {
7506             output_operand_lossage ("invalid operand for '%%%c'", code);
7507             return;
7508           }
7509
7510         asm_fprintf (f, "%d", n);
7511       }
7512       break;
7513
7514     case 'P':
7515       if (!CONST_INT_P (x))
7516         {
7517           output_operand_lossage ("invalid operand for '%%%c'", code);
7518           return;
7519         }
7520
7521       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7522       break;
7523
7524     case 'H':
7525       if (x == const0_rtx)
7526         {
7527           asm_fprintf (f, "xzr");
7528           break;
7529         }
7530
7531       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7532         {
7533           output_operand_lossage ("invalid operand for '%%%c'", code);
7534           return;
7535         }
7536
7537       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7538       break;
7539
7540     case 'M':
7541     case 'm':
7542       {
7543         int cond_code;
7544         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
7545         if (x == const_true_rtx)
7546           {
7547             if (code == 'M')
7548               fputs ("nv", f);
7549             return;
7550           }
7551
7552         if (!COMPARISON_P (x))
7553           {
7554             output_operand_lossage ("invalid operand for '%%%c'", code);
7555             return;
7556           }
7557
7558         cond_code = aarch64_get_condition_code (x);
7559         gcc_assert (cond_code >= 0);
7560         if (code == 'M')
7561           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7562         fputs (aarch64_condition_codes[cond_code], f);
7563       }
7564       break;
7565
7566     case 'N':
7567       if (!const_vec_duplicate_p (x, &elt))
7568         {
7569           output_operand_lossage ("invalid vector constant");
7570           return;
7571         }
7572
7573       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7574         asm_fprintf (f, "%wd", -INTVAL (elt));
7575       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7576                && aarch64_print_vector_float_operand (f, x, true))
7577         ;
7578       else
7579         {
7580           output_operand_lossage ("invalid vector constant");
7581           return;
7582         }
7583       break;
7584
7585     case 'b':
7586     case 'h':
7587     case 's':
7588     case 'd':
7589     case 'q':
7590       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7591         {
7592           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7593           return;
7594         }
7595       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7596       break;
7597
7598     case 'S':
7599     case 'T':
7600     case 'U':
7601     case 'V':
7602       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7603         {
7604           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7605           return;
7606         }
7607       asm_fprintf (f, "%c%d",
7608                    aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7609                    REGNO (x) - V0_REGNUM + (code - 'S'));
7610       break;
7611
7612     case 'R':
7613       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7614         {
7615           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7616           return;
7617         }
7618       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7619       break;
7620
7621     case 'X':
7622       if (!CONST_INT_P (x))
7623         {
7624           output_operand_lossage ("invalid operand for '%%%c'", code);
7625           return;
7626         }
7627       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7628       break;
7629
7630     case 'C':
7631       {
7632         /* Print a replicated constant in hex.  */
7633         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7634           {
7635             output_operand_lossage ("invalid operand for '%%%c'", code);
7636             return;
7637           }
7638         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7639         asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7640       }
7641       break;
7642
7643     case 'D':
7644       {
7645         /* Print a replicated constant in decimal, treating it as
7646            unsigned.  */
7647         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7648           {
7649             output_operand_lossage ("invalid operand for '%%%c'", code);
7650             return;
7651           }
7652         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7653         asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7654       }
7655       break;
7656
7657     case 'w':
7658     case 'x':
7659       if (x == const0_rtx
7660           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7661         {
7662           asm_fprintf (f, "%czr", code);
7663           break;
7664         }
7665
7666       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7667         {
7668           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7669           break;
7670         }
7671
7672       if (REG_P (x) && REGNO (x) == SP_REGNUM)
7673         {
7674           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7675           break;
7676         }
7677
7678       /* Fall through */
7679
7680     case 0:
7681       if (x == NULL)
7682         {
7683           output_operand_lossage ("missing operand");
7684           return;
7685         }
7686
7687       switch (GET_CODE (x))
7688         {
7689         case REG:
7690           if (aarch64_sve_data_mode_p (GET_MODE (x)))
7691             {
7692               if (REG_NREGS (x) == 1)
7693                 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7694               else
7695                 {
7696                   char suffix
7697                     = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7698                   asm_fprintf (f, "{z%d.%c - z%d.%c}",
7699                                REGNO (x) - V0_REGNUM, suffix,
7700                                END_REGNO (x) - V0_REGNUM - 1, suffix);
7701                 }
7702             }
7703           else
7704             asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7705           break;
7706
7707         case MEM:
7708           output_address (GET_MODE (x), XEXP (x, 0));
7709           break;
7710
7711         case LABEL_REF:
7712         case SYMBOL_REF:
7713           output_addr_const (asm_out_file, x);
7714           break;
7715
7716         case CONST_INT:
7717           asm_fprintf (f, "%wd", INTVAL (x));
7718           break;
7719
7720         case CONST:
7721           if (!VECTOR_MODE_P (GET_MODE (x)))
7722             {
7723               output_addr_const (asm_out_file, x);
7724               break;
7725             }
7726           /* fall through */
7727
7728         case CONST_VECTOR:
7729           if (!const_vec_duplicate_p (x, &elt))
7730             {
7731               output_operand_lossage ("invalid vector constant");
7732               return;
7733             }
7734
7735           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7736             asm_fprintf (f, "%wd", INTVAL (elt));
7737           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7738                    && aarch64_print_vector_float_operand (f, x, false))
7739             ;
7740           else
7741             {
7742               output_operand_lossage ("invalid vector constant");
7743               return;
7744             }
7745           break;
7746
7747         case CONST_DOUBLE:
7748           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7749              be getting CONST_DOUBLEs holding integers.  */
7750           gcc_assert (GET_MODE (x) != VOIDmode);
7751           if (aarch64_float_const_zero_rtx_p (x))
7752             {
7753               fputc ('0', f);
7754               break;
7755             }
7756           else if (aarch64_float_const_representable_p (x))
7757             {
7758 #define buf_size 20
7759               char float_buf[buf_size] = {'\0'};
7760               real_to_decimal_for_mode (float_buf,
7761                                         CONST_DOUBLE_REAL_VALUE (x),
7762                                         buf_size, buf_size,
7763                                         1, GET_MODE (x));
7764               asm_fprintf (asm_out_file, "%s", float_buf);
7765               break;
7766 #undef buf_size
7767             }
7768           output_operand_lossage ("invalid constant");
7769           return;
7770         default:
7771           output_operand_lossage ("invalid operand");
7772           return;
7773         }
7774       break;
7775
7776     case 'A':
7777       if (GET_CODE (x) == HIGH)
7778         x = XEXP (x, 0);
7779
7780       switch (aarch64_classify_symbolic_expression (x))
7781         {
7782         case SYMBOL_SMALL_GOT_4G:
7783           asm_fprintf (asm_out_file, ":got:");
7784           break;
7785
7786         case SYMBOL_SMALL_TLSGD:
7787           asm_fprintf (asm_out_file, ":tlsgd:");
7788           break;
7789
7790         case SYMBOL_SMALL_TLSDESC:
7791           asm_fprintf (asm_out_file, ":tlsdesc:");
7792           break;
7793
7794         case SYMBOL_SMALL_TLSIE:
7795           asm_fprintf (asm_out_file, ":gottprel:");
7796           break;
7797
7798         case SYMBOL_TLSLE24:
7799           asm_fprintf (asm_out_file, ":tprel:");
7800           break;
7801
7802         case SYMBOL_TINY_GOT:
7803           gcc_unreachable ();
7804           break;
7805
7806         default:
7807           break;
7808         }
7809       output_addr_const (asm_out_file, x);
7810       break;
7811
7812     case 'L':
7813       switch (aarch64_classify_symbolic_expression (x))
7814         {
7815         case SYMBOL_SMALL_GOT_4G:
7816           asm_fprintf (asm_out_file, ":lo12:");
7817           break;
7818
7819         case SYMBOL_SMALL_TLSGD:
7820           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7821           break;
7822
7823         case SYMBOL_SMALL_TLSDESC:
7824           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7825           break;
7826
7827         case SYMBOL_SMALL_TLSIE:
7828           asm_fprintf (asm_out_file, ":gottprel_lo12:");
7829           break;
7830
7831         case SYMBOL_TLSLE12:
7832           asm_fprintf (asm_out_file, ":tprel_lo12:");
7833           break;
7834
7835         case SYMBOL_TLSLE24:
7836           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7837           break;
7838
7839         case SYMBOL_TINY_GOT:
7840           asm_fprintf (asm_out_file, ":got:");
7841           break;
7842
7843         case SYMBOL_TINY_TLSIE:
7844           asm_fprintf (asm_out_file, ":gottprel:");
7845           break;
7846
7847         default:
7848           break;
7849         }
7850       output_addr_const (asm_out_file, x);
7851       break;
7852
7853     case 'G':
7854       switch (aarch64_classify_symbolic_expression (x))
7855         {
7856         case SYMBOL_TLSLE24:
7857           asm_fprintf (asm_out_file, ":tprel_hi12:");
7858           break;
7859         default:
7860           break;
7861         }
7862       output_addr_const (asm_out_file, x);
7863       break;
7864
7865     case 'k':
7866       {
7867         HOST_WIDE_INT cond_code;
7868
7869         if (!CONST_INT_P (x))
7870           {
7871             output_operand_lossage ("invalid operand for '%%%c'", code);
7872             return;
7873           }
7874
7875         cond_code = INTVAL (x);
7876         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7877         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7878       }
7879       break;
7880
7881     case 'y':
7882     case 'z':
7883       {
7884         machine_mode mode = GET_MODE (x);
7885
7886         if (GET_CODE (x) != MEM
7887             || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7888           {
7889             output_operand_lossage ("invalid operand for '%%%c'", code);
7890             return;
7891           }
7892
7893         if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7894                                             code == 'y'
7895                                             ? ADDR_QUERY_LDP_STP_N
7896                                             : ADDR_QUERY_LDP_STP))
7897           output_operand_lossage ("invalid operand prefix '%%%c'", code);
7898       }
7899       break;
7900
7901     default:
7902       output_operand_lossage ("invalid operand prefix '%%%c'", code);
7903       return;
7904     }
7905 }
7906
7907 /* Print address 'x' of a memory access with mode 'mode'.
7908    'op' is the context required by aarch64_classify_address.  It can either be
7909    MEM for a normal memory access or PARALLEL for LDP/STP.  */
7910 static bool
7911 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7912                                 aarch64_addr_query_type type)
7913 {
7914   struct aarch64_address_info addr;
7915   unsigned int size;
7916
7917   /* Check all addresses are Pmode - including ILP32.  */
7918   if (GET_MODE (x) != Pmode
7919       && (!CONST_INT_P (x)
7920           || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7921     {
7922       output_operand_lossage ("invalid address mode");
7923       return false;
7924     }
7925
7926   if (aarch64_classify_address (&addr, x, mode, true, type))
7927     switch (addr.type)
7928       {
7929       case ADDRESS_REG_IMM:
7930         if (known_eq (addr.const_offset, 0))
7931           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7932         else if (aarch64_sve_data_mode_p (mode))
7933           {
7934             HOST_WIDE_INT vnum
7935               = exact_div (addr.const_offset,
7936                            BYTES_PER_SVE_VECTOR).to_constant ();
7937             asm_fprintf (f, "[%s, #%wd, mul vl]",
7938                          reg_names[REGNO (addr.base)], vnum);
7939           }
7940         else if (aarch64_sve_pred_mode_p (mode))
7941           {
7942             HOST_WIDE_INT vnum
7943               = exact_div (addr.const_offset,
7944                            BYTES_PER_SVE_PRED).to_constant ();
7945             asm_fprintf (f, "[%s, #%wd, mul vl]",
7946                          reg_names[REGNO (addr.base)], vnum);
7947           }
7948         else
7949           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7950                        INTVAL (addr.offset));
7951         return true;
7952
7953       case ADDRESS_REG_REG:
7954         if (addr.shift == 0)
7955           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7956                        reg_names [REGNO (addr.offset)]);
7957         else
7958           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7959                        reg_names [REGNO (addr.offset)], addr.shift);
7960         return true;
7961
7962       case ADDRESS_REG_UXTW:
7963         if (addr.shift == 0)
7964           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7965                        REGNO (addr.offset) - R0_REGNUM);
7966         else
7967           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7968                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7969         return true;
7970
7971       case ADDRESS_REG_SXTW:
7972         if (addr.shift == 0)
7973           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7974                        REGNO (addr.offset) - R0_REGNUM);
7975         else
7976           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7977                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7978         return true;
7979
7980       case ADDRESS_REG_WB:
7981         /* Writeback is only supported for fixed-width modes.  */
7982         size = GET_MODE_SIZE (mode).to_constant ();
7983         switch (GET_CODE (x))
7984           {
7985           case PRE_INC:
7986             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7987             return true;
7988           case POST_INC:
7989             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7990             return true;
7991           case PRE_DEC:
7992             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7993             return true;
7994           case POST_DEC:
7995             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7996             return true;
7997           case PRE_MODIFY:
7998             asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7999                          INTVAL (addr.offset));
8000             return true;
8001           case POST_MODIFY:
8002             asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8003                          INTVAL (addr.offset));
8004             return true;
8005           default:
8006             break;
8007           }
8008         break;
8009
8010       case ADDRESS_LO_SUM:
8011         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8012         output_addr_const (f, addr.offset);
8013         asm_fprintf (f, "]");
8014         return true;
8015
8016       case ADDRESS_SYMBOLIC:
8017         output_addr_const (f, x);
8018         return true;
8019       }
8020
8021   return false;
8022 }
8023
8024 /* Print address 'x' of a memory access with mode 'mode'.  */
8025 static void
8026 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8027 {
8028   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8029     output_addr_const (f, x);
8030 }
8031
8032 bool
8033 aarch64_label_mentioned_p (rtx x)
8034 {
8035   const char *fmt;
8036   int i;
8037
8038   if (GET_CODE (x) == LABEL_REF)
8039     return true;
8040
8041   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8042      referencing instruction, but they are constant offsets, not
8043      symbols.  */
8044   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8045     return false;
8046
8047   fmt = GET_RTX_FORMAT (GET_CODE (x));
8048   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8049     {
8050       if (fmt[i] == 'E')
8051         {
8052           int j;
8053
8054           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8055             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8056               return 1;
8057         }
8058       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8059         return 1;
8060     }
8061
8062   return 0;
8063 }
8064
8065 /* Implement REGNO_REG_CLASS.  */
8066
8067 enum reg_class
8068 aarch64_regno_regclass (unsigned regno)
8069 {
8070   if (GP_REGNUM_P (regno))
8071     return GENERAL_REGS;
8072
8073   if (regno == SP_REGNUM)
8074     return STACK_REG;
8075
8076   if (regno == FRAME_POINTER_REGNUM
8077       || regno == ARG_POINTER_REGNUM)
8078     return POINTER_REGS;
8079
8080   if (FP_REGNUM_P (regno))
8081     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
8082
8083   if (PR_REGNUM_P (regno))
8084     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8085
8086   return NO_REGS;
8087 }
8088
8089 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8090    If OFFSET is out of range, return an offset of an anchor point
8091    that is in range.  Return 0 otherwise.  */
8092
8093 static HOST_WIDE_INT
8094 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8095                        machine_mode mode)
8096 {
8097   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
8098   if (size > 16)
8099     return (offset + 0x400) & ~0x7f0;
8100
8101   /* For offsets that aren't a multiple of the access size, the limit is
8102      -256...255.  */
8103   if (offset & (size - 1))
8104     {
8105       /* BLKmode typically uses LDP of X-registers.  */
8106       if (mode == BLKmode)
8107         return (offset + 512) & ~0x3ff;
8108       return (offset + 0x100) & ~0x1ff;
8109     }
8110
8111   /* Small negative offsets are supported.  */
8112   if (IN_RANGE (offset, -256, 0))
8113     return 0;
8114
8115   if (mode == TImode || mode == TFmode)
8116     return (offset + 0x100) & ~0x1ff;
8117
8118   /* Use 12-bit offset by access size.  */
8119   return offset & (~0xfff * size);
8120 }
8121
8122 static rtx
8123 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
8124 {
8125   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8126      where mask is selected by alignment and size of the offset.
8127      We try to pick as large a range for the offset as possible to
8128      maximize the chance of a CSE.  However, for aligned addresses
8129      we limit the range to 4k so that structures with different sized
8130      elements are likely to use the same base.  We need to be careful
8131      not to split a CONST for some forms of address expression, otherwise
8132      it will generate sub-optimal code.  */
8133
8134   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8135     {
8136       rtx base = XEXP (x, 0);
8137       rtx offset_rtx = XEXP (x, 1);
8138       HOST_WIDE_INT offset = INTVAL (offset_rtx);
8139
8140       if (GET_CODE (base) == PLUS)
8141         {
8142           rtx op0 = XEXP (base, 0);
8143           rtx op1 = XEXP (base, 1);
8144
8145           /* Force any scaling into a temp for CSE.  */
8146           op0 = force_reg (Pmode, op0);
8147           op1 = force_reg (Pmode, op1);
8148
8149           /* Let the pointer register be in op0.  */
8150           if (REG_POINTER (op1))
8151             std::swap (op0, op1);
8152
8153           /* If the pointer is virtual or frame related, then we know that
8154              virtual register instantiation or register elimination is going
8155              to apply a second constant.  We want the two constants folded
8156              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
8157           if (virt_or_elim_regno_p (REGNO (op0)))
8158             {
8159               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8160                                    NULL_RTX, true, OPTAB_DIRECT);
8161               return gen_rtx_PLUS (Pmode, base, op1);
8162             }
8163
8164           /* Otherwise, in order to encourage CSE (and thence loop strength
8165              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
8166           base = expand_binop (Pmode, add_optab, op0, op1,
8167                                NULL_RTX, true, OPTAB_DIRECT);
8168           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8169         }
8170
8171       HOST_WIDE_INT size;
8172       if (GET_MODE_SIZE (mode).is_constant (&size))
8173         {
8174           HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8175                                                              mode);
8176           if (base_offset != 0)
8177             {
8178               base = plus_constant (Pmode, base, base_offset);
8179               base = force_operand (base, NULL_RTX);
8180               return plus_constant (Pmode, base, offset - base_offset);
8181             }
8182         }
8183     }
8184
8185   return x;
8186 }
8187
8188 static reg_class_t
8189 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8190                           reg_class_t rclass,
8191                           machine_mode mode,
8192                           secondary_reload_info *sri)
8193 {
8194   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8195      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
8196      comment at the head of aarch64-sve.md for more details about the
8197      big-endian handling.  */
8198   if (BYTES_BIG_ENDIAN
8199       && reg_class_subset_p (rclass, FP_REGS)
8200       && !((REG_P (x) && HARD_REGISTER_P (x))
8201            || aarch64_simd_valid_immediate (x, NULL))
8202       && aarch64_sve_data_mode_p (mode))
8203     {
8204       sri->icode = CODE_FOR_aarch64_sve_reload_be;
8205       return NO_REGS;
8206     }
8207
8208   /* If we have to disable direct literal pool loads and stores because the
8209      function is too big, then we need a scratch register.  */
8210   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8211       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8212           || targetm.vector_mode_supported_p (GET_MODE (x)))
8213       && !aarch64_pcrelative_literal_loads)
8214     {
8215       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8216       return NO_REGS;
8217     }
8218
8219   /* Without the TARGET_SIMD instructions we cannot move a Q register
8220      to a Q register directly.  We need a scratch.  */
8221   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8222       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8223       && reg_class_subset_p (rclass, FP_REGS))
8224     {
8225       sri->icode = code_for_aarch64_reload_mov (mode);
8226       return NO_REGS;
8227     }
8228
8229   /* A TFmode or TImode memory access should be handled via an FP_REGS
8230      because AArch64 has richer addressing modes for LDR/STR instructions
8231      than LDP/STP instructions.  */
8232   if (TARGET_FLOAT && rclass == GENERAL_REGS
8233       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8234     return FP_REGS;
8235
8236   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8237       return GENERAL_REGS;
8238
8239   return NO_REGS;
8240 }
8241
8242 static bool
8243 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8244 {
8245   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8246
8247   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8248      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
8249   if (frame_pointer_needed)
8250     return to == HARD_FRAME_POINTER_REGNUM;
8251   return true;
8252 }
8253
8254 poly_int64
8255 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8256 {
8257   if (to == HARD_FRAME_POINTER_REGNUM)
8258     {
8259       if (from == ARG_POINTER_REGNUM)
8260         return cfun->machine->frame.hard_fp_offset;
8261
8262       if (from == FRAME_POINTER_REGNUM)
8263         return cfun->machine->frame.hard_fp_offset
8264                - cfun->machine->frame.locals_offset;
8265     }
8266
8267   if (to == STACK_POINTER_REGNUM)
8268     {
8269       if (from == FRAME_POINTER_REGNUM)
8270           return cfun->machine->frame.frame_size
8271                  - cfun->machine->frame.locals_offset;
8272     }
8273
8274   return cfun->machine->frame.frame_size;
8275 }
8276
8277 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
8278    previous frame.  */
8279
8280 rtx
8281 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8282 {
8283   if (count != 0)
8284     return const0_rtx;
8285   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8286 }
8287
8288
8289 static void
8290 aarch64_asm_trampoline_template (FILE *f)
8291 {
8292   int offset1 = 16;
8293   int offset2 = 20;
8294
8295   if (aarch64_bti_enabled ())
8296     {
8297       asm_fprintf (f, "\thint\t34 // bti c\n");
8298       offset1 -= 4;
8299       offset2 -= 4;
8300     }
8301
8302   if (TARGET_ILP32)
8303     {
8304       asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8305       asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8306                    offset1);
8307     }
8308   else
8309     {
8310       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8311       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8312                    offset2);
8313     }
8314   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8315
8316   /* The trampoline needs an extra padding instruction.  In case if BTI is
8317      enabled the padding instruction is replaced by the BTI instruction at
8318      the beginning.  */
8319   if (!aarch64_bti_enabled ())
8320     assemble_aligned_integer (4, const0_rtx);
8321
8322   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8323   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8324 }
8325
8326 static void
8327 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8328 {
8329   rtx fnaddr, mem, a_tramp;
8330   const int tramp_code_sz = 16;
8331
8332   /* Don't need to copy the trailing D-words, we fill those in below.  */
8333   emit_block_move (m_tramp, assemble_trampoline_template (),
8334                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8335   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8336   fnaddr = XEXP (DECL_RTL (fndecl), 0);
8337   if (GET_MODE (fnaddr) != ptr_mode)
8338     fnaddr = convert_memory_address (ptr_mode, fnaddr);
8339   emit_move_insn (mem, fnaddr);
8340
8341   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8342   emit_move_insn (mem, chain_value);
8343
8344   /* XXX We should really define a "clear_cache" pattern and use
8345      gen_clear_cache().  */
8346   a_tramp = XEXP (m_tramp, 0);
8347   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8348                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8349                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8350                      ptr_mode);
8351 }
8352
8353 static unsigned char
8354 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8355 {
8356   /* ??? Logically we should only need to provide a value when
8357      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8358      can hold MODE, but at the moment we need to handle all modes.
8359      Just ignore any runtime parts for registers that can't store them.  */
8360   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8361   unsigned int nregs;
8362   switch (regclass)
8363     {
8364     case TAILCALL_ADDR_REGS:
8365     case POINTER_REGS:
8366     case GENERAL_REGS:
8367     case ALL_REGS:
8368     case POINTER_AND_FP_REGS:
8369     case FP_REGS:
8370     case FP_LO_REGS:
8371       if (aarch64_sve_data_mode_p (mode)
8372           && constant_multiple_p (GET_MODE_SIZE (mode),
8373                                   BYTES_PER_SVE_VECTOR, &nregs))
8374         return nregs;
8375       return (aarch64_vector_data_mode_p (mode)
8376               ? CEIL (lowest_size, UNITS_PER_VREG)
8377               : CEIL (lowest_size, UNITS_PER_WORD));
8378     case STACK_REG:
8379     case PR_REGS:
8380     case PR_LO_REGS:
8381     case PR_HI_REGS:
8382       return 1;
8383
8384     case NO_REGS:
8385       return 0;
8386
8387     default:
8388       break;
8389     }
8390   gcc_unreachable ();
8391 }
8392
8393 static reg_class_t
8394 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8395 {
8396   if (regclass == POINTER_REGS)
8397     return GENERAL_REGS;
8398
8399   if (regclass == STACK_REG)
8400     {
8401       if (REG_P(x)
8402           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8403           return regclass;
8404
8405       return NO_REGS;
8406     }
8407
8408   /* Register eliminiation can result in a request for
8409      SP+constant->FP_REGS.  We cannot support such operations which
8410      use SP as source and an FP_REG as destination, so reject out
8411      right now.  */
8412   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8413     {
8414       rtx lhs = XEXP (x, 0);
8415
8416       /* Look through a possible SUBREG introduced by ILP32.  */
8417       if (GET_CODE (lhs) == SUBREG)
8418         lhs = SUBREG_REG (lhs);
8419
8420       gcc_assert (REG_P (lhs));
8421       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8422                                       POINTER_REGS));
8423       return NO_REGS;
8424     }
8425
8426   return regclass;
8427 }
8428
8429 void
8430 aarch64_asm_output_labelref (FILE* f, const char *name)
8431 {
8432   asm_fprintf (f, "%U%s", name);
8433 }
8434
8435 static void
8436 aarch64_elf_asm_constructor (rtx symbol, int priority)
8437 {
8438   if (priority == DEFAULT_INIT_PRIORITY)
8439     default_ctor_section_asm_out_constructor (symbol, priority);
8440   else
8441     {
8442       section *s;
8443       /* While priority is known to be in range [0, 65535], so 18 bytes
8444          would be enough, the compiler might not know that.  To avoid
8445          -Wformat-truncation false positive, use a larger size.  */
8446       char buf[23];
8447       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8448       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8449       switch_to_section (s);
8450       assemble_align (POINTER_SIZE);
8451       assemble_aligned_integer (POINTER_BYTES, symbol);
8452     }
8453 }
8454
8455 static void
8456 aarch64_elf_asm_destructor (rtx symbol, int priority)
8457 {
8458   if (priority == DEFAULT_INIT_PRIORITY)
8459     default_dtor_section_asm_out_destructor (symbol, priority);
8460   else
8461     {
8462       section *s;
8463       /* While priority is known to be in range [0, 65535], so 18 bytes
8464          would be enough, the compiler might not know that.  To avoid
8465          -Wformat-truncation false positive, use a larger size.  */
8466       char buf[23];
8467       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8468       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8469       switch_to_section (s);
8470       assemble_align (POINTER_SIZE);
8471       assemble_aligned_integer (POINTER_BYTES, symbol);
8472     }
8473 }
8474
8475 const char*
8476 aarch64_output_casesi (rtx *operands)
8477 {
8478   char buf[100];
8479   char label[100];
8480   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8481   int index;
8482   static const char *const patterns[4][2] =
8483   {
8484     {
8485       "ldrb\t%w3, [%0,%w1,uxtw]",
8486       "add\t%3, %4, %w3, sxtb #2"
8487     },
8488     {
8489       "ldrh\t%w3, [%0,%w1,uxtw #1]",
8490       "add\t%3, %4, %w3, sxth #2"
8491     },
8492     {
8493       "ldr\t%w3, [%0,%w1,uxtw #2]",
8494       "add\t%3, %4, %w3, sxtw #2"
8495     },
8496     /* We assume that DImode is only generated when not optimizing and
8497        that we don't really need 64-bit address offsets.  That would
8498        imply an object file with 8GB of code in a single function!  */
8499     {
8500       "ldr\t%w3, [%0,%w1,uxtw #2]",
8501       "add\t%3, %4, %w3, sxtw #2"
8502     }
8503   };
8504
8505   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8506
8507   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8508   index = exact_log2 (GET_MODE_SIZE (mode));
8509
8510   gcc_assert (index >= 0 && index <= 3);
8511
8512   /* Need to implement table size reduction, by chaning the code below.  */
8513   output_asm_insn (patterns[index][0], operands);
8514   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8515   snprintf (buf, sizeof (buf),
8516             "adr\t%%4, %s", targetm.strip_name_encoding (label));
8517   output_asm_insn (buf, operands);
8518   output_asm_insn (patterns[index][1], operands);
8519   output_asm_insn ("br\t%3", operands);
8520   assemble_label (asm_out_file, label);
8521   return "";
8522 }
8523
8524
8525 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8526    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8527    operator.  */
8528
8529 int
8530 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8531 {
8532   if (shift >= 0 && shift <= 3)
8533     {
8534       int size;
8535       for (size = 8; size <= 32; size *= 2)
8536         {
8537           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8538           if (mask == bits << shift)
8539             return size;
8540         }
8541     }
8542   return 0;
8543 }
8544
8545 /* Constant pools are per function only when PC relative
8546    literal loads are true or we are in the large memory
8547    model.  */
8548
8549 static inline bool
8550 aarch64_can_use_per_function_literal_pools_p (void)
8551 {
8552   return (aarch64_pcrelative_literal_loads
8553           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8554 }
8555
8556 static bool
8557 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8558 {
8559   /* We can't use blocks for constants when we're using a per-function
8560      constant pool.  */
8561   return !aarch64_can_use_per_function_literal_pools_p ();
8562 }
8563
8564 /* Select appropriate section for constants depending
8565    on where we place literal pools.  */
8566
8567 static section *
8568 aarch64_select_rtx_section (machine_mode mode,
8569                             rtx x,
8570                             unsigned HOST_WIDE_INT align)
8571 {
8572   if (aarch64_can_use_per_function_literal_pools_p ())
8573     return function_section (current_function_decl);
8574
8575   return default_elf_select_rtx_section (mode, x, align);
8576 }
8577
8578 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
8579 void
8580 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8581                                   HOST_WIDE_INT offset)
8582 {
8583   /* When using per-function literal pools, we must ensure that any code
8584      section is aligned to the minimal instruction length, lest we get
8585      errors from the assembler re "unaligned instructions".  */
8586   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8587     ASM_OUTPUT_ALIGN (f, 2);
8588 }
8589
8590 /* Costs.  */
8591
8592 /* Helper function for rtx cost calculation.  Strip a shift expression
8593    from X.  Returns the inner operand if successful, or the original
8594    expression on failure.  */
8595 static rtx
8596 aarch64_strip_shift (rtx x)
8597 {
8598   rtx op = x;
8599
8600   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8601      we can convert both to ROR during final output.  */
8602   if ((GET_CODE (op) == ASHIFT
8603        || GET_CODE (op) == ASHIFTRT
8604        || GET_CODE (op) == LSHIFTRT
8605        || GET_CODE (op) == ROTATERT
8606        || GET_CODE (op) == ROTATE)
8607       && CONST_INT_P (XEXP (op, 1)))
8608     return XEXP (op, 0);
8609
8610   if (GET_CODE (op) == MULT
8611       && CONST_INT_P (XEXP (op, 1))
8612       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8613     return XEXP (op, 0);
8614
8615   return x;
8616 }
8617
8618 /* Helper function for rtx cost calculation.  Strip an extend
8619    expression from X.  Returns the inner operand if successful, or the
8620    original expression on failure.  We deal with a number of possible
8621    canonicalization variations here. If STRIP_SHIFT is true, then
8622    we can strip off a shift also.  */
8623 static rtx
8624 aarch64_strip_extend (rtx x, bool strip_shift)
8625 {
8626   scalar_int_mode mode;
8627   rtx op = x;
8628
8629   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8630     return op;
8631
8632   /* Zero and sign extraction of a widened value.  */
8633   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8634       && XEXP (op, 2) == const0_rtx
8635       && GET_CODE (XEXP (op, 0)) == MULT
8636       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8637                                          XEXP (op, 1)))
8638     return XEXP (XEXP (op, 0), 0);
8639
8640   /* It can also be represented (for zero-extend) as an AND with an
8641      immediate.  */
8642   if (GET_CODE (op) == AND
8643       && GET_CODE (XEXP (op, 0)) == MULT
8644       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8645       && CONST_INT_P (XEXP (op, 1))
8646       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8647                            INTVAL (XEXP (op, 1))) != 0)
8648     return XEXP (XEXP (op, 0), 0);
8649
8650   /* Now handle extended register, as this may also have an optional
8651      left shift by 1..4.  */
8652   if (strip_shift
8653       && GET_CODE (op) == ASHIFT
8654       && CONST_INT_P (XEXP (op, 1))
8655       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8656     op = XEXP (op, 0);
8657
8658   if (GET_CODE (op) == ZERO_EXTEND
8659       || GET_CODE (op) == SIGN_EXTEND)
8660     op = XEXP (op, 0);
8661
8662   if (op != x)
8663     return op;
8664
8665   return x;
8666 }
8667
8668 /* Return true iff CODE is a shift supported in combination
8669    with arithmetic instructions.  */
8670
8671 static bool
8672 aarch64_shift_p (enum rtx_code code)
8673 {
8674   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8675 }
8676
8677
8678 /* Return true iff X is a cheap shift without a sign extend. */
8679
8680 static bool
8681 aarch64_cheap_mult_shift_p (rtx x)
8682 {
8683   rtx op0, op1;
8684
8685   op0 = XEXP (x, 0);
8686   op1 = XEXP (x, 1);
8687
8688   if (!(aarch64_tune_params.extra_tuning_flags
8689                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8690     return false;
8691
8692   if (GET_CODE (op0) == SIGN_EXTEND)
8693     return false;
8694
8695   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8696       && UINTVAL (op1) <= 4)
8697     return true;
8698
8699   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8700     return false;
8701
8702   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8703
8704   if (l2 > 0 && l2 <= 4)
8705     return true;
8706
8707   return false;
8708 }
8709
8710 /* Helper function for rtx cost calculation.  Calculate the cost of
8711    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8712    Return the calculated cost of the expression, recursing manually in to
8713    operands where needed.  */
8714
8715 static int
8716 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8717 {
8718   rtx op0, op1;
8719   const struct cpu_cost_table *extra_cost
8720     = aarch64_tune_params.insn_extra_cost;
8721   int cost = 0;
8722   bool compound_p = (outer == PLUS || outer == MINUS);
8723   machine_mode mode = GET_MODE (x);
8724
8725   gcc_checking_assert (code == MULT);
8726
8727   op0 = XEXP (x, 0);
8728   op1 = XEXP (x, 1);
8729
8730   if (VECTOR_MODE_P (mode))
8731     mode = GET_MODE_INNER (mode);
8732
8733   /* Integer multiply/fma.  */
8734   if (GET_MODE_CLASS (mode) == MODE_INT)
8735     {
8736       /* The multiply will be canonicalized as a shift, cost it as such.  */
8737       if (aarch64_shift_p (GET_CODE (x))
8738           || (CONST_INT_P (op1)
8739               && exact_log2 (INTVAL (op1)) > 0))
8740         {
8741           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8742                            || GET_CODE (op0) == SIGN_EXTEND;
8743           if (speed)
8744             {
8745               if (compound_p)
8746                 {
8747                   /* If the shift is considered cheap,
8748                      then don't add any cost. */
8749                   if (aarch64_cheap_mult_shift_p (x))
8750                     ;
8751                   else if (REG_P (op1))
8752                     /* ARITH + shift-by-register.  */
8753                     cost += extra_cost->alu.arith_shift_reg;
8754                   else if (is_extend)
8755                     /* ARITH + extended register.  We don't have a cost field
8756                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
8757                     cost += extra_cost->alu.extend_arith;
8758                   else
8759                     /* ARITH + shift-by-immediate.  */
8760                     cost += extra_cost->alu.arith_shift;
8761                 }
8762               else
8763                 /* LSL (immediate).  */
8764                 cost += extra_cost->alu.shift;
8765
8766             }
8767           /* Strip extends as we will have costed them in the case above.  */
8768           if (is_extend)
8769             op0 = aarch64_strip_extend (op0, true);
8770
8771           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8772
8773           return cost;
8774         }
8775
8776       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
8777          compound and let the below cases handle it.  After all, MNEG is a
8778          special-case alias of MSUB.  */
8779       if (GET_CODE (op0) == NEG)
8780         {
8781           op0 = XEXP (op0, 0);
8782           compound_p = true;
8783         }
8784
8785       /* Integer multiplies or FMAs have zero/sign extending variants.  */
8786       if ((GET_CODE (op0) == ZERO_EXTEND
8787            && GET_CODE (op1) == ZERO_EXTEND)
8788           || (GET_CODE (op0) == SIGN_EXTEND
8789               && GET_CODE (op1) == SIGN_EXTEND))
8790         {
8791           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8792           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8793
8794           if (speed)
8795             {
8796               if (compound_p)
8797                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
8798                 cost += extra_cost->mult[0].extend_add;
8799               else
8800                 /* MUL/SMULL/UMULL.  */
8801                 cost += extra_cost->mult[0].extend;
8802             }
8803
8804           return cost;
8805         }
8806
8807       /* This is either an integer multiply or a MADD.  In both cases
8808          we want to recurse and cost the operands.  */
8809       cost += rtx_cost (op0, mode, MULT, 0, speed);
8810       cost += rtx_cost (op1, mode, MULT, 1, speed);
8811
8812       if (speed)
8813         {
8814           if (compound_p)
8815             /* MADD/MSUB.  */
8816             cost += extra_cost->mult[mode == DImode].add;
8817           else
8818             /* MUL.  */
8819             cost += extra_cost->mult[mode == DImode].simple;
8820         }
8821
8822       return cost;
8823     }
8824   else
8825     {
8826       if (speed)
8827         {
8828           /* Floating-point FMA/FMUL can also support negations of the
8829              operands, unless the rounding mode is upward or downward in
8830              which case FNMUL is different than FMUL with operand negation.  */
8831           bool neg0 = GET_CODE (op0) == NEG;
8832           bool neg1 = GET_CODE (op1) == NEG;
8833           if (compound_p || !flag_rounding_math || (neg0 && neg1))
8834             {
8835               if (neg0)
8836                 op0 = XEXP (op0, 0);
8837               if (neg1)
8838                 op1 = XEXP (op1, 0);
8839             }
8840
8841           if (compound_p)
8842             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8843             cost += extra_cost->fp[mode == DFmode].fma;
8844           else
8845             /* FMUL/FNMUL.  */
8846             cost += extra_cost->fp[mode == DFmode].mult;
8847         }
8848
8849       cost += rtx_cost (op0, mode, MULT, 0, speed);
8850       cost += rtx_cost (op1, mode, MULT, 1, speed);
8851       return cost;
8852     }
8853 }
8854
8855 static int
8856 aarch64_address_cost (rtx x,
8857                       machine_mode mode,
8858                       addr_space_t as ATTRIBUTE_UNUSED,
8859                       bool speed)
8860 {
8861   enum rtx_code c = GET_CODE (x);
8862   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8863   struct aarch64_address_info info;
8864   int cost = 0;
8865   info.shift = 0;
8866
8867   if (!aarch64_classify_address (&info, x, mode, false))
8868     {
8869       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8870         {
8871           /* This is a CONST or SYMBOL ref which will be split
8872              in a different way depending on the code model in use.
8873              Cost it through the generic infrastructure.  */
8874           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8875           /* Divide through by the cost of one instruction to
8876              bring it to the same units as the address costs.  */
8877           cost_symbol_ref /= COSTS_N_INSNS (1);
8878           /* The cost is then the cost of preparing the address,
8879              followed by an immediate (possibly 0) offset.  */
8880           return cost_symbol_ref + addr_cost->imm_offset;
8881         }
8882       else
8883         {
8884           /* This is most likely a jump table from a case
8885              statement.  */
8886           return addr_cost->register_offset;
8887         }
8888     }
8889
8890   switch (info.type)
8891     {
8892       case ADDRESS_LO_SUM:
8893       case ADDRESS_SYMBOLIC:
8894       case ADDRESS_REG_IMM:
8895         cost += addr_cost->imm_offset;
8896         break;
8897
8898       case ADDRESS_REG_WB:
8899         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8900           cost += addr_cost->pre_modify;
8901         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8902           cost += addr_cost->post_modify;
8903         else
8904           gcc_unreachable ();
8905
8906         break;
8907
8908       case ADDRESS_REG_REG:
8909         cost += addr_cost->register_offset;
8910         break;
8911
8912       case ADDRESS_REG_SXTW:
8913         cost += addr_cost->register_sextend;
8914         break;
8915
8916       case ADDRESS_REG_UXTW:
8917         cost += addr_cost->register_zextend;
8918         break;
8919
8920       default:
8921         gcc_unreachable ();
8922     }
8923
8924
8925   if (info.shift > 0)
8926     {
8927       /* For the sake of calculating the cost of the shifted register
8928          component, we can treat same sized modes in the same way.  */
8929       if (known_eq (GET_MODE_BITSIZE (mode), 16))
8930         cost += addr_cost->addr_scale_costs.hi;
8931       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8932         cost += addr_cost->addr_scale_costs.si;
8933       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8934         cost += addr_cost->addr_scale_costs.di;
8935       else
8936         /* We can't tell, or this is a 128-bit vector.  */
8937         cost += addr_cost->addr_scale_costs.ti;
8938     }
8939
8940   return cost;
8941 }
8942
8943 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
8944    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
8945    to be taken.  */
8946
8947 int
8948 aarch64_branch_cost (bool speed_p, bool predictable_p)
8949 {
8950   /* When optimizing for speed, use the cost of unpredictable branches.  */
8951   const struct cpu_branch_cost *branch_costs =
8952     aarch64_tune_params.branch_costs;
8953
8954   if (!speed_p || predictable_p)
8955     return branch_costs->predictable;
8956   else
8957     return branch_costs->unpredictable;
8958 }
8959
8960 /* Return true if the RTX X in mode MODE is a zero or sign extract
8961    usable in an ADD or SUB (extended register) instruction.  */
8962 static bool
8963 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8964 {
8965   /* Catch add with a sign extract.
8966      This is add_<optab><mode>_multp2.  */
8967   if (GET_CODE (x) == SIGN_EXTRACT
8968       || GET_CODE (x) == ZERO_EXTRACT)
8969     {
8970       rtx op0 = XEXP (x, 0);
8971       rtx op1 = XEXP (x, 1);
8972       rtx op2 = XEXP (x, 2);
8973
8974       if (GET_CODE (op0) == MULT
8975           && CONST_INT_P (op1)
8976           && op2 == const0_rtx
8977           && CONST_INT_P (XEXP (op0, 1))
8978           && aarch64_is_extend_from_extract (mode,
8979                                              XEXP (op0, 1),
8980                                              op1))
8981         {
8982           return true;
8983         }
8984     }
8985   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8986      No shift.  */
8987   else if (GET_CODE (x) == SIGN_EXTEND
8988            || GET_CODE (x) == ZERO_EXTEND)
8989     return REG_P (XEXP (x, 0));
8990
8991   return false;
8992 }
8993
8994 static bool
8995 aarch64_frint_unspec_p (unsigned int u)
8996 {
8997   switch (u)
8998     {
8999       case UNSPEC_FRINTZ:
9000       case UNSPEC_FRINTP:
9001       case UNSPEC_FRINTM:
9002       case UNSPEC_FRINTA:
9003       case UNSPEC_FRINTN:
9004       case UNSPEC_FRINTX:
9005       case UNSPEC_FRINTI:
9006         return true;
9007
9008       default:
9009         return false;
9010     }
9011 }
9012
9013 /* Return true iff X is an rtx that will match an extr instruction
9014    i.e. as described in the *extr<mode>5_insn family of patterns.
9015    OP0 and OP1 will be set to the operands of the shifts involved
9016    on success and will be NULL_RTX otherwise.  */
9017
9018 static bool
9019 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9020 {
9021   rtx op0, op1;
9022   scalar_int_mode mode;
9023   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9024     return false;
9025
9026   *res_op0 = NULL_RTX;
9027   *res_op1 = NULL_RTX;
9028
9029   if (GET_CODE (x) != IOR)
9030     return false;
9031
9032   op0 = XEXP (x, 0);
9033   op1 = XEXP (x, 1);
9034
9035   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9036       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9037     {
9038      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
9039       if (GET_CODE (op1) == ASHIFT)
9040         std::swap (op0, op1);
9041
9042       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9043         return false;
9044
9045       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9046       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9047
9048       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9049           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9050         {
9051           *res_op0 = XEXP (op0, 0);
9052           *res_op1 = XEXP (op1, 0);
9053           return true;
9054         }
9055     }
9056
9057   return false;
9058 }
9059
9060 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9061    storing it in *COST.  Result is true if the total cost of the operation
9062    has now been calculated.  */
9063 static bool
9064 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9065 {
9066   rtx inner;
9067   rtx comparator;
9068   enum rtx_code cmpcode;
9069
9070   if (COMPARISON_P (op0))
9071     {
9072       inner = XEXP (op0, 0);
9073       comparator = XEXP (op0, 1);
9074       cmpcode = GET_CODE (op0);
9075     }
9076   else
9077     {
9078       inner = op0;
9079       comparator = const0_rtx;
9080       cmpcode = NE;
9081     }
9082
9083   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9084     {
9085       /* Conditional branch.  */
9086       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9087         return true;
9088       else
9089         {
9090           if (cmpcode == NE || cmpcode == EQ)
9091             {
9092               if (comparator == const0_rtx)
9093                 {
9094                   /* TBZ/TBNZ/CBZ/CBNZ.  */
9095                   if (GET_CODE (inner) == ZERO_EXTRACT)
9096                     /* TBZ/TBNZ.  */
9097                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9098                                        ZERO_EXTRACT, 0, speed);
9099                   else
9100                     /* CBZ/CBNZ.  */
9101                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9102
9103                 return true;
9104               }
9105             }
9106           else if (cmpcode == LT || cmpcode == GE)
9107             {
9108               /* TBZ/TBNZ.  */
9109               if (comparator == const0_rtx)
9110                 return true;
9111             }
9112         }
9113     }
9114   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9115     {
9116       /* CCMP.  */
9117       if (GET_CODE (op1) == COMPARE)
9118         {
9119           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
9120           if (XEXP (op1, 1) == const0_rtx)
9121             *cost += 1;
9122           if (speed)
9123             {
9124               machine_mode mode = GET_MODE (XEXP (op1, 0));
9125               const struct cpu_cost_table *extra_cost
9126                 = aarch64_tune_params.insn_extra_cost;
9127
9128               if (GET_MODE_CLASS (mode) == MODE_INT)
9129                 *cost += extra_cost->alu.arith;
9130               else
9131                 *cost += extra_cost->fp[mode == DFmode].compare;
9132             }
9133           return true;
9134         }
9135
9136       /* It's a conditional operation based on the status flags,
9137          so it must be some flavor of CSEL.  */
9138
9139       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
9140       if (GET_CODE (op1) == NEG
9141           || GET_CODE (op1) == NOT
9142           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9143         op1 = XEXP (op1, 0);
9144       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9145         {
9146           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
9147           op1 = XEXP (op1, 0);
9148           op2 = XEXP (op2, 0);
9149         }
9150
9151       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9152       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9153       return true;
9154     }
9155
9156   /* We don't know what this is, cost all operands.  */
9157   return false;
9158 }
9159
9160 /* Check whether X is a bitfield operation of the form shift + extend that
9161    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
9162    operand to which the bitfield operation is applied.  Otherwise return
9163    NULL_RTX.  */
9164
9165 static rtx
9166 aarch64_extend_bitfield_pattern_p (rtx x)
9167 {
9168   rtx_code outer_code = GET_CODE (x);
9169   machine_mode outer_mode = GET_MODE (x);
9170
9171   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9172       && outer_mode != SImode && outer_mode != DImode)
9173     return NULL_RTX;
9174
9175   rtx inner = XEXP (x, 0);
9176   rtx_code inner_code = GET_CODE (inner);
9177   machine_mode inner_mode = GET_MODE (inner);
9178   rtx op = NULL_RTX;
9179
9180   switch (inner_code)
9181     {
9182       case ASHIFT:
9183         if (CONST_INT_P (XEXP (inner, 1))
9184             && (inner_mode == QImode || inner_mode == HImode))
9185           op = XEXP (inner, 0);
9186         break;
9187       case LSHIFTRT:
9188         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9189             && (inner_mode == QImode || inner_mode == HImode))
9190           op = XEXP (inner, 0);
9191         break;
9192       case ASHIFTRT:
9193         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9194             && (inner_mode == QImode || inner_mode == HImode))
9195           op = XEXP (inner, 0);
9196         break;
9197       default:
9198         break;
9199     }
9200
9201   return op;
9202 }
9203
9204 /* Return true if the mask and a shift amount from an RTX of the form
9205    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9206    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
9207
9208 bool
9209 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9210                                     rtx shft_amnt)
9211 {
9212   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9213          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9214          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9215          && (INTVAL (mask)
9216              & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9217 }
9218
9219 /* Calculate the cost of calculating X, storing it in *COST.  Result
9220    is true if the total cost of the operation has now been calculated.  */
9221 static bool
9222 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9223                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9224 {
9225   rtx op0, op1, op2;
9226   const struct cpu_cost_table *extra_cost
9227     = aarch64_tune_params.insn_extra_cost;
9228   int code = GET_CODE (x);
9229   scalar_int_mode int_mode;
9230
9231   /* By default, assume that everything has equivalent cost to the
9232      cheapest instruction.  Any additional costs are applied as a delta
9233      above this default.  */
9234   *cost = COSTS_N_INSNS (1);
9235
9236   switch (code)
9237     {
9238     case SET:
9239       /* The cost depends entirely on the operands to SET.  */
9240       *cost = 0;
9241       op0 = SET_DEST (x);
9242       op1 = SET_SRC (x);
9243
9244       switch (GET_CODE (op0))
9245         {
9246         case MEM:
9247           if (speed)
9248             {
9249               rtx address = XEXP (op0, 0);
9250               if (VECTOR_MODE_P (mode))
9251                 *cost += extra_cost->ldst.storev;
9252               else if (GET_MODE_CLASS (mode) == MODE_INT)
9253                 *cost += extra_cost->ldst.store;
9254               else if (mode == SFmode)
9255                 *cost += extra_cost->ldst.storef;
9256               else if (mode == DFmode)
9257                 *cost += extra_cost->ldst.stored;
9258
9259               *cost +=
9260                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9261                                                      0, speed));
9262             }
9263
9264           *cost += rtx_cost (op1, mode, SET, 1, speed);
9265           return true;
9266
9267         case SUBREG:
9268           if (! REG_P (SUBREG_REG (op0)))
9269             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9270
9271           /* Fall through.  */
9272         case REG:
9273           /* The cost is one per vector-register copied.  */
9274           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9275             {
9276               int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9277               *cost = COSTS_N_INSNS (nregs);
9278             }
9279           /* const0_rtx is in general free, but we will use an
9280              instruction to set a register to 0.  */
9281           else if (REG_P (op1) || op1 == const0_rtx)
9282             {
9283               /* The cost is 1 per register copied.  */
9284               int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9285               *cost = COSTS_N_INSNS (nregs);
9286             }
9287           else
9288             /* Cost is just the cost of the RHS of the set.  */
9289             *cost += rtx_cost (op1, mode, SET, 1, speed);
9290           return true;
9291
9292         case ZERO_EXTRACT:
9293         case SIGN_EXTRACT:
9294           /* Bit-field insertion.  Strip any redundant widening of
9295              the RHS to meet the width of the target.  */
9296           if (GET_CODE (op1) == SUBREG)
9297             op1 = SUBREG_REG (op1);
9298           if ((GET_CODE (op1) == ZERO_EXTEND
9299                || GET_CODE (op1) == SIGN_EXTEND)
9300               && CONST_INT_P (XEXP (op0, 1))
9301               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9302               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9303             op1 = XEXP (op1, 0);
9304
9305           if (CONST_INT_P (op1))
9306             {
9307               /* MOV immediate is assumed to always be cheap.  */
9308               *cost = COSTS_N_INSNS (1);
9309             }
9310           else
9311             {
9312               /* BFM.  */
9313               if (speed)
9314                 *cost += extra_cost->alu.bfi;
9315               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9316             }
9317
9318           return true;
9319
9320         default:
9321           /* We can't make sense of this, assume default cost.  */
9322           *cost = COSTS_N_INSNS (1);
9323           return false;
9324         }
9325       return false;
9326
9327     case CONST_INT:
9328       /* If an instruction can incorporate a constant within the
9329          instruction, the instruction's expression avoids calling
9330          rtx_cost() on the constant.  If rtx_cost() is called on a
9331          constant, then it is usually because the constant must be
9332          moved into a register by one or more instructions.
9333
9334          The exception is constant 0, which can be expressed
9335          as XZR/WZR and is therefore free.  The exception to this is
9336          if we have (set (reg) (const0_rtx)) in which case we must cost
9337          the move.  However, we can catch that when we cost the SET, so
9338          we don't need to consider that here.  */
9339       if (x == const0_rtx)
9340         *cost = 0;
9341       else
9342         {
9343           /* To an approximation, building any other constant is
9344              proportionally expensive to the number of instructions
9345              required to build that constant.  This is true whether we
9346              are compiling for SPEED or otherwise.  */
9347           if (!is_a <scalar_int_mode> (mode, &int_mode))
9348             int_mode = word_mode;
9349           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9350                                  (NULL_RTX, x, false, int_mode));
9351         }
9352       return true;
9353
9354     case CONST_DOUBLE:
9355
9356       /* First determine number of instructions to do the move
9357           as an integer constant.  */
9358       if (!aarch64_float_const_representable_p (x)
9359            && !aarch64_can_const_movi_rtx_p (x, mode)
9360            && aarch64_float_const_rtx_p (x))
9361         {
9362           unsigned HOST_WIDE_INT ival;
9363           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9364           gcc_assert (succeed);
9365
9366           scalar_int_mode imode = (mode == HFmode
9367                                    ? SImode
9368                                    : int_mode_for_mode (mode).require ());
9369           int ncost = aarch64_internal_mov_immediate
9370                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9371           *cost += COSTS_N_INSNS (ncost);
9372           return true;
9373         }
9374
9375       if (speed)
9376         {
9377           /* mov[df,sf]_aarch64.  */
9378           if (aarch64_float_const_representable_p (x))
9379             /* FMOV (scalar immediate).  */
9380             *cost += extra_cost->fp[mode == DFmode].fpconst;
9381           else if (!aarch64_float_const_zero_rtx_p (x))
9382             {
9383               /* This will be a load from memory.  */
9384               if (mode == DFmode)
9385                 *cost += extra_cost->ldst.loadd;
9386               else
9387                 *cost += extra_cost->ldst.loadf;
9388             }
9389           else
9390             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
9391                or MOV v0.s[0], wzr - neither of which are modeled by the
9392                cost tables.  Just use the default cost.  */
9393             {
9394             }
9395         }
9396
9397       return true;
9398
9399     case MEM:
9400       if (speed)
9401         {
9402           /* For loads we want the base cost of a load, plus an
9403              approximation for the additional cost of the addressing
9404              mode.  */
9405           rtx address = XEXP (x, 0);
9406           if (VECTOR_MODE_P (mode))
9407             *cost += extra_cost->ldst.loadv;
9408           else if (GET_MODE_CLASS (mode) == MODE_INT)
9409             *cost += extra_cost->ldst.load;
9410           else if (mode == SFmode)
9411             *cost += extra_cost->ldst.loadf;
9412           else if (mode == DFmode)
9413             *cost += extra_cost->ldst.loadd;
9414
9415           *cost +=
9416                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9417                                                      0, speed));
9418         }
9419
9420       return true;
9421
9422     case NEG:
9423       op0 = XEXP (x, 0);
9424
9425       if (VECTOR_MODE_P (mode))
9426         {
9427           if (speed)
9428             {
9429               /* FNEG.  */
9430               *cost += extra_cost->vect.alu;
9431             }
9432           return false;
9433         }
9434
9435       if (GET_MODE_CLASS (mode) == MODE_INT)
9436         {
9437           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9438               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9439             {
9440               /* CSETM.  */
9441               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9442               return true;
9443             }
9444
9445           /* Cost this as SUB wzr, X.  */
9446           op0 = CONST0_RTX (mode);
9447           op1 = XEXP (x, 0);
9448           goto cost_minus;
9449         }
9450
9451       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9452         {
9453           /* Support (neg(fma...)) as a single instruction only if
9454              sign of zeros is unimportant.  This matches the decision
9455              making in aarch64.md.  */
9456           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9457             {
9458               /* FNMADD.  */
9459               *cost = rtx_cost (op0, mode, NEG, 0, speed);
9460               return true;
9461             }
9462           if (GET_CODE (op0) == MULT)
9463             {
9464               /* FNMUL.  */
9465               *cost = rtx_cost (op0, mode, NEG, 0, speed);
9466               return true;
9467             }
9468           if (speed)
9469             /* FNEG.  */
9470             *cost += extra_cost->fp[mode == DFmode].neg;
9471           return false;
9472         }
9473
9474       return false;
9475
9476     case CLRSB:
9477     case CLZ:
9478       if (speed)
9479         {
9480           if (VECTOR_MODE_P (mode))
9481             *cost += extra_cost->vect.alu;
9482           else
9483             *cost += extra_cost->alu.clz;
9484         }
9485
9486       return false;
9487
9488     case COMPARE:
9489       op0 = XEXP (x, 0);
9490       op1 = XEXP (x, 1);
9491
9492       if (op1 == const0_rtx
9493           && GET_CODE (op0) == AND)
9494         {
9495           x = op0;
9496           mode = GET_MODE (op0);
9497           goto cost_logic;
9498         }
9499
9500       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9501         {
9502           /* TODO: A write to the CC flags possibly costs extra, this
9503              needs encoding in the cost tables.  */
9504
9505           mode = GET_MODE (op0);
9506           /* ANDS.  */
9507           if (GET_CODE (op0) == AND)
9508             {
9509               x = op0;
9510               goto cost_logic;
9511             }
9512
9513           if (GET_CODE (op0) == PLUS)
9514             {
9515               /* ADDS (and CMN alias).  */
9516               x = op0;
9517               goto cost_plus;
9518             }
9519
9520           if (GET_CODE (op0) == MINUS)
9521             {
9522               /* SUBS.  */
9523               x = op0;
9524               goto cost_minus;
9525             }
9526
9527           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9528               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9529               && CONST_INT_P (XEXP (op0, 2)))
9530             {
9531               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9532                  Handle it here directly rather than going to cost_logic
9533                  since we know the immediate generated for the TST is valid
9534                  so we can avoid creating an intermediate rtx for it only
9535                  for costing purposes.  */
9536               if (speed)
9537                 *cost += extra_cost->alu.logical;
9538
9539               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9540                                  ZERO_EXTRACT, 0, speed);
9541               return true;
9542             }
9543
9544           if (GET_CODE (op1) == NEG)
9545             {
9546               /* CMN.  */
9547               if (speed)
9548                 *cost += extra_cost->alu.arith;
9549
9550               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9551               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9552               return true;
9553             }
9554
9555           /* CMP.
9556
9557              Compare can freely swap the order of operands, and
9558              canonicalization puts the more complex operation first.
9559              But the integer MINUS logic expects the shift/extend
9560              operation in op1.  */
9561           if (! (REG_P (op0)
9562                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9563           {
9564             op0 = XEXP (x, 1);
9565             op1 = XEXP (x, 0);
9566           }
9567           goto cost_minus;
9568         }
9569
9570       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9571         {
9572           /* FCMP.  */
9573           if (speed)
9574             *cost += extra_cost->fp[mode == DFmode].compare;
9575
9576           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9577             {
9578               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9579               /* FCMP supports constant 0.0 for no extra cost. */
9580               return true;
9581             }
9582           return false;
9583         }
9584
9585       if (VECTOR_MODE_P (mode))
9586         {
9587           /* Vector compare.  */
9588           if (speed)
9589             *cost += extra_cost->vect.alu;
9590
9591           if (aarch64_float_const_zero_rtx_p (op1))
9592             {
9593               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9594                  cost.  */
9595               return true;
9596             }
9597           return false;
9598         }
9599       return false;
9600
9601     case MINUS:
9602       {
9603         op0 = XEXP (x, 0);
9604         op1 = XEXP (x, 1);
9605
9606 cost_minus:
9607         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9608
9609         /* Detect valid immediates.  */
9610         if ((GET_MODE_CLASS (mode) == MODE_INT
9611              || (GET_MODE_CLASS (mode) == MODE_CC
9612                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9613             && CONST_INT_P (op1)
9614             && aarch64_uimm12_shift (INTVAL (op1)))
9615           {
9616             if (speed)
9617               /* SUB(S) (immediate).  */
9618               *cost += extra_cost->alu.arith;
9619             return true;
9620           }
9621
9622         /* Look for SUB (extended register).  */
9623         if (is_a <scalar_int_mode> (mode, &int_mode)
9624             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9625           {
9626             if (speed)
9627               *cost += extra_cost->alu.extend_arith;
9628
9629             op1 = aarch64_strip_extend (op1, true);
9630             *cost += rtx_cost (op1, VOIDmode,
9631                                (enum rtx_code) GET_CODE (op1), 0, speed);
9632             return true;
9633           }
9634
9635         rtx new_op1 = aarch64_strip_extend (op1, false);
9636
9637         /* Cost this as an FMA-alike operation.  */
9638         if ((GET_CODE (new_op1) == MULT
9639              || aarch64_shift_p (GET_CODE (new_op1)))
9640             && code != COMPARE)
9641           {
9642             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9643                                             (enum rtx_code) code,
9644                                             speed);
9645             return true;
9646           }
9647
9648         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9649
9650         if (speed)
9651           {
9652             if (VECTOR_MODE_P (mode))
9653               {
9654                 /* Vector SUB.  */
9655                 *cost += extra_cost->vect.alu;
9656               }
9657             else if (GET_MODE_CLASS (mode) == MODE_INT)
9658               {
9659                 /* SUB(S).  */
9660                 *cost += extra_cost->alu.arith;
9661               }
9662             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9663               {
9664                 /* FSUB.  */
9665                 *cost += extra_cost->fp[mode == DFmode].addsub;
9666               }
9667           }
9668         return true;
9669       }
9670
9671     case PLUS:
9672       {
9673         rtx new_op0;
9674
9675         op0 = XEXP (x, 0);
9676         op1 = XEXP (x, 1);
9677
9678 cost_plus:
9679         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9680             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9681           {
9682             /* CSINC.  */
9683             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9684             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9685             return true;
9686           }
9687
9688         if (GET_MODE_CLASS (mode) == MODE_INT
9689             && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9690                 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9691           {
9692             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9693
9694             if (speed)
9695               /* ADD (immediate).  */
9696               *cost += extra_cost->alu.arith;
9697             return true;
9698           }
9699
9700         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9701
9702         /* Look for ADD (extended register).  */
9703         if (is_a <scalar_int_mode> (mode, &int_mode)
9704             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9705           {
9706             if (speed)
9707               *cost += extra_cost->alu.extend_arith;
9708
9709             op0 = aarch64_strip_extend (op0, true);
9710             *cost += rtx_cost (op0, VOIDmode,
9711                                (enum rtx_code) GET_CODE (op0), 0, speed);
9712             return true;
9713           }
9714
9715         /* Strip any extend, leave shifts behind as we will
9716            cost them through mult_cost.  */
9717         new_op0 = aarch64_strip_extend (op0, false);
9718
9719         if (GET_CODE (new_op0) == MULT
9720             || aarch64_shift_p (GET_CODE (new_op0)))
9721           {
9722             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9723                                             speed);
9724             return true;
9725           }
9726
9727         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9728
9729         if (speed)
9730           {
9731             if (VECTOR_MODE_P (mode))
9732               {
9733                 /* Vector ADD.  */
9734                 *cost += extra_cost->vect.alu;
9735               }
9736             else if (GET_MODE_CLASS (mode) == MODE_INT)
9737               {
9738                 /* ADD.  */
9739                 *cost += extra_cost->alu.arith;
9740               }
9741             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9742               {
9743                 /* FADD.  */
9744                 *cost += extra_cost->fp[mode == DFmode].addsub;
9745               }
9746           }
9747         return true;
9748       }
9749
9750     case BSWAP:
9751       *cost = COSTS_N_INSNS (1);
9752
9753       if (speed)
9754         {
9755           if (VECTOR_MODE_P (mode))
9756             *cost += extra_cost->vect.alu;
9757           else
9758             *cost += extra_cost->alu.rev;
9759         }
9760       return false;
9761
9762     case IOR:
9763       if (aarch_rev16_p (x))
9764         {
9765           *cost = COSTS_N_INSNS (1);
9766
9767           if (speed)
9768             {
9769               if (VECTOR_MODE_P (mode))
9770                 *cost += extra_cost->vect.alu;
9771               else
9772                 *cost += extra_cost->alu.rev;
9773             }
9774           return true;
9775         }
9776
9777       if (aarch64_extr_rtx_p (x, &op0, &op1))
9778         {
9779           *cost += rtx_cost (op0, mode, IOR, 0, speed);
9780           *cost += rtx_cost (op1, mode, IOR, 1, speed);
9781           if (speed)
9782             *cost += extra_cost->alu.shift;
9783
9784           return true;
9785         }
9786     /* Fall through.  */
9787     case XOR:
9788     case AND:
9789     cost_logic:
9790       op0 = XEXP (x, 0);
9791       op1 = XEXP (x, 1);
9792
9793       if (VECTOR_MODE_P (mode))
9794         {
9795           if (speed)
9796             *cost += extra_cost->vect.alu;
9797           return true;
9798         }
9799
9800       if (code == AND
9801           && GET_CODE (op0) == MULT
9802           && CONST_INT_P (XEXP (op0, 1))
9803           && CONST_INT_P (op1)
9804           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9805                                INTVAL (op1)) != 0)
9806         {
9807           /* This is a UBFM/SBFM.  */
9808           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9809           if (speed)
9810             *cost += extra_cost->alu.bfx;
9811           return true;
9812         }
9813
9814       if (is_int_mode (mode, &int_mode))
9815         {
9816           if (CONST_INT_P (op1))
9817             {
9818               /* We have a mask + shift version of a UBFIZ
9819                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
9820               if (GET_CODE (op0) == ASHIFT
9821                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9822                                                          XEXP (op0, 1)))
9823                 {
9824                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
9825                                      (enum rtx_code) code, 0, speed);
9826                   if (speed)
9827                     *cost += extra_cost->alu.bfx;
9828
9829                   return true;
9830                 }
9831               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9832                 {
9833                 /* We possibly get the immediate for free, this is not
9834                    modelled.  */
9835                   *cost += rtx_cost (op0, int_mode,
9836                                      (enum rtx_code) code, 0, speed);
9837                   if (speed)
9838                     *cost += extra_cost->alu.logical;
9839
9840                   return true;
9841                 }
9842             }
9843           else
9844             {
9845               rtx new_op0 = op0;
9846
9847               /* Handle ORN, EON, or BIC.  */
9848               if (GET_CODE (op0) == NOT)
9849                 op0 = XEXP (op0, 0);
9850
9851               new_op0 = aarch64_strip_shift (op0);
9852
9853               /* If we had a shift on op0 then this is a logical-shift-
9854                  by-register/immediate operation.  Otherwise, this is just
9855                  a logical operation.  */
9856               if (speed)
9857                 {
9858                   if (new_op0 != op0)
9859                     {
9860                       /* Shift by immediate.  */
9861                       if (CONST_INT_P (XEXP (op0, 1)))
9862                         *cost += extra_cost->alu.log_shift;
9863                       else
9864                         *cost += extra_cost->alu.log_shift_reg;
9865                     }
9866                   else
9867                     *cost += extra_cost->alu.logical;
9868                 }
9869
9870               /* In both cases we want to cost both operands.  */
9871               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9872                                  0, speed);
9873               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9874                                  1, speed);
9875
9876               return true;
9877             }
9878         }
9879       return false;
9880
9881     case NOT:
9882       x = XEXP (x, 0);
9883       op0 = aarch64_strip_shift (x);
9884
9885       if (VECTOR_MODE_P (mode))
9886         {
9887           /* Vector NOT.  */
9888           *cost += extra_cost->vect.alu;
9889           return false;
9890         }
9891
9892       /* MVN-shifted-reg.  */
9893       if (op0 != x)
9894         {
9895           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9896
9897           if (speed)
9898             *cost += extra_cost->alu.log_shift;
9899
9900           return true;
9901         }
9902       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9903          Handle the second form here taking care that 'a' in the above can
9904          be a shift.  */
9905       else if (GET_CODE (op0) == XOR)
9906         {
9907           rtx newop0 = XEXP (op0, 0);
9908           rtx newop1 = XEXP (op0, 1);
9909           rtx op0_stripped = aarch64_strip_shift (newop0);
9910
9911           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9912           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9913
9914           if (speed)
9915             {
9916               if (op0_stripped != newop0)
9917                 *cost += extra_cost->alu.log_shift;
9918               else
9919                 *cost += extra_cost->alu.logical;
9920             }
9921
9922           return true;
9923         }
9924       /* MVN.  */
9925       if (speed)
9926         *cost += extra_cost->alu.logical;
9927
9928       return false;
9929
9930     case ZERO_EXTEND:
9931
9932       op0 = XEXP (x, 0);
9933       /* If a value is written in SI mode, then zero extended to DI
9934          mode, the operation will in general be free as a write to
9935          a 'w' register implicitly zeroes the upper bits of an 'x'
9936          register.  However, if this is
9937
9938            (set (reg) (zero_extend (reg)))
9939
9940          we must cost the explicit register move.  */
9941       if (mode == DImode
9942           && GET_MODE (op0) == SImode
9943           && outer == SET)
9944         {
9945           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9946
9947         /* If OP_COST is non-zero, then the cost of the zero extend
9948            is effectively the cost of the inner operation.  Otherwise
9949            we have a MOV instruction and we take the cost from the MOV
9950            itself.  This is true independently of whether we are
9951            optimizing for space or time.  */
9952           if (op_cost)
9953             *cost = op_cost;
9954
9955           return true;
9956         }
9957       else if (MEM_P (op0))
9958         {
9959           /* All loads can zero extend to any size for free.  */
9960           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9961           return true;
9962         }
9963
9964       op0 = aarch64_extend_bitfield_pattern_p (x);
9965       if (op0)
9966         {
9967           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9968           if (speed)
9969             *cost += extra_cost->alu.bfx;
9970           return true;
9971         }
9972
9973       if (speed)
9974         {
9975           if (VECTOR_MODE_P (mode))
9976             {
9977               /* UMOV.  */
9978               *cost += extra_cost->vect.alu;
9979             }
9980           else
9981             {
9982               /* We generate an AND instead of UXTB/UXTH.  */
9983               *cost += extra_cost->alu.logical;
9984             }
9985         }
9986       return false;
9987
9988     case SIGN_EXTEND:
9989       if (MEM_P (XEXP (x, 0)))
9990         {
9991           /* LDRSH.  */
9992           if (speed)
9993             {
9994               rtx address = XEXP (XEXP (x, 0), 0);
9995               *cost += extra_cost->ldst.load_sign_extend;
9996
9997               *cost +=
9998                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9999                                                      0, speed));
10000             }
10001           return true;
10002         }
10003
10004       op0 = aarch64_extend_bitfield_pattern_p (x);
10005       if (op0)
10006         {
10007           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10008           if (speed)
10009             *cost += extra_cost->alu.bfx;
10010           return true;
10011         }
10012
10013       if (speed)
10014         {
10015           if (VECTOR_MODE_P (mode))
10016             *cost += extra_cost->vect.alu;
10017           else
10018             *cost += extra_cost->alu.extend;
10019         }
10020       return false;
10021
10022     case ASHIFT:
10023       op0 = XEXP (x, 0);
10024       op1 = XEXP (x, 1);
10025
10026       if (CONST_INT_P (op1))
10027         {
10028           if (speed)
10029             {
10030               if (VECTOR_MODE_P (mode))
10031                 {
10032                   /* Vector shift (immediate).  */
10033                   *cost += extra_cost->vect.alu;
10034                 }
10035               else
10036                 {
10037                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
10038                      aliases.  */
10039                   *cost += extra_cost->alu.shift;
10040                 }
10041             }
10042
10043           /* We can incorporate zero/sign extend for free.  */
10044           if (GET_CODE (op0) == ZERO_EXTEND
10045               || GET_CODE (op0) == SIGN_EXTEND)
10046             op0 = XEXP (op0, 0);
10047
10048           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10049           return true;
10050         }
10051       else
10052         {
10053           if (VECTOR_MODE_P (mode))
10054             {
10055               if (speed)
10056                 /* Vector shift (register).  */
10057                 *cost += extra_cost->vect.alu;
10058             }
10059           else
10060             {
10061               if (speed)
10062                 /* LSLV.  */
10063                 *cost += extra_cost->alu.shift_reg;
10064
10065               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10066                   && CONST_INT_P (XEXP (op1, 1))
10067                   && known_eq (INTVAL (XEXP (op1, 1)),
10068                                GET_MODE_BITSIZE (mode) - 1))
10069                 {
10070                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10071                   /* We already demanded XEXP (op1, 0) to be REG_P, so
10072                      don't recurse into it.  */
10073                   return true;
10074                 }
10075             }
10076           return false;  /* All arguments need to be in registers.  */
10077         }
10078
10079     case ROTATE:
10080     case ROTATERT:
10081     case LSHIFTRT:
10082     case ASHIFTRT:
10083       op0 = XEXP (x, 0);
10084       op1 = XEXP (x, 1);
10085
10086       if (CONST_INT_P (op1))
10087         {
10088           /* ASR (immediate) and friends.  */
10089           if (speed)
10090             {
10091               if (VECTOR_MODE_P (mode))
10092                 *cost += extra_cost->vect.alu;
10093               else
10094                 *cost += extra_cost->alu.shift;
10095             }
10096
10097           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10098           return true;
10099         }
10100       else
10101         {
10102           if (VECTOR_MODE_P (mode))
10103             {
10104               if (speed)
10105                 /* Vector shift (register).  */
10106                 *cost += extra_cost->vect.alu;
10107             }
10108           else
10109             {
10110               if (speed)
10111                 /* ASR (register) and friends.  */
10112                 *cost += extra_cost->alu.shift_reg;
10113
10114               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10115                   && CONST_INT_P (XEXP (op1, 1))
10116                   && known_eq (INTVAL (XEXP (op1, 1)),
10117                                GET_MODE_BITSIZE (mode) - 1))
10118                 {
10119                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10120                   /* We already demanded XEXP (op1, 0) to be REG_P, so
10121                      don't recurse into it.  */
10122                   return true;
10123                 }
10124             }
10125           return false;  /* All arguments need to be in registers.  */
10126         }
10127
10128     case SYMBOL_REF:
10129
10130       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10131           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10132         {
10133           /* LDR.  */
10134           if (speed)
10135             *cost += extra_cost->ldst.load;
10136         }
10137       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10138                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10139         {
10140           /* ADRP, followed by ADD.  */
10141           *cost += COSTS_N_INSNS (1);
10142           if (speed)
10143             *cost += 2 * extra_cost->alu.arith;
10144         }
10145       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10146                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10147         {
10148           /* ADR.  */
10149           if (speed)
10150             *cost += extra_cost->alu.arith;
10151         }
10152
10153       if (flag_pic)
10154         {
10155           /* One extra load instruction, after accessing the GOT.  */
10156           *cost += COSTS_N_INSNS (1);
10157           if (speed)
10158             *cost += extra_cost->ldst.load;
10159         }
10160       return true;
10161
10162     case HIGH:
10163     case LO_SUM:
10164       /* ADRP/ADD (immediate).  */
10165       if (speed)
10166         *cost += extra_cost->alu.arith;
10167       return true;
10168
10169     case ZERO_EXTRACT:
10170     case SIGN_EXTRACT:
10171       /* UBFX/SBFX.  */
10172       if (speed)
10173         {
10174           if (VECTOR_MODE_P (mode))
10175             *cost += extra_cost->vect.alu;
10176           else
10177             *cost += extra_cost->alu.bfx;
10178         }
10179
10180       /* We can trust that the immediates used will be correct (there
10181          are no by-register forms), so we need only cost op0.  */
10182       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10183       return true;
10184
10185     case MULT:
10186       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10187       /* aarch64_rtx_mult_cost always handles recursion to its
10188          operands.  */
10189       return true;
10190
10191     case MOD:
10192     /* We can expand signed mod by power of 2 using a NEGS, two parallel
10193        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
10194        an unconditional negate.  This case should only ever be reached through
10195        the set_smod_pow2_cheap check in expmed.c.  */
10196       if (CONST_INT_P (XEXP (x, 1))
10197           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10198           && (mode == SImode || mode == DImode))
10199         {
10200           /* We expand to 4 instructions.  Reset the baseline.  */
10201           *cost = COSTS_N_INSNS (4);
10202
10203           if (speed)
10204             *cost += 2 * extra_cost->alu.logical
10205                      + 2 * extra_cost->alu.arith;
10206
10207           return true;
10208         }
10209
10210     /* Fall-through.  */
10211     case UMOD:
10212       if (speed)
10213         {
10214           /* Slighly prefer UMOD over SMOD.  */
10215           if (VECTOR_MODE_P (mode))
10216             *cost += extra_cost->vect.alu;
10217           else if (GET_MODE_CLASS (mode) == MODE_INT)
10218             *cost += (extra_cost->mult[mode == DImode].add
10219                       + extra_cost->mult[mode == DImode].idiv
10220                       + (code == MOD ? 1 : 0));
10221         }
10222       return false;  /* All arguments need to be in registers.  */
10223
10224     case DIV:
10225     case UDIV:
10226     case SQRT:
10227       if (speed)
10228         {
10229           if (VECTOR_MODE_P (mode))
10230             *cost += extra_cost->vect.alu;
10231           else if (GET_MODE_CLASS (mode) == MODE_INT)
10232             /* There is no integer SQRT, so only DIV and UDIV can get
10233                here.  */
10234             *cost += (extra_cost->mult[mode == DImode].idiv
10235                      /* Slighly prefer UDIV over SDIV.  */
10236                      + (code == DIV ? 1 : 0));
10237           else
10238             *cost += extra_cost->fp[mode == DFmode].div;
10239         }
10240       return false;  /* All arguments need to be in registers.  */
10241
10242     case IF_THEN_ELSE:
10243       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10244                                          XEXP (x, 2), cost, speed);
10245
10246     case EQ:
10247     case NE:
10248     case GT:
10249     case GTU:
10250     case LT:
10251     case LTU:
10252     case GE:
10253     case GEU:
10254     case LE:
10255     case LEU:
10256
10257       return false; /* All arguments must be in registers.  */
10258
10259     case FMA:
10260       op0 = XEXP (x, 0);
10261       op1 = XEXP (x, 1);
10262       op2 = XEXP (x, 2);
10263
10264       if (speed)
10265         {
10266           if (VECTOR_MODE_P (mode))
10267             *cost += extra_cost->vect.alu;
10268           else
10269             *cost += extra_cost->fp[mode == DFmode].fma;
10270         }
10271
10272       /* FMSUB, FNMADD, and FNMSUB are free.  */
10273       if (GET_CODE (op0) == NEG)
10274         op0 = XEXP (op0, 0);
10275
10276       if (GET_CODE (op2) == NEG)
10277         op2 = XEXP (op2, 0);
10278
10279       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10280          and the by-element operand as operand 0.  */
10281       if (GET_CODE (op1) == NEG)
10282         op1 = XEXP (op1, 0);
10283
10284       /* Catch vector-by-element operations.  The by-element operand can
10285          either be (vec_duplicate (vec_select (x))) or just
10286          (vec_select (x)), depending on whether we are multiplying by
10287          a vector or a scalar.
10288
10289          Canonicalization is not very good in these cases, FMA4 will put the
10290          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
10291       if (GET_CODE (op0) == VEC_DUPLICATE)
10292         op0 = XEXP (op0, 0);
10293       else if (GET_CODE (op1) == VEC_DUPLICATE)
10294         op1 = XEXP (op1, 0);
10295
10296       if (GET_CODE (op0) == VEC_SELECT)
10297         op0 = XEXP (op0, 0);
10298       else if (GET_CODE (op1) == VEC_SELECT)
10299         op1 = XEXP (op1, 0);
10300
10301       /* If the remaining parameters are not registers,
10302          get the cost to put them into registers.  */
10303       *cost += rtx_cost (op0, mode, FMA, 0, speed);
10304       *cost += rtx_cost (op1, mode, FMA, 1, speed);
10305       *cost += rtx_cost (op2, mode, FMA, 2, speed);
10306       return true;
10307
10308     case FLOAT:
10309     case UNSIGNED_FLOAT:
10310       if (speed)
10311         *cost += extra_cost->fp[mode == DFmode].fromint;
10312       return false;
10313
10314     case FLOAT_EXTEND:
10315       if (speed)
10316         {
10317           if (VECTOR_MODE_P (mode))
10318             {
10319               /*Vector truncate.  */
10320               *cost += extra_cost->vect.alu;
10321             }
10322           else
10323             *cost += extra_cost->fp[mode == DFmode].widen;
10324         }
10325       return false;
10326
10327     case FLOAT_TRUNCATE:
10328       if (speed)
10329         {
10330           if (VECTOR_MODE_P (mode))
10331             {
10332               /*Vector conversion.  */
10333               *cost += extra_cost->vect.alu;
10334             }
10335           else
10336             *cost += extra_cost->fp[mode == DFmode].narrow;
10337         }
10338       return false;
10339
10340     case FIX:
10341     case UNSIGNED_FIX:
10342       x = XEXP (x, 0);
10343       /* Strip the rounding part.  They will all be implemented
10344          by the fcvt* family of instructions anyway.  */
10345       if (GET_CODE (x) == UNSPEC)
10346         {
10347           unsigned int uns_code = XINT (x, 1);
10348
10349           if (uns_code == UNSPEC_FRINTA
10350               || uns_code == UNSPEC_FRINTM
10351               || uns_code == UNSPEC_FRINTN
10352               || uns_code == UNSPEC_FRINTP
10353               || uns_code == UNSPEC_FRINTZ)
10354             x = XVECEXP (x, 0, 0);
10355         }
10356
10357       if (speed)
10358         {
10359           if (VECTOR_MODE_P (mode))
10360             *cost += extra_cost->vect.alu;
10361           else
10362             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10363         }
10364
10365       /* We can combine fmul by a power of 2 followed by a fcvt into a single
10366          fixed-point fcvt.  */
10367       if (GET_CODE (x) == MULT
10368           && ((VECTOR_MODE_P (mode)
10369                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10370               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10371         {
10372           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10373                              0, speed);
10374           return true;
10375         }
10376
10377       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10378       return true;
10379
10380     case ABS:
10381       if (VECTOR_MODE_P (mode))
10382         {
10383           /* ABS (vector).  */
10384           if (speed)
10385             *cost += extra_cost->vect.alu;
10386         }
10387       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10388         {
10389           op0 = XEXP (x, 0);
10390
10391           /* FABD, which is analogous to FADD.  */
10392           if (GET_CODE (op0) == MINUS)
10393             {
10394               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10395               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10396               if (speed)
10397                 *cost += extra_cost->fp[mode == DFmode].addsub;
10398
10399               return true;
10400             }
10401           /* Simple FABS is analogous to FNEG.  */
10402           if (speed)
10403             *cost += extra_cost->fp[mode == DFmode].neg;
10404         }
10405       else
10406         {
10407           /* Integer ABS will either be split to
10408              two arithmetic instructions, or will be an ABS
10409              (scalar), which we don't model.  */
10410           *cost = COSTS_N_INSNS (2);
10411           if (speed)
10412             *cost += 2 * extra_cost->alu.arith;
10413         }
10414       return false;
10415
10416     case SMAX:
10417     case SMIN:
10418       if (speed)
10419         {
10420           if (VECTOR_MODE_P (mode))
10421             *cost += extra_cost->vect.alu;
10422           else
10423             {
10424               /* FMAXNM/FMINNM/FMAX/FMIN.
10425                  TODO: This may not be accurate for all implementations, but
10426                  we do not model this in the cost tables.  */
10427               *cost += extra_cost->fp[mode == DFmode].addsub;
10428             }
10429         }
10430       return false;
10431
10432     case UNSPEC:
10433       /* The floating point round to integer frint* instructions.  */
10434       if (aarch64_frint_unspec_p (XINT (x, 1)))
10435         {
10436           if (speed)
10437             *cost += extra_cost->fp[mode == DFmode].roundint;
10438
10439           return false;
10440         }
10441
10442       if (XINT (x, 1) == UNSPEC_RBIT)
10443         {
10444           if (speed)
10445             *cost += extra_cost->alu.rev;
10446
10447           return false;
10448         }
10449       break;
10450
10451     case TRUNCATE:
10452
10453       /* Decompose <su>muldi3_highpart.  */
10454       if (/* (truncate:DI  */
10455           mode == DImode
10456           /*   (lshiftrt:TI  */
10457           && GET_MODE (XEXP (x, 0)) == TImode
10458           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10459           /*      (mult:TI  */
10460           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10461           /*        (ANY_EXTEND:TI (reg:DI))
10462                     (ANY_EXTEND:TI (reg:DI)))  */
10463           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10464                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10465               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10466                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10467           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10468           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10469           /*     (const_int 64)  */
10470           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10471           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10472         {
10473           /* UMULH/SMULH.  */
10474           if (speed)
10475             *cost += extra_cost->mult[mode == DImode].extend;
10476           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10477                              mode, MULT, 0, speed);
10478           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10479                              mode, MULT, 1, speed);
10480           return true;
10481         }
10482
10483       /* Fall through.  */
10484     default:
10485       break;
10486     }
10487
10488   if (dump_file
10489       && flag_aarch64_verbose_cost)
10490     fprintf (dump_file,
10491       "\nFailed to cost RTX.  Assuming default cost.\n");
10492
10493   return true;
10494 }
10495
10496 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10497    calculated for X.  This cost is stored in *COST.  Returns true
10498    if the total cost of X was calculated.  */
10499 static bool
10500 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10501                    int param, int *cost, bool speed)
10502 {
10503   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10504
10505   if (dump_file
10506       && flag_aarch64_verbose_cost)
10507     {
10508       print_rtl_single (dump_file, x);
10509       fprintf (dump_file, "\n%s cost: %d (%s)\n",
10510                speed ? "Hot" : "Cold",
10511                *cost, result ? "final" : "partial");
10512     }
10513
10514   return result;
10515 }
10516
10517 static int
10518 aarch64_register_move_cost (machine_mode mode,
10519                             reg_class_t from_i, reg_class_t to_i)
10520 {
10521   enum reg_class from = (enum reg_class) from_i;
10522   enum reg_class to = (enum reg_class) to_i;
10523   const struct cpu_regmove_cost *regmove_cost
10524     = aarch64_tune_params.regmove_cost;
10525
10526   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
10527   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10528     to = GENERAL_REGS;
10529
10530   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10531     from = GENERAL_REGS;
10532
10533   /* Moving between GPR and stack cost is the same as GP2GP.  */
10534   if ((from == GENERAL_REGS && to == STACK_REG)
10535       || (to == GENERAL_REGS && from == STACK_REG))
10536     return regmove_cost->GP2GP;
10537
10538   /* To/From the stack register, we move via the gprs.  */
10539   if (to == STACK_REG || from == STACK_REG)
10540     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10541             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10542
10543   if (known_eq (GET_MODE_SIZE (mode), 16))
10544     {
10545       /* 128-bit operations on general registers require 2 instructions.  */
10546       if (from == GENERAL_REGS && to == GENERAL_REGS)
10547         return regmove_cost->GP2GP * 2;
10548       else if (from == GENERAL_REGS)
10549         return regmove_cost->GP2FP * 2;
10550       else if (to == GENERAL_REGS)
10551         return regmove_cost->FP2GP * 2;
10552
10553       /* When AdvSIMD instructions are disabled it is not possible to move
10554          a 128-bit value directly between Q registers.  This is handled in
10555          secondary reload.  A general register is used as a scratch to move
10556          the upper DI value and the lower DI value is moved directly,
10557          hence the cost is the sum of three moves. */
10558       if (! TARGET_SIMD)
10559         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10560
10561       return regmove_cost->FP2FP;
10562     }
10563
10564   if (from == GENERAL_REGS && to == GENERAL_REGS)
10565     return regmove_cost->GP2GP;
10566   else if (from == GENERAL_REGS)
10567     return regmove_cost->GP2FP;
10568   else if (to == GENERAL_REGS)
10569     return regmove_cost->FP2GP;
10570
10571   return regmove_cost->FP2FP;
10572 }
10573
10574 static int
10575 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10576                           reg_class_t rclass ATTRIBUTE_UNUSED,
10577                           bool in ATTRIBUTE_UNUSED)
10578 {
10579   return aarch64_tune_params.memmov_cost;
10580 }
10581
10582 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10583    to optimize 1.0/sqrt.  */
10584
10585 static bool
10586 use_rsqrt_p (machine_mode mode)
10587 {
10588   return (!flag_trapping_math
10589           && flag_unsafe_math_optimizations
10590           && ((aarch64_tune_params.approx_modes->recip_sqrt
10591                & AARCH64_APPROX_MODE (mode))
10592               || flag_mrecip_low_precision_sqrt));
10593 }
10594
10595 /* Function to decide when to use the approximate reciprocal square root
10596    builtin.  */
10597
10598 static tree
10599 aarch64_builtin_reciprocal (tree fndecl)
10600 {
10601   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10602
10603   if (!use_rsqrt_p (mode))
10604     return NULL_TREE;
10605   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10606 }
10607
10608 /* Emit instruction sequence to compute either the approximate square root
10609    or its approximate reciprocal, depending on the flag RECP, and return
10610    whether the sequence was emitted or not.  */
10611
10612 bool
10613 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10614 {
10615   machine_mode mode = GET_MODE (dst);
10616
10617   if (GET_MODE_INNER (mode) == HFmode)
10618     {
10619       gcc_assert (!recp);
10620       return false;
10621     }
10622
10623   if (!recp)
10624     {
10625       if (!(flag_mlow_precision_sqrt
10626             || (aarch64_tune_params.approx_modes->sqrt
10627                 & AARCH64_APPROX_MODE (mode))))
10628         return false;
10629
10630       if (flag_finite_math_only
10631           || flag_trapping_math
10632           || !flag_unsafe_math_optimizations
10633           || optimize_function_for_size_p (cfun))
10634         return false;
10635     }
10636   else
10637     /* Caller assumes we cannot fail.  */
10638     gcc_assert (use_rsqrt_p (mode));
10639
10640   machine_mode mmsk = mode_for_int_vector (mode).require ();
10641   rtx xmsk = gen_reg_rtx (mmsk);
10642   if (!recp)
10643     /* When calculating the approximate square root, compare the
10644        argument with 0.0 and create a mask.  */
10645     emit_insn (gen_rtx_SET (xmsk,
10646                             gen_rtx_NEG (mmsk,
10647                                          gen_rtx_EQ (mmsk, src,
10648                                                      CONST0_RTX (mode)))));
10649
10650   /* Estimate the approximate reciprocal square root.  */
10651   rtx xdst = gen_reg_rtx (mode);
10652   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10653
10654   /* Iterate over the series twice for SF and thrice for DF.  */
10655   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10656
10657   /* Optionally iterate over the series once less for faster performance
10658      while sacrificing the accuracy.  */
10659   if ((recp && flag_mrecip_low_precision_sqrt)
10660       || (!recp && flag_mlow_precision_sqrt))
10661     iterations--;
10662
10663   /* Iterate over the series to calculate the approximate reciprocal square
10664      root.  */
10665   rtx x1 = gen_reg_rtx (mode);
10666   while (iterations--)
10667     {
10668       rtx x2 = gen_reg_rtx (mode);
10669       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10670
10671       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10672
10673       if (iterations > 0)
10674         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10675     }
10676
10677   if (!recp)
10678     {
10679       /* Qualify the approximate reciprocal square root when the argument is
10680          0.0 by squashing the intermediary result to 0.0.  */
10681       rtx xtmp = gen_reg_rtx (mmsk);
10682       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10683                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
10684       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10685
10686       /* Calculate the approximate square root.  */
10687       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10688     }
10689
10690   /* Finalize the approximation.  */
10691   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10692
10693   return true;
10694 }
10695
10696 /* Emit the instruction sequence to compute the approximation for the division
10697    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
10698
10699 bool
10700 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10701 {
10702   machine_mode mode = GET_MODE (quo);
10703
10704   if (GET_MODE_INNER (mode) == HFmode)
10705     return false;
10706
10707   bool use_approx_division_p = (flag_mlow_precision_div
10708                                 || (aarch64_tune_params.approx_modes->division
10709                                     & AARCH64_APPROX_MODE (mode)));
10710
10711   if (!flag_finite_math_only
10712       || flag_trapping_math
10713       || !flag_unsafe_math_optimizations
10714       || optimize_function_for_size_p (cfun)
10715       || !use_approx_division_p)
10716     return false;
10717
10718   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10719     return false;
10720
10721   /* Estimate the approximate reciprocal.  */
10722   rtx xrcp = gen_reg_rtx (mode);
10723   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10724
10725   /* Iterate over the series twice for SF and thrice for DF.  */
10726   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10727
10728   /* Optionally iterate over the series once less for faster performance,
10729      while sacrificing the accuracy.  */
10730   if (flag_mlow_precision_div)
10731     iterations--;
10732
10733   /* Iterate over the series to calculate the approximate reciprocal.  */
10734   rtx xtmp = gen_reg_rtx (mode);
10735   while (iterations--)
10736     {
10737       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10738
10739       if (iterations > 0)
10740         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10741     }
10742
10743   if (num != CONST1_RTX (mode))
10744     {
10745       /* As the approximate reciprocal of DEN is already calculated, only
10746          calculate the approximate division when NUM is not 1.0.  */
10747       rtx xnum = force_reg (mode, num);
10748       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10749     }
10750
10751   /* Finalize the approximation.  */
10752   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10753   return true;
10754 }
10755
10756 /* Return the number of instructions that can be issued per cycle.  */
10757 static int
10758 aarch64_sched_issue_rate (void)
10759 {
10760   return aarch64_tune_params.issue_rate;
10761 }
10762
10763 static int
10764 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10765 {
10766   int issue_rate = aarch64_sched_issue_rate ();
10767
10768   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10769 }
10770
10771
10772 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10773    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10774    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10775
10776 static int
10777 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10778                                                     int ready_index)
10779 {
10780   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10781 }
10782
10783
10784 /* Vectorizer cost model target hooks.  */
10785
10786 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10787 static int
10788 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10789                                     tree vectype,
10790                                     int misalign ATTRIBUTE_UNUSED)
10791 {
10792   unsigned elements;
10793   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10794   bool fp = false;
10795
10796   if (vectype != NULL)
10797     fp = FLOAT_TYPE_P (vectype);
10798
10799   switch (type_of_cost)
10800     {
10801       case scalar_stmt:
10802         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10803
10804       case scalar_load:
10805         return costs->scalar_load_cost;
10806
10807       case scalar_store:
10808         return costs->scalar_store_cost;
10809
10810       case vector_stmt:
10811         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10812
10813       case vector_load:
10814         return costs->vec_align_load_cost;
10815
10816       case vector_store:
10817         return costs->vec_store_cost;
10818
10819       case vec_to_scalar:
10820         return costs->vec_to_scalar_cost;
10821
10822       case scalar_to_vec:
10823         return costs->scalar_to_vec_cost;
10824
10825       case unaligned_load:
10826       case vector_gather_load:
10827         return costs->vec_unalign_load_cost;
10828
10829       case unaligned_store:
10830       case vector_scatter_store:
10831         return costs->vec_unalign_store_cost;
10832
10833       case cond_branch_taken:
10834         return costs->cond_taken_branch_cost;
10835
10836       case cond_branch_not_taken:
10837         return costs->cond_not_taken_branch_cost;
10838
10839       case vec_perm:
10840         return costs->vec_permute_cost;
10841
10842       case vec_promote_demote:
10843         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10844
10845       case vec_construct:
10846         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10847         return elements / 2 + 1;
10848
10849       default:
10850         gcc_unreachable ();
10851     }
10852 }
10853
10854 /* Implement targetm.vectorize.add_stmt_cost.  */
10855 static unsigned
10856 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10857                        struct _stmt_vec_info *stmt_info, int misalign,
10858                        enum vect_cost_model_location where)
10859 {
10860   unsigned *cost = (unsigned *) data;
10861   unsigned retval = 0;
10862
10863   if (flag_vect_cost_model)
10864     {
10865       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10866       int stmt_cost =
10867             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10868
10869       /* Statements in an inner loop relative to the loop being
10870          vectorized are weighted more heavily.  The value here is
10871          arbitrary and could potentially be improved with analysis.  */
10872       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10873         count *= 50; /*  FIXME  */
10874
10875       retval = (unsigned) (count * stmt_cost);
10876       cost[where] += retval;
10877     }
10878
10879   return retval;
10880 }
10881
10882 static void initialize_aarch64_code_model (struct gcc_options *);
10883
10884 /* Parse the TO_PARSE string and put the architecture struct that it
10885    selects into RES and the architectural features into ISA_FLAGS.
10886    Return an aarch64_parse_opt_result describing the parse result.
10887    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10888    When the TO_PARSE string contains an invalid extension,
10889    a copy of the string is created and stored to INVALID_EXTENSION.  */
10890
10891 static enum aarch64_parse_opt_result
10892 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10893                     unsigned long *isa_flags, std::string *invalid_extension)
10894 {
10895   const char *ext;
10896   const struct processor *arch;
10897   size_t len;
10898
10899   ext = strchr (to_parse, '+');
10900
10901   if (ext != NULL)
10902     len = ext - to_parse;
10903   else
10904     len = strlen (to_parse);
10905
10906   if (len == 0)
10907     return AARCH64_PARSE_MISSING_ARG;
10908
10909
10910   /* Loop through the list of supported ARCHes to find a match.  */
10911   for (arch = all_architectures; arch->name != NULL; arch++)
10912     {
10913       if (strlen (arch->name) == len
10914           && strncmp (arch->name, to_parse, len) == 0)
10915         {
10916           unsigned long isa_temp = arch->flags;
10917
10918           if (ext != NULL)
10919             {
10920               /* TO_PARSE string contains at least one extension.  */
10921               enum aarch64_parse_opt_result ext_res
10922                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10923
10924               if (ext_res != AARCH64_PARSE_OK)
10925                 return ext_res;
10926             }
10927           /* Extension parsing was successful.  Confirm the result
10928              arch and ISA flags.  */
10929           *res = arch;
10930           *isa_flags = isa_temp;
10931           return AARCH64_PARSE_OK;
10932         }
10933     }
10934
10935   /* ARCH name not found in list.  */
10936   return AARCH64_PARSE_INVALID_ARG;
10937 }
10938
10939 /* Parse the TO_PARSE string and put the result tuning in RES and the
10940    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
10941    describing the parse result.  If there is an error parsing, RES and
10942    ISA_FLAGS are left unchanged.
10943    When the TO_PARSE string contains an invalid extension,
10944    a copy of the string is created and stored to INVALID_EXTENSION.  */
10945
10946 static enum aarch64_parse_opt_result
10947 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10948                    unsigned long *isa_flags, std::string *invalid_extension)
10949 {
10950   const char *ext;
10951   const struct processor *cpu;
10952   size_t len;
10953
10954   ext = strchr (to_parse, '+');
10955
10956   if (ext != NULL)
10957     len = ext - to_parse;
10958   else
10959     len = strlen (to_parse);
10960
10961   if (len == 0)
10962     return AARCH64_PARSE_MISSING_ARG;
10963
10964
10965   /* Loop through the list of supported CPUs to find a match.  */
10966   for (cpu = all_cores; cpu->name != NULL; cpu++)
10967     {
10968       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
10969         {
10970           unsigned long isa_temp = cpu->flags;
10971
10972
10973           if (ext != NULL)
10974             {
10975               /* TO_PARSE string contains at least one extension.  */
10976               enum aarch64_parse_opt_result ext_res
10977                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10978
10979               if (ext_res != AARCH64_PARSE_OK)
10980                 return ext_res;
10981             }
10982           /* Extension parsing was successfull.  Confirm the result
10983              cpu and ISA flags.  */
10984           *res = cpu;
10985           *isa_flags = isa_temp;
10986           return AARCH64_PARSE_OK;
10987         }
10988     }
10989
10990   /* CPU name not found in list.  */
10991   return AARCH64_PARSE_INVALID_ARG;
10992 }
10993
10994 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10995    Return an aarch64_parse_opt_result describing the parse result.
10996    If the parsing fails the RES does not change.  */
10997
10998 static enum aarch64_parse_opt_result
10999 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11000 {
11001   const struct processor *cpu;
11002
11003   /* Loop through the list of supported CPUs to find a match.  */
11004   for (cpu = all_cores; cpu->name != NULL; cpu++)
11005     {
11006       if (strcmp (cpu->name, to_parse) == 0)
11007         {
11008           *res = cpu;
11009           return AARCH64_PARSE_OK;
11010         }
11011     }
11012
11013   /* CPU name not found in list.  */
11014   return AARCH64_PARSE_INVALID_ARG;
11015 }
11016
11017 /* Parse TOKEN, which has length LENGTH to see if it is an option
11018    described in FLAG.  If it is, return the index bit for that fusion type.
11019    If not, error (printing OPTION_NAME) and return zero.  */
11020
11021 static unsigned int
11022 aarch64_parse_one_option_token (const char *token,
11023                                 size_t length,
11024                                 const struct aarch64_flag_desc *flag,
11025                                 const char *option_name)
11026 {
11027   for (; flag->name != NULL; flag++)
11028     {
11029       if (length == strlen (flag->name)
11030           && !strncmp (flag->name, token, length))
11031         return flag->flag;
11032     }
11033
11034   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
11035   return 0;
11036 }
11037
11038 /* Parse OPTION which is a comma-separated list of flags to enable.
11039    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11040    default state we inherit from the CPU tuning structures.  OPTION_NAME
11041    gives the top-level option we are parsing in the -moverride string,
11042    for use in error messages.  */
11043
11044 static unsigned int
11045 aarch64_parse_boolean_options (const char *option,
11046                                const struct aarch64_flag_desc *flags,
11047                                unsigned int initial_state,
11048                                const char *option_name)
11049 {
11050   const char separator = '.';
11051   const char* specs = option;
11052   const char* ntoken = option;
11053   unsigned int found_flags = initial_state;
11054
11055   while ((ntoken = strchr (specs, separator)))
11056     {
11057       size_t token_length = ntoken - specs;
11058       unsigned token_ops = aarch64_parse_one_option_token (specs,
11059                                                            token_length,
11060                                                            flags,
11061                                                            option_name);
11062       /* If we find "none" (or, for simplicity's sake, an error) anywhere
11063          in the token stream, reset the supported operations.  So:
11064
11065            adrp+add.cmp+branch.none.adrp+add
11066
11067            would have the result of turning on only adrp+add fusion.  */
11068       if (!token_ops)
11069         found_flags = 0;
11070
11071       found_flags |= token_ops;
11072       specs = ++ntoken;
11073     }
11074
11075   /* We ended with a comma, print something.  */
11076   if (!(*specs))
11077     {
11078       error ("%s string ill-formed\n", option_name);
11079       return 0;
11080     }
11081
11082   /* We still have one more token to parse.  */
11083   size_t token_length = strlen (specs);
11084   unsigned token_ops = aarch64_parse_one_option_token (specs,
11085                                                        token_length,
11086                                                        flags,
11087                                                        option_name);
11088    if (!token_ops)
11089      found_flags = 0;
11090
11091   found_flags |= token_ops;
11092   return found_flags;
11093 }
11094
11095 /* Support for overriding instruction fusion.  */
11096
11097 static void
11098 aarch64_parse_fuse_string (const char *fuse_string,
11099                             struct tune_params *tune)
11100 {
11101   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11102                                                      aarch64_fusible_pairs,
11103                                                      tune->fusible_ops,
11104                                                      "fuse=");
11105 }
11106
11107 /* Support for overriding other tuning flags.  */
11108
11109 static void
11110 aarch64_parse_tune_string (const char *tune_string,
11111                             struct tune_params *tune)
11112 {
11113   tune->extra_tuning_flags
11114     = aarch64_parse_boolean_options (tune_string,
11115                                      aarch64_tuning_flags,
11116                                      tune->extra_tuning_flags,
11117                                      "tune=");
11118 }
11119
11120 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11121    Accept the valid SVE vector widths allowed by
11122    aarch64_sve_vector_bits_enum and use it to override sve_width
11123    in TUNE.  */
11124
11125 static void
11126 aarch64_parse_sve_width_string (const char *tune_string,
11127                                 struct tune_params *tune)
11128 {
11129   int width = -1;
11130
11131   int n = sscanf (tune_string, "%d", &width);
11132   if (n == EOF)
11133     {
11134       error ("invalid format for sve_width");
11135       return;
11136     }
11137   switch (width)
11138     {
11139     case SVE_128:
11140     case SVE_256:
11141     case SVE_512:
11142     case SVE_1024:
11143     case SVE_2048:
11144       break;
11145     default:
11146       error ("invalid sve_width value: %d", width);
11147     }
11148   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11149 }
11150
11151 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11152    we understand.  If it is, extract the option string and handoff to
11153    the appropriate function.  */
11154
11155 void
11156 aarch64_parse_one_override_token (const char* token,
11157                                   size_t length,
11158                                   struct tune_params *tune)
11159 {
11160   const struct aarch64_tuning_override_function *fn
11161     = aarch64_tuning_override_functions;
11162
11163   const char *option_part = strchr (token, '=');
11164   if (!option_part)
11165     {
11166       error ("tuning string missing in option (%s)", token);
11167       return;
11168     }
11169
11170   /* Get the length of the option name.  */
11171   length = option_part - token;
11172   /* Skip the '=' to get to the option string.  */
11173   option_part++;
11174
11175   for (; fn->name != NULL; fn++)
11176     {
11177       if (!strncmp (fn->name, token, length))
11178         {
11179           fn->parse_override (option_part, tune);
11180           return;
11181         }
11182     }
11183
11184   error ("unknown tuning option (%s)",token);
11185   return;
11186 }
11187
11188 /* A checking mechanism for the implementation of the tls size.  */
11189
11190 static void
11191 initialize_aarch64_tls_size (struct gcc_options *opts)
11192 {
11193   if (aarch64_tls_size == 0)
11194     aarch64_tls_size = 24;
11195
11196   switch (opts->x_aarch64_cmodel_var)
11197     {
11198     case AARCH64_CMODEL_TINY:
11199       /* Both the default and maximum TLS size allowed under tiny is 1M which
11200          needs two instructions to address, so we clamp the size to 24.  */
11201       if (aarch64_tls_size > 24)
11202         aarch64_tls_size = 24;
11203       break;
11204     case AARCH64_CMODEL_SMALL:
11205       /* The maximum TLS size allowed under small is 4G.  */
11206       if (aarch64_tls_size > 32)
11207         aarch64_tls_size = 32;
11208       break;
11209     case AARCH64_CMODEL_LARGE:
11210       /* The maximum TLS size allowed under large is 16E.
11211          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
11212       if (aarch64_tls_size > 48)
11213         aarch64_tls_size = 48;
11214       break;
11215     default:
11216       gcc_unreachable ();
11217     }
11218
11219   return;
11220 }
11221
11222 /* Parse STRING looking for options in the format:
11223      string     :: option:string
11224      option     :: name=substring
11225      name       :: {a-z}
11226      substring  :: defined by option.  */
11227
11228 static void
11229 aarch64_parse_override_string (const char* input_string,
11230                                struct tune_params* tune)
11231 {
11232   const char separator = ':';
11233   size_t string_length = strlen (input_string) + 1;
11234   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11235   char *string = string_root;
11236   strncpy (string, input_string, string_length);
11237   string[string_length - 1] = '\0';
11238
11239   char* ntoken = string;
11240
11241   while ((ntoken = strchr (string, separator)))
11242     {
11243       size_t token_length = ntoken - string;
11244       /* Make this substring look like a string.  */
11245       *ntoken = '\0';
11246       aarch64_parse_one_override_token (string, token_length, tune);
11247       string = ++ntoken;
11248     }
11249
11250   /* One last option to parse.  */
11251   aarch64_parse_one_override_token (string, strlen (string), tune);
11252   free (string_root);
11253 }
11254
11255
11256 static void
11257 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11258 {
11259   if (accepted_branch_protection_string)
11260     {
11261       opts->x_aarch64_branch_protection_string
11262         = xstrdup (accepted_branch_protection_string);
11263     }
11264
11265   /* PR 70044: We have to be careful about being called multiple times for the
11266      same function.  This means all changes should be repeatable.  */
11267
11268   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11269      Disable the frame pointer flag so the mid-end will not use a frame
11270      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11271      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11272      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
11273   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11274   if (opts->x_flag_omit_frame_pointer == 0)
11275     opts->x_flag_omit_frame_pointer = 2;
11276
11277   /* If not optimizing for size, set the default
11278      alignment to what the target wants.  */
11279   if (!opts->x_optimize_size)
11280     {
11281       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11282         opts->x_str_align_loops = aarch64_tune_params.loop_align;
11283       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11284         opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11285       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11286         opts->x_str_align_functions = aarch64_tune_params.function_align;
11287     }
11288
11289   /* We default to no pc-relative literal loads.  */
11290
11291   aarch64_pcrelative_literal_loads = false;
11292
11293   /* If -mpc-relative-literal-loads is set on the command line, this
11294      implies that the user asked for PC relative literal loads.  */
11295   if (opts->x_pcrelative_literal_loads == 1)
11296     aarch64_pcrelative_literal_loads = true;
11297
11298   /* In the tiny memory model it makes no sense to disallow PC relative
11299      literal pool loads.  */
11300   if (aarch64_cmodel == AARCH64_CMODEL_TINY
11301       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11302     aarch64_pcrelative_literal_loads = true;
11303
11304   /* When enabling the lower precision Newton series for the square root, also
11305      enable it for the reciprocal square root, since the latter is an
11306      intermediary step for the former.  */
11307   if (flag_mlow_precision_sqrt)
11308     flag_mrecip_low_precision_sqrt = true;
11309 }
11310
11311 /* 'Unpack' up the internal tuning structs and update the options
11312     in OPTS.  The caller must have set up selected_tune and selected_arch
11313     as all the other target-specific codegen decisions are
11314     derived from them.  */
11315
11316 void
11317 aarch64_override_options_internal (struct gcc_options *opts)
11318 {
11319   aarch64_tune_flags = selected_tune->flags;
11320   aarch64_tune = selected_tune->sched_core;
11321   /* Make a copy of the tuning parameters attached to the core, which
11322      we may later overwrite.  */
11323   aarch64_tune_params = *(selected_tune->tune);
11324   aarch64_architecture_version = selected_arch->architecture_version;
11325
11326   if (opts->x_aarch64_override_tune_string)
11327     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11328                                   &aarch64_tune_params);
11329
11330   /* This target defaults to strict volatile bitfields.  */
11331   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11332     opts->x_flag_strict_volatile_bitfields = 1;
11333
11334   initialize_aarch64_code_model (opts);
11335   initialize_aarch64_tls_size (opts);
11336
11337   int queue_depth = 0;
11338   switch (aarch64_tune_params.autoprefetcher_model)
11339     {
11340       case tune_params::AUTOPREFETCHER_OFF:
11341         queue_depth = -1;
11342         break;
11343       case tune_params::AUTOPREFETCHER_WEAK:
11344         queue_depth = 0;
11345         break;
11346       case tune_params::AUTOPREFETCHER_STRONG:
11347         queue_depth = max_insn_queue_index + 1;
11348         break;
11349       default:
11350         gcc_unreachable ();
11351     }
11352
11353   /* We don't mind passing in global_options_set here as we don't use
11354      the *options_set structs anyway.  */
11355   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11356                          queue_depth,
11357                          opts->x_param_values,
11358                          global_options_set.x_param_values);
11359
11360   /* Set up parameters to be used in prefetching algorithm.  Do not
11361      override the defaults unless we are tuning for a core we have
11362      researched values for.  */
11363   if (aarch64_tune_params.prefetch->num_slots > 0)
11364     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11365                            aarch64_tune_params.prefetch->num_slots,
11366                            opts->x_param_values,
11367                            global_options_set.x_param_values);
11368   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11369     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11370                            aarch64_tune_params.prefetch->l1_cache_size,
11371                            opts->x_param_values,
11372                            global_options_set.x_param_values);
11373   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11374     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11375                            aarch64_tune_params.prefetch->l1_cache_line_size,
11376                            opts->x_param_values,
11377                            global_options_set.x_param_values);
11378   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11379     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11380                            aarch64_tune_params.prefetch->l2_cache_size,
11381                            opts->x_param_values,
11382                            global_options_set.x_param_values);
11383   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11384     maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11385                            0,
11386                            opts->x_param_values,
11387                            global_options_set.x_param_values);
11388   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11389     maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11390                            aarch64_tune_params.prefetch->minimum_stride,
11391                            opts->x_param_values,
11392                            global_options_set.x_param_values);
11393
11394   /* Use the alternative scheduling-pressure algorithm by default.  */
11395   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11396                          opts->x_param_values,
11397                          global_options_set.x_param_values);
11398
11399   /* If the user hasn't changed it via configure then set the default to 64 KB
11400      for the backend.  */
11401   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11402                          DEFAULT_STK_CLASH_GUARD_SIZE == 0
11403                            ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11404                          opts->x_param_values,
11405                          global_options_set.x_param_values);
11406
11407   /* Validate the guard size.  */
11408   int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11409
11410   /* Enforce that interval is the same size as size so the mid-end does the
11411      right thing.  */
11412   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11413                          guard_size,
11414                          opts->x_param_values,
11415                          global_options_set.x_param_values);
11416
11417   /* The maybe_set calls won't update the value if the user has explicitly set
11418      one.  Which means we need to validate that probing interval and guard size
11419      are equal.  */
11420   int probe_interval
11421     = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11422   if (guard_size != probe_interval)
11423     error ("stack clash guard size '%d' must be equal to probing interval "
11424            "'%d'", guard_size, probe_interval);
11425
11426   /* Enable sw prefetching at specified optimization level for
11427      CPUS that have prefetch.  Lower optimization level threshold by 1
11428      when profiling is enabled.  */
11429   if (opts->x_flag_prefetch_loop_arrays < 0
11430       && !opts->x_optimize_size
11431       && aarch64_tune_params.prefetch->default_opt_level >= 0
11432       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11433     opts->x_flag_prefetch_loop_arrays = 1;
11434
11435   if (opts->x_aarch64_arch_string == NULL)
11436     opts->x_aarch64_arch_string = selected_arch->name;
11437   if (opts->x_aarch64_cpu_string == NULL)
11438     opts->x_aarch64_cpu_string = selected_cpu->name;
11439   if (opts->x_aarch64_tune_string == NULL)
11440     opts->x_aarch64_tune_string = selected_tune->name;
11441
11442   aarch64_override_options_after_change_1 (opts);
11443 }
11444
11445 /* Print a hint with a suggestion for a core or architecture name that
11446    most closely resembles what the user passed in STR.  ARCH is true if
11447    the user is asking for an architecture name.  ARCH is false if the user
11448    is asking for a core name.  */
11449
11450 static void
11451 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11452 {
11453   auto_vec<const char *> candidates;
11454   const struct processor *entry = arch ? all_architectures : all_cores;
11455   for (; entry->name != NULL; entry++)
11456     candidates.safe_push (entry->name);
11457
11458 #ifdef HAVE_LOCAL_CPU_DETECT
11459   /* Add also "native" as possible value.  */
11460   if (arch)
11461     candidates.safe_push ("native");
11462 #endif
11463
11464   char *s;
11465   const char *hint = candidates_list_and_hint (str, s, candidates);
11466   if (hint)
11467     inform (input_location, "valid arguments are: %s;"
11468                              " did you mean %qs?", s, hint);
11469   else
11470     inform (input_location, "valid arguments are: %s", s);
11471
11472   XDELETEVEC (s);
11473 }
11474
11475 /* Print a hint with a suggestion for a core name that most closely resembles
11476    what the user passed in STR.  */
11477
11478 inline static void
11479 aarch64_print_hint_for_core (const char *str)
11480 {
11481   aarch64_print_hint_for_core_or_arch (str, false);
11482 }
11483
11484 /* Print a hint with a suggestion for an architecture name that most closely
11485    resembles what the user passed in STR.  */
11486
11487 inline static void
11488 aarch64_print_hint_for_arch (const char *str)
11489 {
11490   aarch64_print_hint_for_core_or_arch (str, true);
11491 }
11492
11493
11494 /* Print a hint with a suggestion for an extension name
11495    that most closely resembles what the user passed in STR.  */
11496
11497 void
11498 aarch64_print_hint_for_extensions (const std::string &str)
11499 {
11500   auto_vec<const char *> candidates;
11501   aarch64_get_all_extension_candidates (&candidates);
11502   char *s;
11503   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11504   if (hint)
11505     inform (input_location, "valid arguments are: %s;"
11506                              " did you mean %qs?", s, hint);
11507   else
11508     inform (input_location, "valid arguments are: %s;", s);
11509
11510   XDELETEVEC (s);
11511 }
11512
11513 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
11514    specified in STR and throw errors if appropriate.  Put the results if
11515    they are valid in RES and ISA_FLAGS.  Return whether the option is
11516    valid.  */
11517
11518 static bool
11519 aarch64_validate_mcpu (const char *str, const struct processor **res,
11520                        unsigned long *isa_flags)
11521 {
11522   std::string invalid_extension;
11523   enum aarch64_parse_opt_result parse_res
11524     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11525
11526   if (parse_res == AARCH64_PARSE_OK)
11527     return true;
11528
11529   switch (parse_res)
11530     {
11531       case AARCH64_PARSE_MISSING_ARG:
11532         error ("missing cpu name in %<-mcpu=%s%>", str);
11533         break;
11534       case AARCH64_PARSE_INVALID_ARG:
11535         error ("unknown value %qs for -mcpu", str);
11536         aarch64_print_hint_for_core (str);
11537         break;
11538       case AARCH64_PARSE_INVALID_FEATURE:
11539         error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11540                invalid_extension.c_str (), str);
11541         aarch64_print_hint_for_extensions (invalid_extension);
11542         break;
11543       default:
11544         gcc_unreachable ();
11545     }
11546
11547   return false;
11548 }
11549
11550 /* Parses CONST_STR for branch protection features specified in
11551    aarch64_branch_protect_types, and set any global variables required.  Returns
11552    the parsing result and assigns LAST_STR to the last processed token from
11553    CONST_STR so that it can be used for error reporting.  */
11554
11555 static enum
11556 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11557                                                           char** last_str)
11558 {
11559   char *str_root = xstrdup (const_str);
11560   char* token_save = NULL;
11561   char *str = strtok_r (str_root, "+", &token_save);
11562   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11563   if (!str)
11564     res = AARCH64_PARSE_MISSING_ARG;
11565   else
11566     {
11567       char *next_str = strtok_r (NULL, "+", &token_save);
11568       /* Reset the branch protection features to their defaults.  */
11569       aarch64_handle_no_branch_protection (NULL, NULL);
11570
11571       while (str && res == AARCH64_PARSE_OK)
11572         {
11573           const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11574           bool found = false;
11575           /* Search for this type.  */
11576           while (type && type->name && !found && res == AARCH64_PARSE_OK)
11577             {
11578               if (strcmp (str, type->name) == 0)
11579                 {
11580                   found = true;
11581                   res = type->handler (str, next_str);
11582                   str = next_str;
11583                   next_str = strtok_r (NULL, "+", &token_save);
11584                 }
11585               else
11586                 type++;
11587             }
11588           if (found && res == AARCH64_PARSE_OK)
11589             {
11590               bool found_subtype = true;
11591               /* Loop through each token until we find one that isn't a
11592                  subtype.  */
11593               while (found_subtype)
11594                 {
11595                   found_subtype = false;
11596                   const aarch64_branch_protect_type *subtype = type->subtypes;
11597                   /* Search for the subtype.  */
11598                   while (str && subtype && subtype->name && !found_subtype
11599                           && res == AARCH64_PARSE_OK)
11600                     {
11601                       if (strcmp (str, subtype->name) == 0)
11602                         {
11603                           found_subtype = true;
11604                           res = subtype->handler (str, next_str);
11605                           str = next_str;
11606                           next_str = strtok_r (NULL, "+", &token_save);
11607                         }
11608                       else
11609                         subtype++;
11610                     }
11611                 }
11612             }
11613           else if (!found)
11614             res = AARCH64_PARSE_INVALID_ARG;
11615         }
11616     }
11617   /* Copy the last processed token into the argument to pass it back.
11618     Used by option and attribute validation to print the offending token.  */
11619   if (last_str)
11620     {
11621       if (str) strcpy (*last_str, str);
11622       else *last_str = NULL;
11623     }
11624   if (res == AARCH64_PARSE_OK)
11625     {
11626       /* If needed, alloc the accepted string then copy in const_str.
11627         Used by override_option_after_change_1.  */
11628       if (!accepted_branch_protection_string)
11629         accepted_branch_protection_string = (char *) xmalloc (
11630                                                       BRANCH_PROTECT_STR_MAX
11631                                                         + 1);
11632       strncpy (accepted_branch_protection_string, const_str,
11633                 BRANCH_PROTECT_STR_MAX + 1);
11634       /* Forcibly null-terminate.  */
11635       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11636     }
11637   return res;
11638 }
11639
11640 static bool
11641 aarch64_validate_mbranch_protection (const char *const_str)
11642 {
11643   char *str = (char *) xmalloc (strlen (const_str));
11644   enum aarch64_parse_opt_result res =
11645     aarch64_parse_branch_protection (const_str, &str);
11646   if (res == AARCH64_PARSE_INVALID_ARG)
11647     error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11648   else if (res == AARCH64_PARSE_MISSING_ARG)
11649     error ("missing arg for %<-mbranch-protection=%>");
11650   free (str);
11651   return res == AARCH64_PARSE_OK;
11652 }
11653
11654 /* Validate a command-line -march option.  Parse the arch and extensions
11655    (if any) specified in STR and throw errors if appropriate.  Put the
11656    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
11657    option is valid.  */
11658
11659 static bool
11660 aarch64_validate_march (const char *str, const struct processor **res,
11661                          unsigned long *isa_flags)
11662 {
11663   std::string invalid_extension;
11664   enum aarch64_parse_opt_result parse_res
11665     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11666
11667   if (parse_res == AARCH64_PARSE_OK)
11668     return true;
11669
11670   switch (parse_res)
11671     {
11672       case AARCH64_PARSE_MISSING_ARG:
11673         error ("missing arch name in %<-march=%s%>", str);
11674         break;
11675       case AARCH64_PARSE_INVALID_ARG:
11676         error ("unknown value %qs for -march", str);
11677         aarch64_print_hint_for_arch (str);
11678         break;
11679       case AARCH64_PARSE_INVALID_FEATURE:
11680         error ("invalid feature modifier %qs in %<-march=%s%>",
11681                invalid_extension.c_str (), str);
11682         aarch64_print_hint_for_extensions (invalid_extension);
11683         break;
11684       default:
11685         gcc_unreachable ();
11686     }
11687
11688   return false;
11689 }
11690
11691 /* Validate a command-line -mtune option.  Parse the cpu
11692    specified in STR and throw errors if appropriate.  Put the
11693    result, if it is valid, in RES.  Return whether the option is
11694    valid.  */
11695
11696 static bool
11697 aarch64_validate_mtune (const char *str, const struct processor **res)
11698 {
11699   enum aarch64_parse_opt_result parse_res
11700     = aarch64_parse_tune (str, res);
11701
11702   if (parse_res == AARCH64_PARSE_OK)
11703     return true;
11704
11705   switch (parse_res)
11706     {
11707       case AARCH64_PARSE_MISSING_ARG:
11708         error ("missing cpu name in %<-mtune=%s%>", str);
11709         break;
11710       case AARCH64_PARSE_INVALID_ARG:
11711         error ("unknown value %qs for -mtune", str);
11712         aarch64_print_hint_for_core (str);
11713         break;
11714       default:
11715         gcc_unreachable ();
11716     }
11717   return false;
11718 }
11719
11720 /* Return the CPU corresponding to the enum CPU.
11721    If it doesn't specify a cpu, return the default.  */
11722
11723 static const struct processor *
11724 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11725 {
11726   if (cpu != aarch64_none)
11727     return &all_cores[cpu];
11728
11729   /* The & 0x3f is to extract the bottom 6 bits that encode the
11730      default cpu as selected by the --with-cpu GCC configure option
11731      in config.gcc.
11732      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11733      flags mechanism should be reworked to make it more sane.  */
11734   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11735 }
11736
11737 /* Return the architecture corresponding to the enum ARCH.
11738    If it doesn't specify a valid architecture, return the default.  */
11739
11740 static const struct processor *
11741 aarch64_get_arch (enum aarch64_arch arch)
11742 {
11743   if (arch != aarch64_no_arch)
11744     return &all_architectures[arch];
11745
11746   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11747
11748   return &all_architectures[cpu->arch];
11749 }
11750
11751 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
11752
11753 static poly_uint16
11754 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11755 {
11756   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11757      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11758      deciding which .md file patterns to use and when deciding whether
11759      something is a legitimate address or constant.  */
11760   if (value == SVE_SCALABLE || value == SVE_128)
11761     return poly_uint16 (2, 2);
11762   else
11763     return (int) value / 64;
11764 }
11765
11766 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
11767    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11768    tuning structs.  In particular it must set selected_tune and
11769    aarch64_isa_flags that define the available ISA features and tuning
11770    decisions.  It must also set selected_arch as this will be used to
11771    output the .arch asm tags for each function.  */
11772
11773 static void
11774 aarch64_override_options (void)
11775 {
11776   unsigned long cpu_isa = 0;
11777   unsigned long arch_isa = 0;
11778   aarch64_isa_flags = 0;
11779
11780   bool valid_cpu = true;
11781   bool valid_tune = true;
11782   bool valid_arch = true;
11783
11784   selected_cpu = NULL;
11785   selected_arch = NULL;
11786   selected_tune = NULL;
11787
11788   if (aarch64_branch_protection_string)
11789     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11790
11791   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11792      If either of -march or -mtune is given, they override their
11793      respective component of -mcpu.  */
11794   if (aarch64_cpu_string)
11795     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11796                                         &cpu_isa);
11797
11798   if (aarch64_arch_string)
11799     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11800                                           &arch_isa);
11801
11802   if (aarch64_tune_string)
11803     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11804
11805 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11806   SUBTARGET_OVERRIDE_OPTIONS;
11807 #endif
11808
11809   /* If the user did not specify a processor, choose the default
11810      one for them.  This will be the CPU set during configuration using
11811      --with-cpu, otherwise it is "generic".  */
11812   if (!selected_cpu)
11813     {
11814       if (selected_arch)
11815         {
11816           selected_cpu = &all_cores[selected_arch->ident];
11817           aarch64_isa_flags = arch_isa;
11818           explicit_arch = selected_arch->arch;
11819         }
11820       else
11821         {
11822           /* Get default configure-time CPU.  */
11823           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11824           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11825         }
11826
11827       if (selected_tune)
11828         explicit_tune_core = selected_tune->ident;
11829     }
11830   /* If both -mcpu and -march are specified check that they are architecturally
11831      compatible, warn if they're not and prefer the -march ISA flags.  */
11832   else if (selected_arch)
11833     {
11834       if (selected_arch->arch != selected_cpu->arch)
11835         {
11836           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11837                        all_architectures[selected_cpu->arch].name,
11838                        selected_arch->name);
11839         }
11840       aarch64_isa_flags = arch_isa;
11841       explicit_arch = selected_arch->arch;
11842       explicit_tune_core = selected_tune ? selected_tune->ident
11843                                           : selected_cpu->ident;
11844     }
11845   else
11846     {
11847       /* -mcpu but no -march.  */
11848       aarch64_isa_flags = cpu_isa;
11849       explicit_tune_core = selected_tune ? selected_tune->ident
11850                                           : selected_cpu->ident;
11851       gcc_assert (selected_cpu);
11852       selected_arch = &all_architectures[selected_cpu->arch];
11853       explicit_arch = selected_arch->arch;
11854     }
11855
11856   /* Set the arch as well as we will need it when outputing
11857      the .arch directive in assembly.  */
11858   if (!selected_arch)
11859     {
11860       gcc_assert (selected_cpu);
11861       selected_arch = &all_architectures[selected_cpu->arch];
11862     }
11863
11864   if (!selected_tune)
11865     selected_tune = selected_cpu;
11866
11867   if (aarch64_enable_bti == 2)
11868     {
11869 #ifdef TARGET_ENABLE_BTI
11870       aarch64_enable_bti = 1;
11871 #else
11872       aarch64_enable_bti = 0;
11873 #endif
11874     }
11875
11876   /* Return address signing is currently not supported for ILP32 targets.  For
11877      LP64 targets use the configured option in the absence of a command-line
11878      option for -mbranch-protection.  */
11879   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
11880     {
11881 #ifdef TARGET_ENABLE_PAC_RET
11882       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
11883       aarch64_ra_sign_key = AARCH64_KEY_A;
11884 #else
11885       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
11886 #endif
11887     }
11888
11889 #ifndef HAVE_AS_MABI_OPTION
11890   /* The compiler may have been configured with 2.23.* binutils, which does
11891      not have support for ILP32.  */
11892   if (TARGET_ILP32)
11893     error ("assembler does not support -mabi=ilp32");
11894 #endif
11895
11896   /* Convert -msve-vector-bits to a VG count.  */
11897   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11898
11899   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11900     sorry ("return address signing is only supported for -mabi=lp64");
11901
11902   /* Make sure we properly set up the explicit options.  */
11903   if ((aarch64_cpu_string && valid_cpu)
11904        || (aarch64_tune_string && valid_tune))
11905     gcc_assert (explicit_tune_core != aarch64_none);
11906
11907   if ((aarch64_cpu_string && valid_cpu)
11908        || (aarch64_arch_string && valid_arch))
11909     gcc_assert (explicit_arch != aarch64_no_arch);
11910
11911   /* The pass to insert speculation tracking runs before
11912      shrink-wrapping and the latter does not know how to update the
11913      tracking status.  So disable it in this case.  */
11914   if (aarch64_track_speculation)
11915     flag_shrink_wrap = 0;
11916
11917   aarch64_override_options_internal (&global_options);
11918
11919   /* Save these options as the default ones in case we push and pop them later
11920      while processing functions with potential target attributes.  */
11921   target_option_default_node = target_option_current_node
11922       = build_target_option_node (&global_options);
11923 }
11924
11925 /* Implement targetm.override_options_after_change.  */
11926
11927 static void
11928 aarch64_override_options_after_change (void)
11929 {
11930   aarch64_override_options_after_change_1 (&global_options);
11931 }
11932
11933 static struct machine_function *
11934 aarch64_init_machine_status (void)
11935 {
11936   struct machine_function *machine;
11937   machine = ggc_cleared_alloc<machine_function> ();
11938   return machine;
11939 }
11940
11941 void
11942 aarch64_init_expanders (void)
11943 {
11944   init_machine_status = aarch64_init_machine_status;
11945 }
11946
11947 /* A checking mechanism for the implementation of the various code models.  */
11948 static void
11949 initialize_aarch64_code_model (struct gcc_options *opts)
11950 {
11951    if (opts->x_flag_pic)
11952      {
11953        switch (opts->x_aarch64_cmodel_var)
11954          {
11955          case AARCH64_CMODEL_TINY:
11956            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11957            break;
11958          case AARCH64_CMODEL_SMALL:
11959 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11960            aarch64_cmodel = (flag_pic == 2
11961                              ? AARCH64_CMODEL_SMALL_PIC
11962                              : AARCH64_CMODEL_SMALL_SPIC);
11963 #else
11964            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11965 #endif
11966            break;
11967          case AARCH64_CMODEL_LARGE:
11968            sorry ("code model %qs with -f%s", "large",
11969                   opts->x_flag_pic > 1 ? "PIC" : "pic");
11970            break;
11971          default:
11972            gcc_unreachable ();
11973          }
11974      }
11975    else
11976      aarch64_cmodel = opts->x_aarch64_cmodel_var;
11977 }
11978
11979 /* Implement TARGET_OPTION_SAVE.  */
11980
11981 static void
11982 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11983 {
11984   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11985   ptr->x_aarch64_branch_protection_string
11986     = opts->x_aarch64_branch_protection_string;
11987 }
11988
11989 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
11990    using the information saved in PTR.  */
11991
11992 static void
11993 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11994 {
11995   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11996   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11997   opts->x_explicit_arch = ptr->x_explicit_arch;
11998   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11999   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12000   opts->x_aarch64_branch_protection_string
12001     = ptr->x_aarch64_branch_protection_string;
12002   if (opts->x_aarch64_branch_protection_string)
12003     {
12004       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12005                                         NULL);
12006     }
12007
12008   aarch64_override_options_internal (opts);
12009 }
12010
12011 /* Implement TARGET_OPTION_PRINT.  */
12012
12013 static void
12014 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12015 {
12016   const struct processor *cpu
12017     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12018   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12019   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12020   std::string extension
12021     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12022
12023   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12024   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12025            arch->name, extension.c_str ());
12026 }
12027
12028 static GTY(()) tree aarch64_previous_fndecl;
12029
12030 void
12031 aarch64_reset_previous_fndecl (void)
12032 {
12033   aarch64_previous_fndecl = NULL;
12034 }
12035
12036 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12037    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12038    make sure optab availability predicates are recomputed when necessary.  */
12039
12040 void
12041 aarch64_save_restore_target_globals (tree new_tree)
12042 {
12043   if (TREE_TARGET_GLOBALS (new_tree))
12044     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12045   else if (new_tree == target_option_default_node)
12046     restore_target_globals (&default_target_globals);
12047   else
12048     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12049 }
12050
12051 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
12052    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12053    of the function, if such exists.  This function may be called multiple
12054    times on a single function so use aarch64_previous_fndecl to avoid
12055    setting up identical state.  */
12056
12057 static void
12058 aarch64_set_current_function (tree fndecl)
12059 {
12060   if (!fndecl || fndecl == aarch64_previous_fndecl)
12061     return;
12062
12063   tree old_tree = (aarch64_previous_fndecl
12064                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12065                    : NULL_TREE);
12066
12067   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12068
12069   /* If current function has no attributes but the previous one did,
12070      use the default node.  */
12071   if (!new_tree && old_tree)
12072     new_tree = target_option_default_node;
12073
12074   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
12075      the default have been handled by aarch64_save_restore_target_globals from
12076      aarch64_pragma_target_parse.  */
12077   if (old_tree == new_tree)
12078     return;
12079
12080   aarch64_previous_fndecl = fndecl;
12081
12082   /* First set the target options.  */
12083   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12084
12085   aarch64_save_restore_target_globals (new_tree);
12086 }
12087
12088 /* Enum describing the various ways we can handle attributes.
12089    In many cases we can reuse the generic option handling machinery.  */
12090
12091 enum aarch64_attr_opt_type
12092 {
12093   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
12094   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
12095   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
12096   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
12097 };
12098
12099 /* All the information needed to handle a target attribute.
12100    NAME is the name of the attribute.
12101    ATTR_TYPE specifies the type of behavior of the attribute as described
12102    in the definition of enum aarch64_attr_opt_type.
12103    ALLOW_NEG is true if the attribute supports a "no-" form.
12104    HANDLER is the function that takes the attribute string as an argument
12105    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12106    OPT_NUM is the enum specifying the option that the attribute modifies.
12107    This is needed for attributes that mirror the behavior of a command-line
12108    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12109    aarch64_attr_enum.  */
12110
12111 struct aarch64_attribute_info
12112 {
12113   const char *name;
12114   enum aarch64_attr_opt_type attr_type;
12115   bool allow_neg;
12116   bool (*handler) (const char *);
12117   enum opt_code opt_num;
12118 };
12119
12120 /* Handle the ARCH_STR argument to the arch= target attribute.  */
12121
12122 static bool
12123 aarch64_handle_attr_arch (const char *str)
12124 {
12125   const struct processor *tmp_arch = NULL;
12126   std::string invalid_extension;
12127   enum aarch64_parse_opt_result parse_res
12128     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12129
12130   if (parse_res == AARCH64_PARSE_OK)
12131     {
12132       gcc_assert (tmp_arch);
12133       selected_arch = tmp_arch;
12134       explicit_arch = selected_arch->arch;
12135       return true;
12136     }
12137
12138   switch (parse_res)
12139     {
12140       case AARCH64_PARSE_MISSING_ARG:
12141         error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12142         break;
12143       case AARCH64_PARSE_INVALID_ARG:
12144         error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12145         aarch64_print_hint_for_arch (str);
12146         break;
12147       case AARCH64_PARSE_INVALID_FEATURE:
12148         error ("invalid feature modifier %s of value (\"%s\") in "
12149                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12150         aarch64_print_hint_for_extensions (invalid_extension);
12151         break;
12152       default:
12153         gcc_unreachable ();
12154     }
12155
12156   return false;
12157 }
12158
12159 /* Handle the argument CPU_STR to the cpu= target attribute.  */
12160
12161 static bool
12162 aarch64_handle_attr_cpu (const char *str)
12163 {
12164   const struct processor *tmp_cpu = NULL;
12165   std::string invalid_extension;
12166   enum aarch64_parse_opt_result parse_res
12167     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12168
12169   if (parse_res == AARCH64_PARSE_OK)
12170     {
12171       gcc_assert (tmp_cpu);
12172       selected_tune = tmp_cpu;
12173       explicit_tune_core = selected_tune->ident;
12174
12175       selected_arch = &all_architectures[tmp_cpu->arch];
12176       explicit_arch = selected_arch->arch;
12177       return true;
12178     }
12179
12180   switch (parse_res)
12181     {
12182       case AARCH64_PARSE_MISSING_ARG:
12183         error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12184         break;
12185       case AARCH64_PARSE_INVALID_ARG:
12186         error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12187         aarch64_print_hint_for_core (str);
12188         break;
12189       case AARCH64_PARSE_INVALID_FEATURE:
12190         error ("invalid feature modifier %s of value (\"%s\") in "
12191                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12192         aarch64_print_hint_for_extensions (invalid_extension);
12193         break;
12194       default:
12195         gcc_unreachable ();
12196     }
12197
12198   return false;
12199 }
12200
12201 /* Handle the argument STR to the branch-protection= attribute.  */
12202
12203  static bool
12204  aarch64_handle_attr_branch_protection (const char* str)
12205  {
12206   char *err_str = (char *) xmalloc (strlen (str));
12207   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12208                                                                       &err_str);
12209   bool success = false;
12210   switch (res)
12211     {
12212      case AARCH64_PARSE_MISSING_ARG:
12213        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12214               " attribute");
12215        break;
12216      case AARCH64_PARSE_INVALID_ARG:
12217        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12218               "=\")%> pragma or attribute", err_str);
12219        break;
12220      case AARCH64_PARSE_OK:
12221        success = true;
12222       /* Fall through.  */
12223      case AARCH64_PARSE_INVALID_FEATURE:
12224        break;
12225      default:
12226        gcc_unreachable ();
12227     }
12228   free (err_str);
12229   return success;
12230  }
12231
12232 /* Handle the argument STR to the tune= target attribute.  */
12233
12234 static bool
12235 aarch64_handle_attr_tune (const char *str)
12236 {
12237   const struct processor *tmp_tune = NULL;
12238   enum aarch64_parse_opt_result parse_res
12239     = aarch64_parse_tune (str, &tmp_tune);
12240
12241   if (parse_res == AARCH64_PARSE_OK)
12242     {
12243       gcc_assert (tmp_tune);
12244       selected_tune = tmp_tune;
12245       explicit_tune_core = selected_tune->ident;
12246       return true;
12247     }
12248
12249   switch (parse_res)
12250     {
12251       case AARCH64_PARSE_INVALID_ARG:
12252         error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12253         aarch64_print_hint_for_core (str);
12254         break;
12255       default:
12256         gcc_unreachable ();
12257     }
12258
12259   return false;
12260 }
12261
12262 /* Parse an architecture extensions target attribute string specified in STR.
12263    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
12264    if successful.  Update aarch64_isa_flags to reflect the ISA features
12265    modified.  */
12266
12267 static bool
12268 aarch64_handle_attr_isa_flags (char *str)
12269 {
12270   enum aarch64_parse_opt_result parse_res;
12271   unsigned long isa_flags = aarch64_isa_flags;
12272
12273   /* We allow "+nothing" in the beginning to clear out all architectural
12274      features if the user wants to handpick specific features.  */
12275   if (strncmp ("+nothing", str, 8) == 0)
12276     {
12277       isa_flags = 0;
12278       str += 8;
12279     }
12280
12281   std::string invalid_extension;
12282   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12283
12284   if (parse_res == AARCH64_PARSE_OK)
12285     {
12286       aarch64_isa_flags = isa_flags;
12287       return true;
12288     }
12289
12290   switch (parse_res)
12291     {
12292       case AARCH64_PARSE_MISSING_ARG:
12293         error ("missing value in %<target()%> pragma or attribute");
12294         break;
12295
12296       case AARCH64_PARSE_INVALID_FEATURE:
12297         error ("invalid feature modifier %s of value (\"%s\") in "
12298                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12299         break;
12300
12301       default:
12302         gcc_unreachable ();
12303     }
12304
12305  return false;
12306 }
12307
12308 /* The target attributes that we support.  On top of these we also support just
12309    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
12310    handled explicitly in aarch64_process_one_target_attr.  */
12311
12312 static const struct aarch64_attribute_info aarch64_attributes[] =
12313 {
12314   { "general-regs-only", aarch64_attr_mask, false, NULL,
12315      OPT_mgeneral_regs_only },
12316   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12317      OPT_mfix_cortex_a53_835769 },
12318   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12319      OPT_mfix_cortex_a53_843419 },
12320   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12321   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12322   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12323      OPT_momit_leaf_frame_pointer },
12324   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12325   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12326      OPT_march_ },
12327   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12328   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12329      OPT_mtune_ },
12330   { "branch-protection", aarch64_attr_custom, false,
12331      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12332   { "sign-return-address", aarch64_attr_enum, false, NULL,
12333      OPT_msign_return_address_ },
12334   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12335 };
12336
12337 /* Parse ARG_STR which contains the definition of one target attribute.
12338    Show appropriate errors if any or return true if the attribute is valid.  */
12339
12340 static bool
12341 aarch64_process_one_target_attr (char *arg_str)
12342 {
12343   bool invert = false;
12344
12345   size_t len = strlen (arg_str);
12346
12347   if (len == 0)
12348     {
12349       error ("malformed %<target()%> pragma or attribute");
12350       return false;
12351     }
12352
12353   char *str_to_check = (char *) alloca (len + 1);
12354   strcpy (str_to_check, arg_str);
12355
12356   /* Skip leading whitespace.  */
12357   while (*str_to_check == ' ' || *str_to_check == '\t')
12358     str_to_check++;
12359
12360   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12361      It is easier to detect and handle it explicitly here rather than going
12362      through the machinery for the rest of the target attributes in this
12363      function.  */
12364   if (*str_to_check == '+')
12365     return aarch64_handle_attr_isa_flags (str_to_check);
12366
12367   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12368     {
12369       invert = true;
12370       str_to_check += 3;
12371     }
12372   char *arg = strchr (str_to_check, '=');
12373
12374   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12375      and point ARG to "foo".  */
12376   if (arg)
12377     {
12378       *arg = '\0';
12379       arg++;
12380     }
12381   const struct aarch64_attribute_info *p_attr;
12382   bool found = false;
12383   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12384     {
12385       /* If the names don't match up, or the user has given an argument
12386          to an attribute that doesn't accept one, or didn't give an argument
12387          to an attribute that expects one, fail to match.  */
12388       if (strcmp (str_to_check, p_attr->name) != 0)
12389         continue;
12390
12391       found = true;
12392       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12393                               || p_attr->attr_type == aarch64_attr_enum;
12394
12395       if (attr_need_arg_p ^ (arg != NULL))
12396         {
12397           error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12398           return false;
12399         }
12400
12401       /* If the name matches but the attribute does not allow "no-" versions
12402          then we can't match.  */
12403       if (invert && !p_attr->allow_neg)
12404         {
12405           error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12406           return false;
12407         }
12408
12409       switch (p_attr->attr_type)
12410         {
12411         /* Has a custom handler registered.
12412            For example, cpu=, arch=, tune=.  */
12413           case aarch64_attr_custom:
12414             gcc_assert (p_attr->handler);
12415             if (!p_attr->handler (arg))
12416               return false;
12417             break;
12418
12419           /* Either set or unset a boolean option.  */
12420           case aarch64_attr_bool:
12421             {
12422               struct cl_decoded_option decoded;
12423
12424               generate_option (p_attr->opt_num, NULL, !invert,
12425                                CL_TARGET, &decoded);
12426               aarch64_handle_option (&global_options, &global_options_set,
12427                                       &decoded, input_location);
12428               break;
12429             }
12430           /* Set or unset a bit in the target_flags.  aarch64_handle_option
12431              should know what mask to apply given the option number.  */
12432           case aarch64_attr_mask:
12433             {
12434               struct cl_decoded_option decoded;
12435               /* We only need to specify the option number.
12436                  aarch64_handle_option will know which mask to apply.  */
12437               decoded.opt_index = p_attr->opt_num;
12438               decoded.value = !invert;
12439               aarch64_handle_option (&global_options, &global_options_set,
12440                                       &decoded, input_location);
12441               break;
12442             }
12443           /* Use the option setting machinery to set an option to an enum.  */
12444           case aarch64_attr_enum:
12445             {
12446               gcc_assert (arg);
12447               bool valid;
12448               int value;
12449               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12450                                               &value, CL_TARGET);
12451               if (valid)
12452                 {
12453                   set_option (&global_options, NULL, p_attr->opt_num, value,
12454                               NULL, DK_UNSPECIFIED, input_location,
12455                               global_dc);
12456                 }
12457               else
12458                 {
12459                   error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12460                 }
12461               break;
12462             }
12463           default:
12464             gcc_unreachable ();
12465         }
12466     }
12467
12468   /* If we reached here we either have found an attribute and validated
12469      it or didn't match any.  If we matched an attribute but its arguments
12470      were malformed we will have returned false already.  */
12471   return found;
12472 }
12473
12474 /* Count how many times the character C appears in
12475    NULL-terminated string STR.  */
12476
12477 static unsigned int
12478 num_occurences_in_str (char c, char *str)
12479 {
12480   unsigned int res = 0;
12481   while (*str != '\0')
12482     {
12483       if (*str == c)
12484         res++;
12485
12486       str++;
12487     }
12488
12489   return res;
12490 }
12491
12492 /* Parse the tree in ARGS that contains the target attribute information
12493    and update the global target options space.  */
12494
12495 bool
12496 aarch64_process_target_attr (tree args)
12497 {
12498   if (TREE_CODE (args) == TREE_LIST)
12499     {
12500       do
12501         {
12502           tree head = TREE_VALUE (args);
12503           if (head)
12504             {
12505               if (!aarch64_process_target_attr (head))
12506                 return false;
12507             }
12508           args = TREE_CHAIN (args);
12509         } while (args);
12510
12511       return true;
12512     }
12513
12514   if (TREE_CODE (args) != STRING_CST)
12515     {
12516       error ("attribute %<target%> argument not a string");
12517       return false;
12518     }
12519
12520   size_t len = strlen (TREE_STRING_POINTER (args));
12521   char *str_to_check = (char *) alloca (len + 1);
12522   strcpy (str_to_check, TREE_STRING_POINTER (args));
12523
12524   if (len == 0)
12525     {
12526       error ("malformed %<target()%> pragma or attribute");
12527       return false;
12528     }
12529
12530   /* Used to catch empty spaces between commas i.e.
12531      attribute ((target ("attr1,,attr2"))).  */
12532   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12533
12534   /* Handle multiple target attributes separated by ','.  */
12535   char *token = strtok_r (str_to_check, ",", &str_to_check);
12536
12537   unsigned int num_attrs = 0;
12538   while (token)
12539     {
12540       num_attrs++;
12541       if (!aarch64_process_one_target_attr (token))
12542         {
12543           error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12544           return false;
12545         }
12546
12547       token = strtok_r (NULL, ",", &str_to_check);
12548     }
12549
12550   if (num_attrs != num_commas + 1)
12551     {
12552       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12553       return false;
12554     }
12555
12556   return true;
12557 }
12558
12559 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
12560    process attribute ((target ("..."))).  */
12561
12562 static bool
12563 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12564 {
12565   struct cl_target_option cur_target;
12566   bool ret;
12567   tree old_optimize;
12568   tree new_target, new_optimize;
12569   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12570
12571   /* If what we're processing is the current pragma string then the
12572      target option node is already stored in target_option_current_node
12573      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
12574      having to re-parse the string.  This is especially useful to keep
12575      arm_neon.h compile times down since that header contains a lot
12576      of intrinsics enclosed in pragmas.  */
12577   if (!existing_target && args == current_target_pragma)
12578     {
12579       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12580       return true;
12581     }
12582   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12583
12584   old_optimize = build_optimization_node (&global_options);
12585   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12586
12587   /* If the function changed the optimization levels as well as setting
12588      target options, start with the optimizations specified.  */
12589   if (func_optimize && func_optimize != old_optimize)
12590     cl_optimization_restore (&global_options,
12591                              TREE_OPTIMIZATION (func_optimize));
12592
12593   /* Save the current target options to restore at the end.  */
12594   cl_target_option_save (&cur_target, &global_options);
12595
12596   /* If fndecl already has some target attributes applied to it, unpack
12597      them so that we add this attribute on top of them, rather than
12598      overwriting them.  */
12599   if (existing_target)
12600     {
12601       struct cl_target_option *existing_options
12602         = TREE_TARGET_OPTION (existing_target);
12603
12604       if (existing_options)
12605         cl_target_option_restore (&global_options, existing_options);
12606     }
12607   else
12608     cl_target_option_restore (&global_options,
12609                         TREE_TARGET_OPTION (target_option_current_node));
12610
12611   ret = aarch64_process_target_attr (args);
12612
12613   /* Set up any additional state.  */
12614   if (ret)
12615     {
12616       aarch64_override_options_internal (&global_options);
12617       /* Initialize SIMD builtins if we haven't already.
12618          Set current_target_pragma to NULL for the duration so that
12619          the builtin initialization code doesn't try to tag the functions
12620          being built with the attributes specified by any current pragma, thus
12621          going into an infinite recursion.  */
12622       if (TARGET_SIMD)
12623         {
12624           tree saved_current_target_pragma = current_target_pragma;
12625           current_target_pragma = NULL;
12626           aarch64_init_simd_builtins ();
12627           current_target_pragma = saved_current_target_pragma;
12628         }
12629       new_target = build_target_option_node (&global_options);
12630     }
12631   else
12632     new_target = NULL;
12633
12634   new_optimize = build_optimization_node (&global_options);
12635
12636   if (fndecl && ret)
12637     {
12638       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12639
12640       if (old_optimize != new_optimize)
12641         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12642     }
12643
12644   cl_target_option_restore (&global_options, &cur_target);
12645
12646   if (old_optimize != new_optimize)
12647     cl_optimization_restore (&global_options,
12648                              TREE_OPTIMIZATION (old_optimize));
12649   return ret;
12650 }
12651
12652 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
12653    tri-bool options (yes, no, don't care) and the default value is
12654    DEF, determine whether to reject inlining.  */
12655
12656 static bool
12657 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12658                                      int dont_care, int def)
12659 {
12660   /* If the callee doesn't care, always allow inlining.  */
12661   if (callee == dont_care)
12662     return true;
12663
12664   /* If the caller doesn't care, always allow inlining.  */
12665   if (caller == dont_care)
12666     return true;
12667
12668   /* Otherwise, allow inlining if either the callee and caller values
12669      agree, or if the callee is using the default value.  */
12670   return (callee == caller || callee == def);
12671 }
12672
12673 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
12674    to inline CALLEE into CALLER based on target-specific info.
12675    Make sure that the caller and callee have compatible architectural
12676    features.  Then go through the other possible target attributes
12677    and see if they can block inlining.  Try not to reject always_inline
12678    callees unless they are incompatible architecturally.  */
12679
12680 static bool
12681 aarch64_can_inline_p (tree caller, tree callee)
12682 {
12683   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12684   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12685
12686   struct cl_target_option *caller_opts
12687         = TREE_TARGET_OPTION (caller_tree ? caller_tree
12688                                            : target_option_default_node);
12689
12690   struct cl_target_option *callee_opts
12691         = TREE_TARGET_OPTION (callee_tree ? callee_tree
12692                                            : target_option_default_node);
12693
12694   /* Callee's ISA flags should be a subset of the caller's.  */
12695   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12696        != callee_opts->x_aarch64_isa_flags)
12697     return false;
12698
12699   /* Allow non-strict aligned functions inlining into strict
12700      aligned ones.  */
12701   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12702        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12703       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12704            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12705     return false;
12706
12707   bool always_inline = lookup_attribute ("always_inline",
12708                                           DECL_ATTRIBUTES (callee));
12709
12710   /* If the architectural features match up and the callee is always_inline
12711      then the other attributes don't matter.  */
12712   if (always_inline)
12713     return true;
12714
12715   if (caller_opts->x_aarch64_cmodel_var
12716       != callee_opts->x_aarch64_cmodel_var)
12717     return false;
12718
12719   if (caller_opts->x_aarch64_tls_dialect
12720       != callee_opts->x_aarch64_tls_dialect)
12721     return false;
12722
12723   /* Honour explicit requests to workaround errata.  */
12724   if (!aarch64_tribools_ok_for_inlining_p (
12725           caller_opts->x_aarch64_fix_a53_err835769,
12726           callee_opts->x_aarch64_fix_a53_err835769,
12727           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12728     return false;
12729
12730   if (!aarch64_tribools_ok_for_inlining_p (
12731           caller_opts->x_aarch64_fix_a53_err843419,
12732           callee_opts->x_aarch64_fix_a53_err843419,
12733           2, TARGET_FIX_ERR_A53_843419))
12734     return false;
12735
12736   /* If the user explicitly specified -momit-leaf-frame-pointer for the
12737      caller and calle and they don't match up, reject inlining.  */
12738   if (!aarch64_tribools_ok_for_inlining_p (
12739           caller_opts->x_flag_omit_leaf_frame_pointer,
12740           callee_opts->x_flag_omit_leaf_frame_pointer,
12741           2, 1))
12742     return false;
12743
12744   /* If the callee has specific tuning overrides, respect them.  */
12745   if (callee_opts->x_aarch64_override_tune_string != NULL
12746       && caller_opts->x_aarch64_override_tune_string == NULL)
12747     return false;
12748
12749   /* If the user specified tuning override strings for the
12750      caller and callee and they don't match up, reject inlining.
12751      We just do a string compare here, we don't analyze the meaning
12752      of the string, as it would be too costly for little gain.  */
12753   if (callee_opts->x_aarch64_override_tune_string
12754       && caller_opts->x_aarch64_override_tune_string
12755       && (strcmp (callee_opts->x_aarch64_override_tune_string,
12756                   caller_opts->x_aarch64_override_tune_string) != 0))
12757     return false;
12758
12759   return true;
12760 }
12761
12762 /* Return true if SYMBOL_REF X binds locally.  */
12763
12764 static bool
12765 aarch64_symbol_binds_local_p (const_rtx x)
12766 {
12767   return (SYMBOL_REF_DECL (x)
12768           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12769           : SYMBOL_REF_LOCAL_P (x));
12770 }
12771
12772 /* Return true if SYMBOL_REF X is thread local */
12773 static bool
12774 aarch64_tls_symbol_p (rtx x)
12775 {
12776   if (! TARGET_HAVE_TLS)
12777     return false;
12778
12779   if (GET_CODE (x) != SYMBOL_REF)
12780     return false;
12781
12782   return SYMBOL_REF_TLS_MODEL (x) != 0;
12783 }
12784
12785 /* Classify a TLS symbol into one of the TLS kinds.  */
12786 enum aarch64_symbol_type
12787 aarch64_classify_tls_symbol (rtx x)
12788 {
12789   enum tls_model tls_kind = tls_symbolic_operand_type (x);
12790
12791   switch (tls_kind)
12792     {
12793     case TLS_MODEL_GLOBAL_DYNAMIC:
12794     case TLS_MODEL_LOCAL_DYNAMIC:
12795       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12796
12797     case TLS_MODEL_INITIAL_EXEC:
12798       switch (aarch64_cmodel)
12799         {
12800         case AARCH64_CMODEL_TINY:
12801         case AARCH64_CMODEL_TINY_PIC:
12802           return SYMBOL_TINY_TLSIE;
12803         default:
12804           return SYMBOL_SMALL_TLSIE;
12805         }
12806
12807     case TLS_MODEL_LOCAL_EXEC:
12808       if (aarch64_tls_size == 12)
12809         return SYMBOL_TLSLE12;
12810       else if (aarch64_tls_size == 24)
12811         return SYMBOL_TLSLE24;
12812       else if (aarch64_tls_size == 32)
12813         return SYMBOL_TLSLE32;
12814       else if (aarch64_tls_size == 48)
12815         return SYMBOL_TLSLE48;
12816       else
12817         gcc_unreachable ();
12818
12819     case TLS_MODEL_EMULATED:
12820     case TLS_MODEL_NONE:
12821       return SYMBOL_FORCE_TO_MEM;
12822
12823     default:
12824       gcc_unreachable ();
12825     }
12826 }
12827
12828 /* Return the correct method for accessing X + OFFSET, where X is either
12829    a SYMBOL_REF or LABEL_REF.  */
12830
12831 enum aarch64_symbol_type
12832 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12833 {
12834   if (GET_CODE (x) == LABEL_REF)
12835     {
12836       switch (aarch64_cmodel)
12837         {
12838         case AARCH64_CMODEL_LARGE:
12839           return SYMBOL_FORCE_TO_MEM;
12840
12841         case AARCH64_CMODEL_TINY_PIC:
12842         case AARCH64_CMODEL_TINY:
12843           return SYMBOL_TINY_ABSOLUTE;
12844
12845         case AARCH64_CMODEL_SMALL_SPIC:
12846         case AARCH64_CMODEL_SMALL_PIC:
12847         case AARCH64_CMODEL_SMALL:
12848           return SYMBOL_SMALL_ABSOLUTE;
12849
12850         default:
12851           gcc_unreachable ();
12852         }
12853     }
12854
12855   if (GET_CODE (x) == SYMBOL_REF)
12856     {
12857       if (aarch64_tls_symbol_p (x))
12858         return aarch64_classify_tls_symbol (x);
12859
12860       switch (aarch64_cmodel)
12861         {
12862         case AARCH64_CMODEL_TINY:
12863           /* When we retrieve symbol + offset address, we have to make sure
12864              the offset does not cause overflow of the final address.  But
12865              we have no way of knowing the address of symbol at compile time
12866              so we can't accurately say if the distance between the PC and
12867              symbol + offset is outside the addressible range of +/-1M in the
12868              TINY code model.  So we rely on images not being greater than
12869              1M and cap the offset at 1M and anything beyond 1M will have to
12870              be loaded using an alternative mechanism.  Furthermore if the
12871              symbol is a weak reference to something that isn't known to
12872              resolve to a symbol in this module, then force to memory.  */
12873           if ((SYMBOL_REF_WEAK (x)
12874                && !aarch64_symbol_binds_local_p (x))
12875               || !IN_RANGE (offset, -1048575, 1048575))
12876             return SYMBOL_FORCE_TO_MEM;
12877           return SYMBOL_TINY_ABSOLUTE;
12878
12879         case AARCH64_CMODEL_SMALL:
12880           /* Same reasoning as the tiny code model, but the offset cap here is
12881              4G.  */
12882           if ((SYMBOL_REF_WEAK (x)
12883                && !aarch64_symbol_binds_local_p (x))
12884               || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12885                             HOST_WIDE_INT_C (4294967264)))
12886             return SYMBOL_FORCE_TO_MEM;
12887           return SYMBOL_SMALL_ABSOLUTE;
12888
12889         case AARCH64_CMODEL_TINY_PIC:
12890           if (!aarch64_symbol_binds_local_p (x))
12891             return SYMBOL_TINY_GOT;
12892           return SYMBOL_TINY_ABSOLUTE;
12893
12894         case AARCH64_CMODEL_SMALL_SPIC:
12895         case AARCH64_CMODEL_SMALL_PIC:
12896           if (!aarch64_symbol_binds_local_p (x))
12897             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
12898                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
12899           return SYMBOL_SMALL_ABSOLUTE;
12900
12901         case AARCH64_CMODEL_LARGE:
12902           /* This is alright even in PIC code as the constant
12903              pool reference is always PC relative and within
12904              the same translation unit.  */
12905           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
12906             return SYMBOL_SMALL_ABSOLUTE;
12907           else
12908             return SYMBOL_FORCE_TO_MEM;
12909
12910         default:
12911           gcc_unreachable ();
12912         }
12913     }
12914
12915   /* By default push everything into the constant pool.  */
12916   return SYMBOL_FORCE_TO_MEM;
12917 }
12918
12919 bool
12920 aarch64_constant_address_p (rtx x)
12921 {
12922   return (CONSTANT_P (x) && memory_address_p (DImode, x));
12923 }
12924
12925 bool
12926 aarch64_legitimate_pic_operand_p (rtx x)
12927 {
12928   if (GET_CODE (x) == SYMBOL_REF
12929       || (GET_CODE (x) == CONST
12930           && GET_CODE (XEXP (x, 0)) == PLUS
12931           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
12932      return false;
12933
12934   return true;
12935 }
12936
12937 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
12938    that should be rematerialized rather than spilled.  */
12939
12940 static bool
12941 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12942 {
12943   /* Support CSE and rematerialization of common constants.  */
12944   if (CONST_INT_P (x)
12945       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12946       || GET_CODE (x) == CONST_VECTOR)
12947     return true;
12948
12949   /* Do not allow vector struct mode constants for Advanced SIMD.
12950      We could support 0 and -1 easily, but they need support in
12951      aarch64-simd.md.  */
12952   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12953   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12954     return false;
12955
12956   /* Only accept variable-length vector constants if they can be
12957      handled directly.
12958
12959      ??? It would be possible to handle rematerialization of other
12960      constants via secondary reloads.  */
12961   if (vec_flags & VEC_ANY_SVE)
12962     return aarch64_simd_valid_immediate (x, NULL);
12963
12964   if (GET_CODE (x) == HIGH)
12965     x = XEXP (x, 0);
12966
12967   /* Accept polynomial constants that can be calculated by using the
12968      destination of a move as the sole temporary.  Constants that
12969      require a second temporary cannot be rematerialized (they can't be
12970      forced to memory and also aren't legitimate constants).  */
12971   poly_int64 offset;
12972   if (poly_int_rtx_p (x, &offset))
12973     return aarch64_offset_temporaries (false, offset) <= 1;
12974
12975   /* If an offset is being added to something else, we need to allow the
12976      base to be moved into the destination register, meaning that there
12977      are no free temporaries for the offset.  */
12978   x = strip_offset (x, &offset);
12979   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12980     return false;
12981
12982   /* Do not allow const (plus (anchor_symbol, const_int)).  */
12983   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12984     return false;
12985
12986   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
12987      so spilling them is better than rematerialization.  */
12988   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12989     return true;
12990
12991   /* Label references are always constant.  */
12992   if (GET_CODE (x) == LABEL_REF)
12993     return true;
12994
12995   return false;
12996 }
12997
12998 rtx
12999 aarch64_load_tp (rtx target)
13000 {
13001   if (!target
13002       || GET_MODE (target) != Pmode
13003       || !register_operand (target, Pmode))
13004     target = gen_reg_rtx (Pmode);
13005
13006   /* Can return in any reg.  */
13007   emit_insn (gen_aarch64_load_tp_hard (target));
13008   return target;
13009 }
13010
13011 /* On AAPCS systems, this is the "struct __va_list".  */
13012 static GTY(()) tree va_list_type;
13013
13014 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13015    Return the type to use as __builtin_va_list.
13016
13017    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13018
13019    struct __va_list
13020    {
13021      void *__stack;
13022      void *__gr_top;
13023      void *__vr_top;
13024      int   __gr_offs;
13025      int   __vr_offs;
13026    };  */
13027
13028 static tree
13029 aarch64_build_builtin_va_list (void)
13030 {
13031   tree va_list_name;
13032   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13033
13034   /* Create the type.  */
13035   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13036   /* Give it the required name.  */
13037   va_list_name = build_decl (BUILTINS_LOCATION,
13038                              TYPE_DECL,
13039                              get_identifier ("__va_list"),
13040                              va_list_type);
13041   DECL_ARTIFICIAL (va_list_name) = 1;
13042   TYPE_NAME (va_list_type) = va_list_name;
13043   TYPE_STUB_DECL (va_list_type) = va_list_name;
13044
13045   /* Create the fields.  */
13046   f_stack = build_decl (BUILTINS_LOCATION,
13047                         FIELD_DECL, get_identifier ("__stack"),
13048                         ptr_type_node);
13049   f_grtop = build_decl (BUILTINS_LOCATION,
13050                         FIELD_DECL, get_identifier ("__gr_top"),
13051                         ptr_type_node);
13052   f_vrtop = build_decl (BUILTINS_LOCATION,
13053                         FIELD_DECL, get_identifier ("__vr_top"),
13054                         ptr_type_node);
13055   f_groff = build_decl (BUILTINS_LOCATION,
13056                         FIELD_DECL, get_identifier ("__gr_offs"),
13057                         integer_type_node);
13058   f_vroff = build_decl (BUILTINS_LOCATION,
13059                         FIELD_DECL, get_identifier ("__vr_offs"),
13060                         integer_type_node);
13061
13062   /* Tell tree-stdarg pass about our internal offset fields.
13063      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13064      purpose to identify whether the code is updating va_list internal
13065      offset fields through irregular way.  */
13066   va_list_gpr_counter_field = f_groff;
13067   va_list_fpr_counter_field = f_vroff;
13068
13069   DECL_ARTIFICIAL (f_stack) = 1;
13070   DECL_ARTIFICIAL (f_grtop) = 1;
13071   DECL_ARTIFICIAL (f_vrtop) = 1;
13072   DECL_ARTIFICIAL (f_groff) = 1;
13073   DECL_ARTIFICIAL (f_vroff) = 1;
13074
13075   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13076   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13077   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13078   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13079   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13080
13081   TYPE_FIELDS (va_list_type) = f_stack;
13082   DECL_CHAIN (f_stack) = f_grtop;
13083   DECL_CHAIN (f_grtop) = f_vrtop;
13084   DECL_CHAIN (f_vrtop) = f_groff;
13085   DECL_CHAIN (f_groff) = f_vroff;
13086
13087   /* Compute its layout.  */
13088   layout_type (va_list_type);
13089
13090   return va_list_type;
13091 }
13092
13093 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
13094 static void
13095 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13096 {
13097   const CUMULATIVE_ARGS *cum;
13098   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13099   tree stack, grtop, vrtop, groff, vroff;
13100   tree t;
13101   int gr_save_area_size = cfun->va_list_gpr_size;
13102   int vr_save_area_size = cfun->va_list_fpr_size;
13103   int vr_offset;
13104
13105   cum = &crtl->args.info;
13106   if (cfun->va_list_gpr_size)
13107     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13108                              cfun->va_list_gpr_size);
13109   if (cfun->va_list_fpr_size)
13110     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13111                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
13112
13113   if (!TARGET_FLOAT)
13114     {
13115       gcc_assert (cum->aapcs_nvrn == 0);
13116       vr_save_area_size = 0;
13117     }
13118
13119   f_stack = TYPE_FIELDS (va_list_type_node);
13120   f_grtop = DECL_CHAIN (f_stack);
13121   f_vrtop = DECL_CHAIN (f_grtop);
13122   f_groff = DECL_CHAIN (f_vrtop);
13123   f_vroff = DECL_CHAIN (f_groff);
13124
13125   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13126                   NULL_TREE);
13127   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13128                   NULL_TREE);
13129   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13130                   NULL_TREE);
13131   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13132                   NULL_TREE);
13133   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13134                   NULL_TREE);
13135
13136   /* Emit code to initialize STACK, which points to the next varargs stack
13137      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
13138      by named arguments.  STACK is 8-byte aligned.  */
13139   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13140   if (cum->aapcs_stack_size > 0)
13141     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13142   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13143   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13144
13145   /* Emit code to initialize GRTOP, the top of the GR save area.
13146      virtual_incoming_args_rtx should have been 16 byte aligned.  */
13147   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13148   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13149   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13150
13151   /* Emit code to initialize VRTOP, the top of the VR save area.
13152      This address is gr_save_area_bytes below GRTOP, rounded
13153      down to the next 16-byte boundary.  */
13154   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13155   vr_offset = ROUND_UP (gr_save_area_size,
13156                         STACK_BOUNDARY / BITS_PER_UNIT);
13157
13158   if (vr_offset)
13159     t = fold_build_pointer_plus_hwi (t, -vr_offset);
13160   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13161   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13162
13163   /* Emit code to initialize GROFF, the offset from GRTOP of the
13164      next GPR argument.  */
13165   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13166               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13167   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13168
13169   /* Likewise emit code to initialize VROFF, the offset from FTOP
13170      of the next VR argument.  */
13171   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13172               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13173   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13174 }
13175
13176 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
13177
13178 static tree
13179 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13180                               gimple_seq *post_p ATTRIBUTE_UNUSED)
13181 {
13182   tree addr;
13183   bool indirect_p;
13184   bool is_ha;           /* is HFA or HVA.  */
13185   bool dw_align;        /* double-word align.  */
13186   machine_mode ag_mode = VOIDmode;
13187   int nregs;
13188   machine_mode mode;
13189
13190   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13191   tree stack, f_top, f_off, off, arg, roundup, on_stack;
13192   HOST_WIDE_INT size, rsize, adjust, align;
13193   tree t, u, cond1, cond2;
13194
13195   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13196   if (indirect_p)
13197     type = build_pointer_type (type);
13198
13199   mode = TYPE_MODE (type);
13200
13201   f_stack = TYPE_FIELDS (va_list_type_node);
13202   f_grtop = DECL_CHAIN (f_stack);
13203   f_vrtop = DECL_CHAIN (f_grtop);
13204   f_groff = DECL_CHAIN (f_vrtop);
13205   f_vroff = DECL_CHAIN (f_groff);
13206
13207   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13208                   f_stack, NULL_TREE);
13209   size = int_size_in_bytes (type);
13210   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13211
13212   dw_align = false;
13213   adjust = 0;
13214   if (aarch64_vfp_is_call_or_return_candidate (mode,
13215                                                type,
13216                                                &ag_mode,
13217                                                &nregs,
13218                                                &is_ha))
13219     {
13220       /* No frontends can create types with variable-sized modes, so we
13221          shouldn't be asked to pass or return them.  */
13222       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13223
13224       /* TYPE passed in fp/simd registers.  */
13225       if (!TARGET_FLOAT)
13226         aarch64_err_no_fpadvsimd (mode);
13227
13228       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13229                       unshare_expr (valist), f_vrtop, NULL_TREE);
13230       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13231                       unshare_expr (valist), f_vroff, NULL_TREE);
13232
13233       rsize = nregs * UNITS_PER_VREG;
13234
13235       if (is_ha)
13236         {
13237           if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13238             adjust = UNITS_PER_VREG - ag_size;
13239         }
13240       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13241                && size < UNITS_PER_VREG)
13242         {
13243           adjust = UNITS_PER_VREG - size;
13244         }
13245     }
13246   else
13247     {
13248       /* TYPE passed in general registers.  */
13249       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13250                       unshare_expr (valist), f_grtop, NULL_TREE);
13251       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13252                       unshare_expr (valist), f_groff, NULL_TREE);
13253       rsize = ROUND_UP (size, UNITS_PER_WORD);
13254       nregs = rsize / UNITS_PER_WORD;
13255
13256       if (align > 8)
13257         dw_align = true;
13258
13259       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13260           && size < UNITS_PER_WORD)
13261         {
13262           adjust = UNITS_PER_WORD  - size;
13263         }
13264     }
13265
13266   /* Get a local temporary for the field value.  */
13267   off = get_initialized_tmp_var (f_off, pre_p, NULL);
13268
13269   /* Emit code to branch if off >= 0.  */
13270   t = build2 (GE_EXPR, boolean_type_node, off,
13271               build_int_cst (TREE_TYPE (off), 0));
13272   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13273
13274   if (dw_align)
13275     {
13276       /* Emit: offs = (offs + 15) & -16.  */
13277       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13278                   build_int_cst (TREE_TYPE (off), 15));
13279       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13280                   build_int_cst (TREE_TYPE (off), -16));
13281       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13282     }
13283   else
13284     roundup = NULL;
13285
13286   /* Update ap.__[g|v]r_offs  */
13287   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13288               build_int_cst (TREE_TYPE (off), rsize));
13289   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13290
13291   /* String up.  */
13292   if (roundup)
13293     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13294
13295   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
13296   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13297               build_int_cst (TREE_TYPE (f_off), 0));
13298   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13299
13300   /* String up: make sure the assignment happens before the use.  */
13301   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13302   COND_EXPR_ELSE (cond1) = t;
13303
13304   /* Prepare the trees handling the argument that is passed on the stack;
13305      the top level node will store in ON_STACK.  */
13306   arg = get_initialized_tmp_var (stack, pre_p, NULL);
13307   if (align > 8)
13308     {
13309       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
13310       t = fold_build_pointer_plus_hwi (arg, 15);
13311       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13312                   build_int_cst (TREE_TYPE (t), -16));
13313       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13314     }
13315   else
13316     roundup = NULL;
13317   /* Advance ap.__stack  */
13318   t = fold_build_pointer_plus_hwi (arg, size + 7);
13319   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13320               build_int_cst (TREE_TYPE (t), -8));
13321   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13322   /* String up roundup and advance.  */
13323   if (roundup)
13324     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13325   /* String up with arg */
13326   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13327   /* Big-endianness related address adjustment.  */
13328   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13329       && size < UNITS_PER_WORD)
13330   {
13331     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13332                 size_int (UNITS_PER_WORD - size));
13333     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13334   }
13335
13336   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13337   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13338
13339   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
13340   t = off;
13341   if (adjust)
13342     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13343                 build_int_cst (TREE_TYPE (off), adjust));
13344
13345   t = fold_convert (sizetype, t);
13346   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13347
13348   if (is_ha)
13349     {
13350       /* type ha; // treat as "struct {ftype field[n];}"
13351          ... [computing offs]
13352          for (i = 0; i <nregs; ++i, offs += 16)
13353            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13354          return ha;  */
13355       int i;
13356       tree tmp_ha, field_t, field_ptr_t;
13357
13358       /* Declare a local variable.  */
13359       tmp_ha = create_tmp_var_raw (type, "ha");
13360       gimple_add_tmp_var (tmp_ha);
13361
13362       /* Establish the base type.  */
13363       switch (ag_mode)
13364         {
13365         case E_SFmode:
13366           field_t = float_type_node;
13367           field_ptr_t = float_ptr_type_node;
13368           break;
13369         case E_DFmode:
13370           field_t = double_type_node;
13371           field_ptr_t = double_ptr_type_node;
13372           break;
13373         case E_TFmode:
13374           field_t = long_double_type_node;
13375           field_ptr_t = long_double_ptr_type_node;
13376           break;
13377         case E_HFmode:
13378           field_t = aarch64_fp16_type_node;
13379           field_ptr_t = aarch64_fp16_ptr_type_node;
13380           break;
13381         case E_V2SImode:
13382         case E_V4SImode:
13383             {
13384               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13385               field_t = build_vector_type_for_mode (innertype, ag_mode);
13386               field_ptr_t = build_pointer_type (field_t);
13387             }
13388           break;
13389         default:
13390           gcc_assert (0);
13391         }
13392
13393       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
13394       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13395       addr = t;
13396       t = fold_convert (field_ptr_t, addr);
13397       t = build2 (MODIFY_EXPR, field_t,
13398                   build1 (INDIRECT_REF, field_t, tmp_ha),
13399                   build1 (INDIRECT_REF, field_t, t));
13400
13401       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
13402       for (i = 1; i < nregs; ++i)
13403         {
13404           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13405           u = fold_convert (field_ptr_t, addr);
13406           u = build2 (MODIFY_EXPR, field_t,
13407                       build2 (MEM_REF, field_t, tmp_ha,
13408                               build_int_cst (field_ptr_t,
13409                                              (i *
13410                                               int_size_in_bytes (field_t)))),
13411                       build1 (INDIRECT_REF, field_t, u));
13412           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13413         }
13414
13415       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13416       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13417     }
13418
13419   COND_EXPR_ELSE (cond2) = t;
13420   addr = fold_convert (build_pointer_type (type), cond1);
13421   addr = build_va_arg_indirect_ref (addr);
13422
13423   if (indirect_p)
13424     addr = build_va_arg_indirect_ref (addr);
13425
13426   return addr;
13427 }
13428
13429 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
13430
13431 static void
13432 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13433                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13434                                 int no_rtl)
13435 {
13436   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13437   CUMULATIVE_ARGS local_cum;
13438   int gr_saved = cfun->va_list_gpr_size;
13439   int vr_saved = cfun->va_list_fpr_size;
13440
13441   /* The caller has advanced CUM up to, but not beyond, the last named
13442      argument.  Advance a local copy of CUM past the last "real" named
13443      argument, to find out how many registers are left over.  */
13444   local_cum = *cum;
13445   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13446
13447   /* Found out how many registers we need to save.
13448      Honor tree-stdvar analysis results.  */
13449   if (cfun->va_list_gpr_size)
13450     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13451                     cfun->va_list_gpr_size / UNITS_PER_WORD);
13452   if (cfun->va_list_fpr_size)
13453     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13454                     cfun->va_list_fpr_size / UNITS_PER_VREG);
13455
13456   if (!TARGET_FLOAT)
13457     {
13458       gcc_assert (local_cum.aapcs_nvrn == 0);
13459       vr_saved = 0;
13460     }
13461
13462   if (!no_rtl)
13463     {
13464       if (gr_saved > 0)
13465         {
13466           rtx ptr, mem;
13467
13468           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
13469           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13470                                - gr_saved * UNITS_PER_WORD);
13471           mem = gen_frame_mem (BLKmode, ptr);
13472           set_mem_alias_set (mem, get_varargs_alias_set ());
13473
13474           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13475                                mem, gr_saved);
13476         }
13477       if (vr_saved > 0)
13478         {
13479           /* We can't use move_block_from_reg, because it will use
13480              the wrong mode, storing D regs only.  */
13481           machine_mode mode = TImode;
13482           int off, i, vr_start;
13483
13484           /* Set OFF to the offset from virtual_incoming_args_rtx of
13485              the first vector register.  The VR save area lies below
13486              the GR one, and is aligned to 16 bytes.  */
13487           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13488                            STACK_BOUNDARY / BITS_PER_UNIT);
13489           off -= vr_saved * UNITS_PER_VREG;
13490
13491           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13492           for (i = 0; i < vr_saved; ++i)
13493             {
13494               rtx ptr, mem;
13495
13496               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13497               mem = gen_frame_mem (mode, ptr);
13498               set_mem_alias_set (mem, get_varargs_alias_set ());
13499               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13500               off += UNITS_PER_VREG;
13501             }
13502         }
13503     }
13504
13505   /* We don't save the size into *PRETEND_SIZE because we want to avoid
13506      any complication of having crtl->args.pretend_args_size changed.  */
13507   cfun->machine->frame.saved_varargs_size
13508     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13509                  STACK_BOUNDARY / BITS_PER_UNIT)
13510        + vr_saved * UNITS_PER_VREG);
13511 }
13512
13513 static void
13514 aarch64_conditional_register_usage (void)
13515 {
13516   int i;
13517   if (!TARGET_FLOAT)
13518     {
13519       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13520         {
13521           fixed_regs[i] = 1;
13522           call_used_regs[i] = 1;
13523         }
13524     }
13525   if (!TARGET_SVE)
13526     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13527       {
13528         fixed_regs[i] = 1;
13529         call_used_regs[i] = 1;
13530       }
13531
13532   /* When tracking speculation, we need a couple of call-clobbered registers
13533      to track the speculation state.  It would be nice to just use
13534      IP0 and IP1, but currently there are numerous places that just
13535      assume these registers are free for other uses (eg pointer
13536      authentication).  */
13537   if (aarch64_track_speculation)
13538     {
13539       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13540       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13541       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13542       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13543     }
13544 }
13545
13546 /* Walk down the type tree of TYPE counting consecutive base elements.
13547    If *MODEP is VOIDmode, then set it to the first valid floating point
13548    type.  If a non-floating point type is found, or if a floating point
13549    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13550    otherwise return the count in the sub-tree.  */
13551 static int
13552 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13553 {
13554   machine_mode mode;
13555   HOST_WIDE_INT size;
13556
13557   switch (TREE_CODE (type))
13558     {
13559     case REAL_TYPE:
13560       mode = TYPE_MODE (type);
13561       if (mode != DFmode && mode != SFmode
13562           && mode != TFmode && mode != HFmode)
13563         return -1;
13564
13565       if (*modep == VOIDmode)
13566         *modep = mode;
13567
13568       if (*modep == mode)
13569         return 1;
13570
13571       break;
13572
13573     case COMPLEX_TYPE:
13574       mode = TYPE_MODE (TREE_TYPE (type));
13575       if (mode != DFmode && mode != SFmode
13576           && mode != TFmode && mode != HFmode)
13577         return -1;
13578
13579       if (*modep == VOIDmode)
13580         *modep = mode;
13581
13582       if (*modep == mode)
13583         return 2;
13584
13585       break;
13586
13587     case VECTOR_TYPE:
13588       /* Use V2SImode and V4SImode as representatives of all 64-bit
13589          and 128-bit vector types.  */
13590       size = int_size_in_bytes (type);
13591       switch (size)
13592         {
13593         case 8:
13594           mode = V2SImode;
13595           break;
13596         case 16:
13597           mode = V4SImode;
13598           break;
13599         default:
13600           return -1;
13601         }
13602
13603       if (*modep == VOIDmode)
13604         *modep = mode;
13605
13606       /* Vector modes are considered to be opaque: two vectors are
13607          equivalent for the purposes of being homogeneous aggregates
13608          if they are the same size.  */
13609       if (*modep == mode)
13610         return 1;
13611
13612       break;
13613
13614     case ARRAY_TYPE:
13615       {
13616         int count;
13617         tree index = TYPE_DOMAIN (type);
13618
13619         /* Can't handle incomplete types nor sizes that are not
13620            fixed.  */
13621         if (!COMPLETE_TYPE_P (type)
13622             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13623           return -1;
13624
13625         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13626         if (count == -1
13627             || !index
13628             || !TYPE_MAX_VALUE (index)
13629             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13630             || !TYPE_MIN_VALUE (index)
13631             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13632             || count < 0)
13633           return -1;
13634
13635         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13636                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13637
13638         /* There must be no padding.  */
13639         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13640                       count * GET_MODE_BITSIZE (*modep)))
13641           return -1;
13642
13643         return count;
13644       }
13645
13646     case RECORD_TYPE:
13647       {
13648         int count = 0;
13649         int sub_count;
13650         tree field;
13651
13652         /* Can't handle incomplete types nor sizes that are not
13653            fixed.  */
13654         if (!COMPLETE_TYPE_P (type)
13655             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13656           return -1;
13657
13658         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13659           {
13660             if (TREE_CODE (field) != FIELD_DECL)
13661               continue;
13662
13663             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13664             if (sub_count < 0)
13665               return -1;
13666             count += sub_count;
13667           }
13668
13669         /* There must be no padding.  */
13670         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13671                       count * GET_MODE_BITSIZE (*modep)))
13672           return -1;
13673
13674         return count;
13675       }
13676
13677     case UNION_TYPE:
13678     case QUAL_UNION_TYPE:
13679       {
13680         /* These aren't very interesting except in a degenerate case.  */
13681         int count = 0;
13682         int sub_count;
13683         tree field;
13684
13685         /* Can't handle incomplete types nor sizes that are not
13686            fixed.  */
13687         if (!COMPLETE_TYPE_P (type)
13688             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13689           return -1;
13690
13691         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13692           {
13693             if (TREE_CODE (field) != FIELD_DECL)
13694               continue;
13695
13696             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13697             if (sub_count < 0)
13698               return -1;
13699             count = count > sub_count ? count : sub_count;
13700           }
13701
13702         /* There must be no padding.  */
13703         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13704                       count * GET_MODE_BITSIZE (*modep)))
13705           return -1;
13706
13707         return count;
13708       }
13709
13710     default:
13711       break;
13712     }
13713
13714   return -1;
13715 }
13716
13717 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13718    type as described in AAPCS64 \S 4.1.2.
13719
13720    See the comment above aarch64_composite_type_p for the notes on MODE.  */
13721
13722 static bool
13723 aarch64_short_vector_p (const_tree type,
13724                         machine_mode mode)
13725 {
13726   poly_int64 size = -1;
13727
13728   if (type && TREE_CODE (type) == VECTOR_TYPE)
13729     size = int_size_in_bytes (type);
13730   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13731             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13732     size = GET_MODE_SIZE (mode);
13733
13734   return known_eq (size, 8) || known_eq (size, 16);
13735 }
13736
13737 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13738    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
13739    array types.  The C99 floating-point complex types are also considered
13740    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
13741    types, which are GCC extensions and out of the scope of AAPCS64, are
13742    treated as composite types here as well.
13743
13744    Note that MODE itself is not sufficient in determining whether a type
13745    is such a composite type or not.  This is because
13746    stor-layout.c:compute_record_mode may have already changed the MODE
13747    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
13748    structure with only one field may have its MODE set to the mode of the
13749    field.  Also an integer mode whose size matches the size of the
13750    RECORD_TYPE type may be used to substitute the original mode
13751    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
13752    solely relied on.  */
13753
13754 static bool
13755 aarch64_composite_type_p (const_tree type,
13756                           machine_mode mode)
13757 {
13758   if (aarch64_short_vector_p (type, mode))
13759     return false;
13760
13761   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13762     return true;
13763
13764   if (mode == BLKmode
13765       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13766       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13767     return true;
13768
13769   return false;
13770 }
13771
13772 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13773    shall be passed or returned in simd/fp register(s) (providing these
13774    parameter passing registers are available).
13775
13776    Upon successful return, *COUNT returns the number of needed registers,
13777    *BASE_MODE returns the mode of the individual register and when IS_HAF
13778    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13779    floating-point aggregate or a homogeneous short-vector aggregate.  */
13780
13781 static bool
13782 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13783                                          const_tree type,
13784                                          machine_mode *base_mode,
13785                                          int *count,
13786                                          bool *is_ha)
13787 {
13788   machine_mode new_mode = VOIDmode;
13789   bool composite_p = aarch64_composite_type_p (type, mode);
13790
13791   if (is_ha != NULL) *is_ha = false;
13792
13793   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13794       || aarch64_short_vector_p (type, mode))
13795     {
13796       *count = 1;
13797       new_mode = mode;
13798     }
13799   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13800     {
13801       if (is_ha != NULL) *is_ha = true;
13802       *count = 2;
13803       new_mode = GET_MODE_INNER (mode);
13804     }
13805   else if (type && composite_p)
13806     {
13807       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13808
13809       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13810         {
13811           if (is_ha != NULL) *is_ha = true;
13812           *count = ag_count;
13813         }
13814       else
13815         return false;
13816     }
13817   else
13818     return false;
13819
13820   *base_mode = new_mode;
13821   return true;
13822 }
13823
13824 /* Implement TARGET_STRUCT_VALUE_RTX.  */
13825
13826 static rtx
13827 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13828                           int incoming ATTRIBUTE_UNUSED)
13829 {
13830   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13831 }
13832
13833 /* Implements target hook vector_mode_supported_p.  */
13834 static bool
13835 aarch64_vector_mode_supported_p (machine_mode mode)
13836 {
13837   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13838   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13839 }
13840
13841 /* Return appropriate SIMD container
13842    for MODE within a vector of WIDTH bits.  */
13843 static machine_mode
13844 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13845 {
13846   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13847     switch (mode)
13848       {
13849       case E_DFmode:
13850         return VNx2DFmode;
13851       case E_SFmode:
13852         return VNx4SFmode;
13853       case E_HFmode:
13854         return VNx8HFmode;
13855       case E_DImode:
13856         return VNx2DImode;
13857       case E_SImode:
13858         return VNx4SImode;
13859       case E_HImode:
13860         return VNx8HImode;
13861       case E_QImode:
13862         return VNx16QImode;
13863       default:
13864         return word_mode;
13865       }
13866
13867   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13868   if (TARGET_SIMD)
13869     {
13870       if (known_eq (width, 128))
13871         switch (mode)
13872           {
13873           case E_DFmode:
13874             return V2DFmode;
13875           case E_SFmode:
13876             return V4SFmode;
13877           case E_HFmode:
13878             return V8HFmode;
13879           case E_SImode:
13880             return V4SImode;
13881           case E_HImode:
13882             return V8HImode;
13883           case E_QImode:
13884             return V16QImode;
13885           case E_DImode:
13886             return V2DImode;
13887           default:
13888             break;
13889           }
13890       else
13891         switch (mode)
13892           {
13893           case E_SFmode:
13894             return V2SFmode;
13895           case E_HFmode:
13896             return V4HFmode;
13897           case E_SImode:
13898             return V2SImode;
13899           case E_HImode:
13900             return V4HImode;
13901           case E_QImode:
13902             return V8QImode;
13903           default:
13904             break;
13905           }
13906     }
13907   return word_mode;
13908 }
13909
13910 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
13911 static machine_mode
13912 aarch64_preferred_simd_mode (scalar_mode mode)
13913 {
13914   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
13915   return aarch64_simd_container_mode (mode, bits);
13916 }
13917
13918 /* Return a list of possible vector sizes for the vectorizer
13919    to iterate over.  */
13920 static void
13921 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
13922 {
13923   if (TARGET_SVE)
13924     sizes->safe_push (BYTES_PER_SVE_VECTOR);
13925   sizes->safe_push (16);
13926   sizes->safe_push (8);
13927 }
13928
13929 /* Implement TARGET_MANGLE_TYPE.  */
13930
13931 static const char *
13932 aarch64_mangle_type (const_tree type)
13933 {
13934   /* The AArch64 ABI documents say that "__va_list" has to be
13935      mangled as if it is in the "std" namespace.  */
13936   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
13937     return "St9__va_list";
13938
13939   /* Half-precision float.  */
13940   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
13941     return "Dh";
13942
13943   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
13944      builtin types.  */
13945   if (TYPE_NAME (type) != NULL)
13946     return aarch64_mangle_builtin_type (type);
13947
13948   /* Use the default mangling.  */
13949   return NULL;
13950 }
13951
13952 /* Find the first rtx_insn before insn that will generate an assembly
13953    instruction.  */
13954
13955 static rtx_insn *
13956 aarch64_prev_real_insn (rtx_insn *insn)
13957 {
13958   if (!insn)
13959     return NULL;
13960
13961   do
13962     {
13963       insn = prev_real_insn (insn);
13964     }
13965   while (insn && recog_memoized (insn) < 0);
13966
13967   return insn;
13968 }
13969
13970 static bool
13971 is_madd_op (enum attr_type t1)
13972 {
13973   unsigned int i;
13974   /* A number of these may be AArch32 only.  */
13975   enum attr_type mlatypes[] = {
13976     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13977     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13978     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13979   };
13980
13981   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13982     {
13983       if (t1 == mlatypes[i])
13984         return true;
13985     }
13986
13987   return false;
13988 }
13989
13990 /* Check if there is a register dependency between a load and the insn
13991    for which we hold recog_data.  */
13992
13993 static bool
13994 dep_between_memop_and_curr (rtx memop)
13995 {
13996   rtx load_reg;
13997   int opno;
13998
13999   gcc_assert (GET_CODE (memop) == SET);
14000
14001   if (!REG_P (SET_DEST (memop)))
14002     return false;
14003
14004   load_reg = SET_DEST (memop);
14005   for (opno = 1; opno < recog_data.n_operands; opno++)
14006     {
14007       rtx operand = recog_data.operand[opno];
14008       if (REG_P (operand)
14009           && reg_overlap_mentioned_p (load_reg, operand))
14010         return true;
14011
14012     }
14013   return false;
14014 }
14015
14016
14017 /* When working around the Cortex-A53 erratum 835769,
14018    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14019    instruction and has a preceding memory instruction such that a NOP
14020    should be inserted between them.  */
14021
14022 bool
14023 aarch64_madd_needs_nop (rtx_insn* insn)
14024 {
14025   enum attr_type attr_type;
14026   rtx_insn *prev;
14027   rtx body;
14028
14029   if (!TARGET_FIX_ERR_A53_835769)
14030     return false;
14031
14032   if (!INSN_P (insn) || recog_memoized (insn) < 0)
14033     return false;
14034
14035   attr_type = get_attr_type (insn);
14036   if (!is_madd_op (attr_type))
14037     return false;
14038
14039   prev = aarch64_prev_real_insn (insn);
14040   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14041      Restore recog state to INSN to avoid state corruption.  */
14042   extract_constrain_insn_cached (insn);
14043
14044   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14045     return false;
14046
14047   body = single_set (prev);
14048
14049   /* If the previous insn is a memory op and there is no dependency between
14050      it and the DImode madd, emit a NOP between them.  If body is NULL then we
14051      have a complex memory operation, probably a load/store pair.
14052      Be conservative for now and emit a NOP.  */
14053   if (GET_MODE (recog_data.operand[0]) == DImode
14054       && (!body || !dep_between_memop_and_curr (body)))
14055     return true;
14056
14057   return false;
14058
14059 }
14060
14061
14062 /* Implement FINAL_PRESCAN_INSN.  */
14063
14064 void
14065 aarch64_final_prescan_insn (rtx_insn *insn)
14066 {
14067   if (aarch64_madd_needs_nop (insn))
14068     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14069 }
14070
14071
14072 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14073    instruction.  */
14074
14075 bool
14076 aarch64_sve_index_immediate_p (rtx base_or_step)
14077 {
14078   return (CONST_INT_P (base_or_step)
14079           && IN_RANGE (INTVAL (base_or_step), -16, 15));
14080 }
14081
14082 /* Return true if X is a valid immediate for the SVE ADD and SUB
14083    instructions.  Negate X first if NEGATE_P is true.  */
14084
14085 bool
14086 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14087 {
14088   rtx elt;
14089
14090   if (!const_vec_duplicate_p (x, &elt)
14091       || !CONST_INT_P (elt))
14092     return false;
14093
14094   HOST_WIDE_INT val = INTVAL (elt);
14095   if (negate_p)
14096     val = -val;
14097   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14098
14099   if (val & 0xff)
14100     return IN_RANGE (val, 0, 0xff);
14101   return IN_RANGE (val, 0, 0xff00);
14102 }
14103
14104 /* Return true if X is a valid immediate operand for an SVE logical
14105    instruction such as AND.  */
14106
14107 bool
14108 aarch64_sve_bitmask_immediate_p (rtx x)
14109 {
14110   rtx elt;
14111
14112   return (const_vec_duplicate_p (x, &elt)
14113           && CONST_INT_P (elt)
14114           && aarch64_bitmask_imm (INTVAL (elt),
14115                                   GET_MODE_INNER (GET_MODE (x))));
14116 }
14117
14118 /* Return true if X is a valid immediate for the SVE DUP and CPY
14119    instructions.  */
14120
14121 bool
14122 aarch64_sve_dup_immediate_p (rtx x)
14123 {
14124   rtx elt;
14125
14126   if (!const_vec_duplicate_p (x, &elt)
14127       || !CONST_INT_P (elt))
14128     return false;
14129
14130   HOST_WIDE_INT val = INTVAL (elt);
14131   if (val & 0xff)
14132     return IN_RANGE (val, -0x80, 0x7f);
14133   return IN_RANGE (val, -0x8000, 0x7f00);
14134 }
14135
14136 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14137    SIGNED_P says whether the operand is signed rather than unsigned.  */
14138
14139 bool
14140 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14141 {
14142   rtx elt;
14143
14144   return (const_vec_duplicate_p (x, &elt)
14145           && CONST_INT_P (elt)
14146           && (signed_p
14147               ? IN_RANGE (INTVAL (elt), -16, 15)
14148               : IN_RANGE (INTVAL (elt), 0, 127)));
14149 }
14150
14151 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14152    instruction.  Negate X first if NEGATE_P is true.  */
14153
14154 bool
14155 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14156 {
14157   rtx elt;
14158   REAL_VALUE_TYPE r;
14159
14160   if (!const_vec_duplicate_p (x, &elt)
14161       || GET_CODE (elt) != CONST_DOUBLE)
14162     return false;
14163
14164   r = *CONST_DOUBLE_REAL_VALUE (elt);
14165
14166   if (negate_p)
14167     r = real_value_negate (&r);
14168
14169   if (real_equal (&r, &dconst1))
14170     return true;
14171   if (real_equal (&r, &dconsthalf))
14172     return true;
14173   return false;
14174 }
14175
14176 /* Return true if X is a valid immediate operand for an SVE FMUL
14177    instruction.  */
14178
14179 bool
14180 aarch64_sve_float_mul_immediate_p (rtx x)
14181 {
14182   rtx elt;
14183
14184   /* GCC will never generate a multiply with an immediate of 2, so there is no
14185      point testing for it (even though it is a valid constant).  */
14186   return (const_vec_duplicate_p (x, &elt)
14187           && GET_CODE (elt) == CONST_DOUBLE
14188           && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14189 }
14190
14191 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14192    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
14193    is nonnull, use it to describe valid immediates.  */
14194 static bool
14195 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14196                                     simd_immediate_info *info,
14197                                     enum simd_immediate_check which,
14198                                     simd_immediate_info::insn_type insn)
14199 {
14200   /* Try a 4-byte immediate with LSL.  */
14201   for (unsigned int shift = 0; shift < 32; shift += 8)
14202     if ((val32 & (0xff << shift)) == val32)
14203       {
14204         if (info)
14205           *info = simd_immediate_info (SImode, val32 >> shift, insn,
14206                                        simd_immediate_info::LSL, shift);
14207         return true;
14208       }
14209
14210   /* Try a 2-byte immediate with LSL.  */
14211   unsigned int imm16 = val32 & 0xffff;
14212   if (imm16 == (val32 >> 16))
14213     for (unsigned int shift = 0; shift < 16; shift += 8)
14214       if ((imm16 & (0xff << shift)) == imm16)
14215         {
14216           if (info)
14217             *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14218                                          simd_immediate_info::LSL, shift);
14219           return true;
14220         }
14221
14222   /* Try a 4-byte immediate with MSL, except for cases that MVN
14223      can handle.  */
14224   if (which == AARCH64_CHECK_MOV)
14225     for (unsigned int shift = 8; shift < 24; shift += 8)
14226       {
14227         unsigned int low = (1 << shift) - 1;
14228         if (((val32 & (0xff << shift)) | low) == val32)
14229           {
14230             if (info)
14231               *info = simd_immediate_info (SImode, val32 >> shift, insn,
14232                                            simd_immediate_info::MSL, shift);
14233             return true;
14234           }
14235       }
14236
14237   return false;
14238 }
14239
14240 /* Return true if replicating VAL64 is a valid immediate for the
14241    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
14242    use it to describe valid immediates.  */
14243 static bool
14244 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14245                                  simd_immediate_info *info,
14246                                  enum simd_immediate_check which)
14247 {
14248   unsigned int val32 = val64 & 0xffffffff;
14249   unsigned int val16 = val64 & 0xffff;
14250   unsigned int val8 = val64 & 0xff;
14251
14252   if (val32 == (val64 >> 32))
14253     {
14254       if ((which & AARCH64_CHECK_ORR) != 0
14255           && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14256                                                  simd_immediate_info::MOV))
14257         return true;
14258
14259       if ((which & AARCH64_CHECK_BIC) != 0
14260           && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14261                                                  simd_immediate_info::MVN))
14262         return true;
14263
14264       /* Try using a replicated byte.  */
14265       if (which == AARCH64_CHECK_MOV
14266           && val16 == (val32 >> 16)
14267           && val8 == (val16 >> 8))
14268         {
14269           if (info)
14270             *info = simd_immediate_info (QImode, val8);
14271           return true;
14272         }
14273     }
14274
14275   /* Try using a bit-to-bytemask.  */
14276   if (which == AARCH64_CHECK_MOV)
14277     {
14278       unsigned int i;
14279       for (i = 0; i < 64; i += 8)
14280         {
14281           unsigned char byte = (val64 >> i) & 0xff;
14282           if (byte != 0 && byte != 0xff)
14283             break;
14284         }
14285       if (i == 64)
14286         {
14287           if (info)
14288             *info = simd_immediate_info (DImode, val64);
14289           return true;
14290         }
14291     }
14292   return false;
14293 }
14294
14295 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14296    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
14297
14298 static bool
14299 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14300                              simd_immediate_info *info)
14301 {
14302   scalar_int_mode mode = DImode;
14303   unsigned int val32 = val64 & 0xffffffff;
14304   if (val32 == (val64 >> 32))
14305     {
14306       mode = SImode;
14307       unsigned int val16 = val32 & 0xffff;
14308       if (val16 == (val32 >> 16))
14309         {
14310           mode = HImode;
14311           unsigned int val8 = val16 & 0xff;
14312           if (val8 == (val16 >> 8))
14313             mode = QImode;
14314         }
14315     }
14316   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14317   if (IN_RANGE (val, -0x80, 0x7f))
14318     {
14319       /* DUP with no shift.  */
14320       if (info)
14321         *info = simd_immediate_info (mode, val);
14322       return true;
14323     }
14324   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14325     {
14326       /* DUP with LSL #8.  */
14327       if (info)
14328         *info = simd_immediate_info (mode, val);
14329       return true;
14330     }
14331   if (aarch64_bitmask_imm (val64, mode))
14332     {
14333       /* DUPM.  */
14334       if (info)
14335         *info = simd_immediate_info (mode, val);
14336       return true;
14337     }
14338   return false;
14339 }
14340
14341 /* Return true if OP is a valid SIMD immediate for the operation
14342    described by WHICH.  If INFO is nonnull, use it to describe valid
14343    immediates.  */
14344 bool
14345 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14346                               enum simd_immediate_check which)
14347 {
14348   machine_mode mode = GET_MODE (op);
14349   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14350   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14351     return false;
14352
14353   scalar_mode elt_mode = GET_MODE_INNER (mode);
14354   rtx base, step;
14355   unsigned int n_elts;
14356   if (GET_CODE (op) == CONST_VECTOR
14357       && CONST_VECTOR_DUPLICATE_P (op))
14358     n_elts = CONST_VECTOR_NPATTERNS (op);
14359   else if ((vec_flags & VEC_SVE_DATA)
14360            && const_vec_series_p (op, &base, &step))
14361     {
14362       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14363       if (!aarch64_sve_index_immediate_p (base)
14364           || !aarch64_sve_index_immediate_p (step))
14365         return false;
14366
14367       if (info)
14368         *info = simd_immediate_info (elt_mode, base, step);
14369       return true;
14370     }
14371   else if (GET_CODE (op) == CONST_VECTOR
14372            && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14373     /* N_ELTS set above.  */;
14374   else
14375     return false;
14376
14377   /* Handle PFALSE and PTRUE.  */
14378   if (vec_flags & VEC_SVE_PRED)
14379     return (op == CONST0_RTX (mode)
14380             || op == CONSTM1_RTX (mode));
14381
14382   scalar_float_mode elt_float_mode;
14383   if (n_elts == 1
14384       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14385     {
14386       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14387       if (aarch64_float_const_zero_rtx_p (elt)
14388           || aarch64_float_const_representable_p (elt))
14389         {
14390           if (info)
14391             *info = simd_immediate_info (elt_float_mode, elt);
14392           return true;
14393         }
14394     }
14395
14396   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14397   if (elt_size > 8)
14398     return false;
14399
14400   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14401
14402   /* Expand the vector constant out into a byte vector, with the least
14403      significant byte of the register first.  */
14404   auto_vec<unsigned char, 16> bytes;
14405   bytes.reserve (n_elts * elt_size);
14406   for (unsigned int i = 0; i < n_elts; i++)
14407     {
14408       /* The vector is provided in gcc endian-neutral fashion.
14409          For aarch64_be Advanced SIMD, it must be laid out in the vector
14410          register in reverse order.  */
14411       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14412       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14413
14414       if (elt_mode != elt_int_mode)
14415         elt = gen_lowpart (elt_int_mode, elt);
14416
14417       if (!CONST_INT_P (elt))
14418         return false;
14419
14420       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14421       for (unsigned int byte = 0; byte < elt_size; byte++)
14422         {
14423           bytes.quick_push (elt_val & 0xff);
14424           elt_val >>= BITS_PER_UNIT;
14425         }
14426     }
14427
14428   /* The immediate must repeat every eight bytes.  */
14429   unsigned int nbytes = bytes.length ();
14430   for (unsigned i = 8; i < nbytes; ++i)
14431     if (bytes[i] != bytes[i - 8])
14432       return false;
14433
14434   /* Get the repeating 8-byte value as an integer.  No endian correction
14435      is needed here because bytes is already in lsb-first order.  */
14436   unsigned HOST_WIDE_INT val64 = 0;
14437   for (unsigned int i = 0; i < 8; i++)
14438     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14439               << (i * BITS_PER_UNIT));
14440
14441   if (vec_flags & VEC_SVE_DATA)
14442     return aarch64_sve_valid_immediate (val64, info);
14443   else
14444     return aarch64_advsimd_valid_immediate (val64, info, which);
14445 }
14446
14447 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14448    has a step in the range of INDEX.  Return the index expression if so,
14449    otherwise return null.  */
14450 rtx
14451 aarch64_check_zero_based_sve_index_immediate (rtx x)
14452 {
14453   rtx base, step;
14454   if (const_vec_series_p (x, &base, &step)
14455       && base == const0_rtx
14456       && aarch64_sve_index_immediate_p (step))
14457     return step;
14458   return NULL_RTX;
14459 }
14460
14461 /* Check of immediate shift constants are within range.  */
14462 bool
14463 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14464 {
14465   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14466   if (left)
14467     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14468   else
14469     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14470 }
14471
14472 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14473    operation of width WIDTH at bit position POS.  */
14474
14475 rtx
14476 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14477 {
14478   gcc_assert (CONST_INT_P (width));
14479   gcc_assert (CONST_INT_P (pos));
14480
14481   unsigned HOST_WIDE_INT mask
14482     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14483   return GEN_INT (mask << UINTVAL (pos));
14484 }
14485
14486 bool
14487 aarch64_mov_operand_p (rtx x, machine_mode mode)
14488 {
14489   if (GET_CODE (x) == HIGH
14490       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14491     return true;
14492
14493   if (CONST_INT_P (x))
14494     return true;
14495
14496   if (VECTOR_MODE_P (GET_MODE (x)))
14497     return aarch64_simd_valid_immediate (x, NULL);
14498
14499   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14500     return true;
14501
14502   if (aarch64_sve_cnt_immediate_p (x))
14503     return true;
14504
14505   return aarch64_classify_symbolic_expression (x)
14506     == SYMBOL_TINY_ABSOLUTE;
14507 }
14508
14509 /* Return a const_int vector of VAL.  */
14510 rtx
14511 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14512 {
14513   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14514   return gen_const_vec_duplicate (mode, c);
14515 }
14516
14517 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
14518
14519 bool
14520 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14521 {
14522   machine_mode vmode;
14523
14524   vmode = aarch64_simd_container_mode (mode, 64);
14525   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14526   return aarch64_simd_valid_immediate (op_v, NULL);
14527 }
14528
14529 /* Construct and return a PARALLEL RTX vector with elements numbering the
14530    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14531    the vector - from the perspective of the architecture.  This does not
14532    line up with GCC's perspective on lane numbers, so we end up with
14533    different masks depending on our target endian-ness.  The diagram
14534    below may help.  We must draw the distinction when building masks
14535    which select one half of the vector.  An instruction selecting
14536    architectural low-lanes for a big-endian target, must be described using
14537    a mask selecting GCC high-lanes.
14538
14539                  Big-Endian             Little-Endian
14540
14541 GCC             0   1   2   3           3   2   1   0
14542               | x | x | x | x |       | x | x | x | x |
14543 Architecture    3   2   1   0           3   2   1   0
14544
14545 Low Mask:         { 2, 3 }                { 0, 1 }
14546 High Mask:        { 0, 1 }                { 2, 3 }
14547
14548    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
14549
14550 rtx
14551 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14552 {
14553   rtvec v = rtvec_alloc (nunits / 2);
14554   int high_base = nunits / 2;
14555   int low_base = 0;
14556   int base;
14557   rtx t1;
14558   int i;
14559
14560   if (BYTES_BIG_ENDIAN)
14561     base = high ? low_base : high_base;
14562   else
14563     base = high ? high_base : low_base;
14564
14565   for (i = 0; i < nunits / 2; i++)
14566     RTVEC_ELT (v, i) = GEN_INT (base + i);
14567
14568   t1 = gen_rtx_PARALLEL (mode, v);
14569   return t1;
14570 }
14571
14572 /* Check OP for validity as a PARALLEL RTX vector with elements
14573    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14574    from the perspective of the architecture.  See the diagram above
14575    aarch64_simd_vect_par_cnst_half for more details.  */
14576
14577 bool
14578 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14579                                        bool high)
14580 {
14581   int nelts;
14582   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14583     return false;
14584
14585   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14586   HOST_WIDE_INT count_op = XVECLEN (op, 0);
14587   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14588   int i = 0;
14589
14590   if (count_op != count_ideal)
14591     return false;
14592
14593   for (i = 0; i < count_ideal; i++)
14594     {
14595       rtx elt_op = XVECEXP (op, 0, i);
14596       rtx elt_ideal = XVECEXP (ideal, 0, i);
14597
14598       if (!CONST_INT_P (elt_op)
14599           || INTVAL (elt_ideal) != INTVAL (elt_op))
14600         return false;
14601     }
14602   return true;
14603 }
14604
14605 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
14606    HIGH (exclusive).  */
14607 void
14608 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14609                           const_tree exp)
14610 {
14611   HOST_WIDE_INT lane;
14612   gcc_assert (CONST_INT_P (operand));
14613   lane = INTVAL (operand);
14614
14615   if (lane < low || lane >= high)
14616   {
14617     if (exp)
14618       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14619     else
14620       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14621   }
14622 }
14623
14624 /* Peform endian correction on lane number N, which indexes a vector
14625    of mode MODE, and return the result as an SImode rtx.  */
14626
14627 rtx
14628 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14629 {
14630   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14631 }
14632
14633 /* Return TRUE if OP is a valid vector addressing mode.  */
14634
14635 bool
14636 aarch64_simd_mem_operand_p (rtx op)
14637 {
14638   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14639                         || REG_P (XEXP (op, 0)));
14640 }
14641
14642 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
14643
14644 bool
14645 aarch64_sve_ld1r_operand_p (rtx op)
14646 {
14647   struct aarch64_address_info addr;
14648   scalar_mode mode;
14649
14650   return (MEM_P (op)
14651           && is_a <scalar_mode> (GET_MODE (op), &mode)
14652           && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14653           && addr.type == ADDRESS_REG_IMM
14654           && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14655 }
14656
14657 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14658    The conditions for STR are the same.  */
14659 bool
14660 aarch64_sve_ldr_operand_p (rtx op)
14661 {
14662   struct aarch64_address_info addr;
14663
14664   return (MEM_P (op)
14665           && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14666                                        false, ADDR_QUERY_ANY)
14667           && addr.type == ADDRESS_REG_IMM);
14668 }
14669
14670 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14671    We need to be able to access the individual pieces, so the range
14672    is different from LD[234] and ST[234].  */
14673 bool
14674 aarch64_sve_struct_memory_operand_p (rtx op)
14675 {
14676   if (!MEM_P (op))
14677     return false;
14678
14679   machine_mode mode = GET_MODE (op);
14680   struct aarch64_address_info addr;
14681   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14682                                  ADDR_QUERY_ANY)
14683       || addr.type != ADDRESS_REG_IMM)
14684     return false;
14685
14686   poly_int64 first = addr.const_offset;
14687   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14688   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14689           && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14690 }
14691
14692 /* Emit a register copy from operand to operand, taking care not to
14693    early-clobber source registers in the process.
14694
14695    COUNT is the number of components into which the copy needs to be
14696    decomposed.  */
14697 void
14698 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14699                                 unsigned int count)
14700 {
14701   unsigned int i;
14702   int rdest = REGNO (operands[0]);
14703   int rsrc = REGNO (operands[1]);
14704
14705   if (!reg_overlap_mentioned_p (operands[0], operands[1])
14706       || rdest < rsrc)
14707     for (i = 0; i < count; i++)
14708       emit_move_insn (gen_rtx_REG (mode, rdest + i),
14709                       gen_rtx_REG (mode, rsrc + i));
14710   else
14711     for (i = 0; i < count; i++)
14712       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14713                       gen_rtx_REG (mode, rsrc + count - i - 1));
14714 }
14715
14716 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14717    one of VSTRUCT modes: OI, CI, or XI.  */
14718 int
14719 aarch64_simd_attr_length_rglist (machine_mode mode)
14720 {
14721   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
14722   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14723 }
14724
14725 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
14726    alignment of a vector to 128 bits.  SVE predicates have an alignment of
14727    16 bits.  */
14728 static HOST_WIDE_INT
14729 aarch64_simd_vector_alignment (const_tree type)
14730 {
14731   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14732     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14733        be set for non-predicate vectors of booleans.  Modes are the most
14734        direct way we have of identifying real SVE predicate types.  */
14735     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14736   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14737   return MIN (align, 128);
14738 }
14739
14740 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
14741 static poly_uint64
14742 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14743 {
14744   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14745     {
14746       /* If the length of the vector is fixed, try to align to that length,
14747          otherwise don't try to align at all.  */
14748       HOST_WIDE_INT result;
14749       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14750         result = TYPE_ALIGN (TREE_TYPE (type));
14751       return result;
14752     }
14753   return TYPE_ALIGN (type);
14754 }
14755
14756 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
14757 static bool
14758 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14759 {
14760   if (is_packed)
14761     return false;
14762
14763   /* For fixed-length vectors, check that the vectorizer will aim for
14764      full-vector alignment.  This isn't true for generic GCC vectors
14765      that are wider than the ABI maximum of 128 bits.  */
14766   poly_uint64 preferred_alignment =
14767     aarch64_vectorize_preferred_vector_alignment (type);
14768   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14769       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14770                    preferred_alignment))
14771     return false;
14772
14773   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
14774   return true;
14775 }
14776
14777 /* Return true if the vector misalignment factor is supported by the
14778    target.  */
14779 static bool
14780 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14781                                              const_tree type, int misalignment,
14782                                              bool is_packed)
14783 {
14784   if (TARGET_SIMD && STRICT_ALIGNMENT)
14785     {
14786       /* Return if movmisalign pattern is not supported for this mode.  */
14787       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14788         return false;
14789
14790       /* Misalignment factor is unknown at compile time.  */
14791       if (misalignment == -1)
14792         return false;
14793     }
14794   return default_builtin_support_vector_misalignment (mode, type, misalignment,
14795                                                       is_packed);
14796 }
14797
14798 /* If VALS is a vector constant that can be loaded into a register
14799    using DUP, generate instructions to do so and return an RTX to
14800    assign to the register.  Otherwise return NULL_RTX.  */
14801 static rtx
14802 aarch64_simd_dup_constant (rtx vals)
14803 {
14804   machine_mode mode = GET_MODE (vals);
14805   machine_mode inner_mode = GET_MODE_INNER (mode);
14806   rtx x;
14807
14808   if (!const_vec_duplicate_p (vals, &x))
14809     return NULL_RTX;
14810
14811   /* We can load this constant by using DUP and a constant in a
14812      single ARM register.  This will be cheaper than a vector
14813      load.  */
14814   x = copy_to_mode_reg (inner_mode, x);
14815   return gen_vec_duplicate (mode, x);
14816 }
14817
14818
14819 /* Generate code to load VALS, which is a PARALLEL containing only
14820    constants (for vec_init) or CONST_VECTOR, efficiently into a
14821    register.  Returns an RTX to copy into the register, or NULL_RTX
14822    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
14823 static rtx
14824 aarch64_simd_make_constant (rtx vals)
14825 {
14826   machine_mode mode = GET_MODE (vals);
14827   rtx const_dup;
14828   rtx const_vec = NULL_RTX;
14829   int n_const = 0;
14830   int i;
14831
14832   if (GET_CODE (vals) == CONST_VECTOR)
14833     const_vec = vals;
14834   else if (GET_CODE (vals) == PARALLEL)
14835     {
14836       /* A CONST_VECTOR must contain only CONST_INTs and
14837          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14838          Only store valid constants in a CONST_VECTOR.  */
14839       int n_elts = XVECLEN (vals, 0);
14840       for (i = 0; i < n_elts; ++i)
14841         {
14842           rtx x = XVECEXP (vals, 0, i);
14843           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14844             n_const++;
14845         }
14846       if (n_const == n_elts)
14847         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14848     }
14849   else
14850     gcc_unreachable ();
14851
14852   if (const_vec != NULL_RTX
14853       && aarch64_simd_valid_immediate (const_vec, NULL))
14854     /* Load using MOVI/MVNI.  */
14855     return const_vec;
14856   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14857     /* Loaded using DUP.  */
14858     return const_dup;
14859   else if (const_vec != NULL_RTX)
14860     /* Load from constant pool. We cannot take advantage of single-cycle
14861        LD1 because we need a PC-relative addressing mode.  */
14862     return const_vec;
14863   else
14864     /* A PARALLEL containing something not valid inside CONST_VECTOR.
14865        We cannot construct an initializer.  */
14866     return NULL_RTX;
14867 }
14868
14869 /* Expand a vector initialisation sequence, such that TARGET is
14870    initialised to contain VALS.  */
14871
14872 void
14873 aarch64_expand_vector_init (rtx target, rtx vals)
14874 {
14875   machine_mode mode = GET_MODE (target);
14876   scalar_mode inner_mode = GET_MODE_INNER (mode);
14877   /* The number of vector elements.  */
14878   int n_elts = XVECLEN (vals, 0);
14879   /* The number of vector elements which are not constant.  */
14880   int n_var = 0;
14881   rtx any_const = NULL_RTX;
14882   /* The first element of vals.  */
14883   rtx v0 = XVECEXP (vals, 0, 0);
14884   bool all_same = true;
14885
14886   /* Count the number of variable elements to initialise.  */
14887   for (int i = 0; i < n_elts; ++i)
14888     {
14889       rtx x = XVECEXP (vals, 0, i);
14890       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
14891         ++n_var;
14892       else
14893         any_const = x;
14894
14895       all_same &= rtx_equal_p (x, v0);
14896     }
14897
14898   /* No variable elements, hand off to aarch64_simd_make_constant which knows
14899      how best to handle this.  */
14900   if (n_var == 0)
14901     {
14902       rtx constant = aarch64_simd_make_constant (vals);
14903       if (constant != NULL_RTX)
14904         {
14905           emit_move_insn (target, constant);
14906           return;
14907         }
14908     }
14909
14910   /* Splat a single non-constant element if we can.  */
14911   if (all_same)
14912     {
14913       rtx x = copy_to_mode_reg (inner_mode, v0);
14914       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14915       return;
14916     }
14917
14918   enum insn_code icode = optab_handler (vec_set_optab, mode);
14919   gcc_assert (icode != CODE_FOR_nothing);
14920
14921   /* If there are only variable elements, try to optimize
14922      the insertion using dup for the most common element
14923      followed by insertions.  */
14924
14925   /* The algorithm will fill matches[*][0] with the earliest matching element,
14926      and matches[X][1] with the count of duplicate elements (if X is the
14927      earliest element which has duplicates).  */
14928
14929   if (n_var == n_elts && n_elts <= 16)
14930     {
14931       int matches[16][2] = {0};
14932       for (int i = 0; i < n_elts; i++)
14933         {
14934           for (int j = 0; j <= i; j++)
14935             {
14936               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
14937                 {
14938                   matches[i][0] = j;
14939                   matches[j][1]++;
14940                   break;
14941                 }
14942             }
14943         }
14944       int maxelement = 0;
14945       int maxv = 0;
14946       for (int i = 0; i < n_elts; i++)
14947         if (matches[i][1] > maxv)
14948           {
14949             maxelement = i;
14950             maxv = matches[i][1];
14951           }
14952
14953       /* Create a duplicate of the most common element, unless all elements
14954          are equally useless to us, in which case just immediately set the
14955          vector register using the first element.  */
14956
14957       if (maxv == 1)
14958         {
14959           /* For vectors of two 64-bit elements, we can do even better.  */
14960           if (n_elts == 2
14961               && (inner_mode == E_DImode
14962                   || inner_mode == E_DFmode))
14963
14964             {
14965               rtx x0 = XVECEXP (vals, 0, 0);
14966               rtx x1 = XVECEXP (vals, 0, 1);
14967               /* Combine can pick up this case, but handling it directly
14968                  here leaves clearer RTL.
14969
14970                  This is load_pair_lanes<mode>, and also gives us a clean-up
14971                  for store_pair_lanes<mode>.  */
14972               if (memory_operand (x0, inner_mode)
14973                   && memory_operand (x1, inner_mode)
14974                   && !STRICT_ALIGNMENT
14975                   && rtx_equal_p (XEXP (x1, 0),
14976                                   plus_constant (Pmode,
14977                                                  XEXP (x0, 0),
14978                                                  GET_MODE_SIZE (inner_mode))))
14979                 {
14980                   rtx t;
14981                   if (inner_mode == DFmode)
14982                     t = gen_load_pair_lanesdf (target, x0, x1);
14983                   else
14984                     t = gen_load_pair_lanesdi (target, x0, x1);
14985                   emit_insn (t);
14986                   return;
14987                 }
14988             }
14989           /* The subreg-move sequence below will move into lane zero of the
14990              vector register.  For big-endian we want that position to hold
14991              the last element of VALS.  */
14992           maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14993           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14994           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14995         }
14996       else
14997         {
14998           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14999           aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15000         }
15001
15002       /* Insert the rest.  */
15003       for (int i = 0; i < n_elts; i++)
15004         {
15005           rtx x = XVECEXP (vals, 0, i);
15006           if (matches[i][0] == maxelement)
15007             continue;
15008           x = copy_to_mode_reg (inner_mode, x);
15009           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15010         }
15011       return;
15012     }
15013
15014   /* Initialise a vector which is part-variable.  We want to first try
15015      to build those lanes which are constant in the most efficient way we
15016      can.  */
15017   if (n_var != n_elts)
15018     {
15019       rtx copy = copy_rtx (vals);
15020
15021       /* Load constant part of vector.  We really don't care what goes into the
15022          parts we will overwrite, but we're more likely to be able to load the
15023          constant efficiently if it has fewer, larger, repeating parts
15024          (see aarch64_simd_valid_immediate).  */
15025       for (int i = 0; i < n_elts; i++)
15026         {
15027           rtx x = XVECEXP (vals, 0, i);
15028           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15029             continue;
15030           rtx subst = any_const;
15031           for (int bit = n_elts / 2; bit > 0; bit /= 2)
15032             {
15033               /* Look in the copied vector, as more elements are const.  */
15034               rtx test = XVECEXP (copy, 0, i ^ bit);
15035               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15036                 {
15037                   subst = test;
15038                   break;
15039                 }
15040             }
15041           XVECEXP (copy, 0, i) = subst;
15042         }
15043       aarch64_expand_vector_init (target, copy);
15044     }
15045
15046   /* Insert the variable lanes directly.  */
15047   for (int i = 0; i < n_elts; i++)
15048     {
15049       rtx x = XVECEXP (vals, 0, i);
15050       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15051         continue;
15052       x = copy_to_mode_reg (inner_mode, x);
15053       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15054     }
15055 }
15056
15057 static unsigned HOST_WIDE_INT
15058 aarch64_shift_truncation_mask (machine_mode mode)
15059 {
15060   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15061     return 0;
15062   return GET_MODE_UNIT_BITSIZE (mode) - 1;
15063 }
15064
15065 /* Select a format to encode pointers in exception handling data.  */
15066 int
15067 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15068 {
15069    int type;
15070    switch (aarch64_cmodel)
15071      {
15072      case AARCH64_CMODEL_TINY:
15073      case AARCH64_CMODEL_TINY_PIC:
15074      case AARCH64_CMODEL_SMALL:
15075      case AARCH64_CMODEL_SMALL_PIC:
15076      case AARCH64_CMODEL_SMALL_SPIC:
15077        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
15078           for everything.  */
15079        type = DW_EH_PE_sdata4;
15080        break;
15081      default:
15082        /* No assumptions here.  8-byte relocs required.  */
15083        type = DW_EH_PE_sdata8;
15084        break;
15085      }
15086    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15087 }
15088
15089 /* The last .arch and .tune assembly strings that we printed.  */
15090 static std::string aarch64_last_printed_arch_string;
15091 static std::string aarch64_last_printed_tune_string;
15092
15093 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
15094    by the function fndecl.  */
15095
15096 void
15097 aarch64_declare_function_name (FILE *stream, const char* name,
15098                                 tree fndecl)
15099 {
15100   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15101
15102   struct cl_target_option *targ_options;
15103   if (target_parts)
15104     targ_options = TREE_TARGET_OPTION (target_parts);
15105   else
15106     targ_options = TREE_TARGET_OPTION (target_option_current_node);
15107   gcc_assert (targ_options);
15108
15109   const struct processor *this_arch
15110     = aarch64_get_arch (targ_options->x_explicit_arch);
15111
15112   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15113   std::string extension
15114     = aarch64_get_extension_string_for_isa_flags (isa_flags,
15115                                                   this_arch->flags);
15116   /* Only update the assembler .arch string if it is distinct from the last
15117      such string we printed.  */
15118   std::string to_print = this_arch->name + extension;
15119   if (to_print != aarch64_last_printed_arch_string)
15120     {
15121       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15122       aarch64_last_printed_arch_string = to_print;
15123     }
15124
15125   /* Print the cpu name we're tuning for in the comments, might be
15126      useful to readers of the generated asm.  Do it only when it changes
15127      from function to function and verbose assembly is requested.  */
15128   const struct processor *this_tune
15129     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15130
15131   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15132     {
15133       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15134                    this_tune->name);
15135       aarch64_last_printed_tune_string = this_tune->name;
15136     }
15137
15138   /* Don't forget the type directive for ELF.  */
15139   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15140   ASM_OUTPUT_LABEL (stream, name);
15141 }
15142
15143 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
15144
15145 static void
15146 aarch64_start_file (void)
15147 {
15148   struct cl_target_option *default_options
15149     = TREE_TARGET_OPTION (target_option_default_node);
15150
15151   const struct processor *default_arch
15152     = aarch64_get_arch (default_options->x_explicit_arch);
15153   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15154   std::string extension
15155     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15156                                                   default_arch->flags);
15157
15158    aarch64_last_printed_arch_string = default_arch->name + extension;
15159    aarch64_last_printed_tune_string = "";
15160    asm_fprintf (asm_out_file, "\t.arch %s\n",
15161                 aarch64_last_printed_arch_string.c_str ());
15162
15163    default_file_start ();
15164 }
15165
15166 /* Emit load exclusive.  */
15167
15168 static void
15169 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15170                              rtx mem, rtx model_rtx)
15171 {
15172   emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15173 }
15174
15175 /* Emit store exclusive.  */
15176
15177 static void
15178 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15179                               rtx rval, rtx mem, rtx model_rtx)
15180 {
15181   emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15182 }
15183
15184 /* Mark the previous jump instruction as unlikely.  */
15185
15186 static void
15187 aarch64_emit_unlikely_jump (rtx insn)
15188 {
15189   rtx_insn *jump = emit_jump_insn (insn);
15190   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15191 }
15192
15193 /* Expand a compare and swap pattern.  */
15194
15195 void
15196 aarch64_expand_compare_and_swap (rtx operands[])
15197 {
15198   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15199   machine_mode mode, r_mode;
15200
15201   bval = operands[0];
15202   rval = operands[1];
15203   mem = operands[2];
15204   oldval = operands[3];
15205   newval = operands[4];
15206   is_weak = operands[5];
15207   mod_s = operands[6];
15208   mod_f = operands[7];
15209   mode = GET_MODE (mem);
15210
15211   /* Normally the succ memory model must be stronger than fail, but in the
15212      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15213      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
15214   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15215       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15216     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15217
15218   r_mode = mode;
15219   if (mode == QImode || mode == HImode)
15220     {
15221       r_mode = SImode;
15222       rval = gen_reg_rtx (r_mode);
15223     }
15224
15225   if (TARGET_LSE)
15226     {
15227       /* The CAS insn requires oldval and rval overlap, but we need to
15228          have a copy of oldval saved across the operation to tell if
15229          the operation is successful.  */
15230       if (reg_overlap_mentioned_p (rval, oldval))
15231         rval = copy_to_mode_reg (r_mode, oldval);
15232       else
15233         emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15234
15235       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15236                                                    newval, mod_s));
15237       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15238     }
15239   else
15240     {
15241       /* The oldval predicate varies by mode.  Test it and force to reg.  */
15242       insn_code code = code_for_aarch64_compare_and_swap (mode);
15243       if (!insn_data[code].operand[2].predicate (oldval, mode))
15244         oldval = force_reg (mode, oldval);
15245
15246       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15247                                  is_weak, mod_s, mod_f));
15248       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15249     }
15250
15251   if (r_mode != mode)
15252     rval = gen_lowpart (mode, rval);
15253   emit_move_insn (operands[1], rval);
15254
15255   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15256   emit_insn (gen_rtx_SET (bval, x));
15257 }
15258
15259 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15260    sequence implementing an atomic operation.  */
15261
15262 static void
15263 aarch64_emit_post_barrier (enum memmodel model)
15264 {
15265   const enum memmodel base_model = memmodel_base (model);
15266
15267   if (is_mm_sync (model)
15268       && (base_model == MEMMODEL_ACQUIRE
15269           || base_model == MEMMODEL_ACQ_REL
15270           || base_model == MEMMODEL_SEQ_CST))
15271     {
15272       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15273     }
15274 }
15275
15276 /* Split a compare and swap pattern.  */
15277
15278 void
15279 aarch64_split_compare_and_swap (rtx operands[])
15280 {
15281   rtx rval, mem, oldval, newval, scratch;
15282   machine_mode mode;
15283   bool is_weak;
15284   rtx_code_label *label1, *label2;
15285   rtx x, cond;
15286   enum memmodel model;
15287   rtx model_rtx;
15288
15289   rval = operands[0];
15290   mem = operands[1];
15291   oldval = operands[2];
15292   newval = operands[3];
15293   is_weak = (operands[4] != const0_rtx);
15294   model_rtx = operands[5];
15295   scratch = operands[7];
15296   mode = GET_MODE (mem);
15297   model = memmodel_from_int (INTVAL (model_rtx));
15298
15299   /* When OLDVAL is zero and we want the strong version we can emit a tighter
15300     loop:
15301     .label1:
15302         LD[A]XR rval, [mem]
15303         CBNZ    rval, .label2
15304         ST[L]XR scratch, newval, [mem]
15305         CBNZ    scratch, .label1
15306     .label2:
15307         CMP     rval, 0.  */
15308   bool strong_zero_p = !is_weak && oldval == const0_rtx;
15309
15310   label1 = NULL;
15311   if (!is_weak)
15312     {
15313       label1 = gen_label_rtx ();
15314       emit_label (label1);
15315     }
15316   label2 = gen_label_rtx ();
15317
15318   /* The initial load can be relaxed for a __sync operation since a final
15319      barrier will be emitted to stop code hoisting.  */
15320   if (is_mm_sync (model))
15321     aarch64_emit_load_exclusive (mode, rval, mem,
15322                                  GEN_INT (MEMMODEL_RELAXED));
15323   else
15324     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15325
15326   if (strong_zero_p)
15327     {
15328       if (aarch64_track_speculation)
15329         {
15330           /* Emit an explicit compare instruction, so that we can correctly
15331              track the condition codes.  */
15332           rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15333           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15334         }
15335       else
15336         x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15337
15338       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15339                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15340       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15341     }
15342   else
15343     {
15344       cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15345       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15346       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15347                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15348       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15349     }
15350
15351   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15352
15353   if (!is_weak)
15354     {
15355       if (aarch64_track_speculation)
15356         {
15357           /* Emit an explicit compare instruction, so that we can correctly
15358              track the condition codes.  */
15359           rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15360           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15361         }
15362       else
15363         x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15364
15365       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15366                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15367       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15368     }
15369   else
15370     {
15371       cond = gen_rtx_REG (CCmode, CC_REGNUM);
15372       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15373       emit_insn (gen_rtx_SET (cond, x));
15374     }
15375
15376   emit_label (label2);
15377   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15378      to set the condition flags.  If this is not used it will be removed by
15379      later passes.  */
15380   if (strong_zero_p)
15381     {
15382       cond = gen_rtx_REG (CCmode, CC_REGNUM);
15383       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15384       emit_insn (gen_rtx_SET (cond, x));
15385     }
15386   /* Emit any final barrier needed for a __sync operation.  */
15387   if (is_mm_sync (model))
15388     aarch64_emit_post_barrier (model);
15389 }
15390
15391 /* Split an atomic operation.  */
15392
15393 void
15394 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15395                          rtx value, rtx model_rtx, rtx cond)
15396 {
15397   machine_mode mode = GET_MODE (mem);
15398   machine_mode wmode = (mode == DImode ? DImode : SImode);
15399   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15400   const bool is_sync = is_mm_sync (model);
15401   rtx_code_label *label;
15402   rtx x;
15403
15404   /* Split the atomic operation into a sequence.  */
15405   label = gen_label_rtx ();
15406   emit_label (label);
15407
15408   if (new_out)
15409     new_out = gen_lowpart (wmode, new_out);
15410   if (old_out)
15411     old_out = gen_lowpart (wmode, old_out);
15412   else
15413     old_out = new_out;
15414   value = simplify_gen_subreg (wmode, value, mode, 0);
15415
15416   /* The initial load can be relaxed for a __sync operation since a final
15417      barrier will be emitted to stop code hoisting.  */
15418  if (is_sync)
15419     aarch64_emit_load_exclusive (mode, old_out, mem,
15420                                  GEN_INT (MEMMODEL_RELAXED));
15421   else
15422     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15423
15424   switch (code)
15425     {
15426     case SET:
15427       new_out = value;
15428       break;
15429
15430     case NOT:
15431       x = gen_rtx_AND (wmode, old_out, value);
15432       emit_insn (gen_rtx_SET (new_out, x));
15433       x = gen_rtx_NOT (wmode, new_out);
15434       emit_insn (gen_rtx_SET (new_out, x));
15435       break;
15436
15437     case MINUS:
15438       if (CONST_INT_P (value))
15439         {
15440           value = GEN_INT (-INTVAL (value));
15441           code = PLUS;
15442         }
15443       /* Fall through.  */
15444
15445     default:
15446       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15447       emit_insn (gen_rtx_SET (new_out, x));
15448       break;
15449     }
15450
15451   aarch64_emit_store_exclusive (mode, cond, mem,
15452                                 gen_lowpart (mode, new_out), model_rtx);
15453
15454   if (aarch64_track_speculation)
15455     {
15456       /* Emit an explicit compare instruction, so that we can correctly
15457          track the condition codes.  */
15458       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15459       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15460     }
15461   else
15462     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15463
15464   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15465                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15466   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15467
15468   /* Emit any final barrier needed for a __sync operation.  */
15469   if (is_sync)
15470     aarch64_emit_post_barrier (model);
15471 }
15472
15473 static void
15474 aarch64_init_libfuncs (void)
15475 {
15476    /* Half-precision float operations.  The compiler handles all operations
15477      with NULL libfuncs by converting to SFmode.  */
15478
15479   /* Conversions.  */
15480   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15481   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15482
15483   /* Arithmetic.  */
15484   set_optab_libfunc (add_optab, HFmode, NULL);
15485   set_optab_libfunc (sdiv_optab, HFmode, NULL);
15486   set_optab_libfunc (smul_optab, HFmode, NULL);
15487   set_optab_libfunc (neg_optab, HFmode, NULL);
15488   set_optab_libfunc (sub_optab, HFmode, NULL);
15489
15490   /* Comparisons.  */
15491   set_optab_libfunc (eq_optab, HFmode, NULL);
15492   set_optab_libfunc (ne_optab, HFmode, NULL);
15493   set_optab_libfunc (lt_optab, HFmode, NULL);
15494   set_optab_libfunc (le_optab, HFmode, NULL);
15495   set_optab_libfunc (ge_optab, HFmode, NULL);
15496   set_optab_libfunc (gt_optab, HFmode, NULL);
15497   set_optab_libfunc (unord_optab, HFmode, NULL);
15498 }
15499
15500 /* Target hook for c_mode_for_suffix.  */
15501 static machine_mode
15502 aarch64_c_mode_for_suffix (char suffix)
15503 {
15504   if (suffix == 'q')
15505     return TFmode;
15506
15507   return VOIDmode;
15508 }
15509
15510 /* We can only represent floating point constants which will fit in
15511    "quarter-precision" values.  These values are characterised by
15512    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
15513    by:
15514
15515    (-1)^s * (n/16) * 2^r
15516
15517    Where:
15518      's' is the sign bit.
15519      'n' is an integer in the range 16 <= n <= 31.
15520      'r' is an integer in the range -3 <= r <= 4.  */
15521
15522 /* Return true iff X can be represented by a quarter-precision
15523    floating point immediate operand X.  Note, we cannot represent 0.0.  */
15524 bool
15525 aarch64_float_const_representable_p (rtx x)
15526 {
15527   /* This represents our current view of how many bits
15528      make up the mantissa.  */
15529   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15530   int exponent;
15531   unsigned HOST_WIDE_INT mantissa, mask;
15532   REAL_VALUE_TYPE r, m;
15533   bool fail;
15534
15535   if (!CONST_DOUBLE_P (x))
15536     return false;
15537
15538   if (GET_MODE (x) == VOIDmode
15539       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15540     return false;
15541
15542   r = *CONST_DOUBLE_REAL_VALUE (x);
15543
15544   /* We cannot represent infinities, NaNs or +/-zero.  We won't
15545      know if we have +zero until we analyse the mantissa, but we
15546      can reject the other invalid values.  */
15547   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15548       || REAL_VALUE_MINUS_ZERO (r))
15549     return false;
15550
15551   /* Extract exponent.  */
15552   r = real_value_abs (&r);
15553   exponent = REAL_EXP (&r);
15554
15555   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15556      highest (sign) bit, with a fixed binary point at bit point_pos.
15557      m1 holds the low part of the mantissa, m2 the high part.
15558      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15559      bits for the mantissa, this can fail (low bits will be lost).  */
15560   real_ldexp (&m, &r, point_pos - exponent);
15561   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15562
15563   /* If the low part of the mantissa has bits set we cannot represent
15564      the value.  */
15565   if (w.ulow () != 0)
15566     return false;
15567   /* We have rejected the lower HOST_WIDE_INT, so update our
15568      understanding of how many bits lie in the mantissa and
15569      look only at the high HOST_WIDE_INT.  */
15570   mantissa = w.elt (1);
15571   point_pos -= HOST_BITS_PER_WIDE_INT;
15572
15573   /* We can only represent values with a mantissa of the form 1.xxxx.  */
15574   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15575   if ((mantissa & mask) != 0)
15576     return false;
15577
15578   /* Having filtered unrepresentable values, we may now remove all
15579      but the highest 5 bits.  */
15580   mantissa >>= point_pos - 5;
15581
15582   /* We cannot represent the value 0.0, so reject it.  This is handled
15583      elsewhere.  */
15584   if (mantissa == 0)
15585     return false;
15586
15587   /* Then, as bit 4 is always set, we can mask it off, leaving
15588      the mantissa in the range [0, 15].  */
15589   mantissa &= ~(1 << 4);
15590   gcc_assert (mantissa <= 15);
15591
15592   /* GCC internally does not use IEEE754-like encoding (where normalized
15593      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
15594      Our mantissa values are shifted 4 places to the left relative to
15595      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15596      by 5 places to correct for GCC's representation.  */
15597   exponent = 5 - exponent;
15598
15599   return (exponent >= 0 && exponent <= 7);
15600 }
15601
15602 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15603    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
15604    output MOVI/MVNI, ORR or BIC immediate.  */
15605 char*
15606 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15607                                    enum simd_immediate_check which)
15608 {
15609   bool is_valid;
15610   static char templ[40];
15611   const char *mnemonic;
15612   const char *shift_op;
15613   unsigned int lane_count = 0;
15614   char element_char;
15615
15616   struct simd_immediate_info info;
15617
15618   /* This will return true to show const_vector is legal for use as either
15619      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15620      It will also update INFO to show how the immediate should be generated.
15621      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
15622   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15623   gcc_assert (is_valid);
15624
15625   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15626   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15627
15628   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15629     {
15630       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15631       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15632          move immediate path.  */
15633       if (aarch64_float_const_zero_rtx_p (info.value))
15634         info.value = GEN_INT (0);
15635       else
15636         {
15637           const unsigned int buf_size = 20;
15638           char float_buf[buf_size] = {'\0'};
15639           real_to_decimal_for_mode (float_buf,
15640                                     CONST_DOUBLE_REAL_VALUE (info.value),
15641                                     buf_size, buf_size, 1, info.elt_mode);
15642
15643           if (lane_count == 1)
15644             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15645           else
15646             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15647                       lane_count, element_char, float_buf);
15648           return templ;
15649         }
15650     }
15651
15652   gcc_assert (CONST_INT_P (info.value));
15653
15654   if (which == AARCH64_CHECK_MOV)
15655     {
15656       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15657       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15658       if (lane_count == 1)
15659         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15660                   mnemonic, UINTVAL (info.value));
15661       else if (info.shift)
15662         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15663                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15664                   element_char, UINTVAL (info.value), shift_op, info.shift);
15665       else
15666         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15667                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15668                   element_char, UINTVAL (info.value));
15669     }
15670   else
15671     {
15672       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
15673       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15674       if (info.shift)
15675         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15676                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15677                   element_char, UINTVAL (info.value), "lsl", info.shift);
15678       else
15679         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15680                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15681                   element_char, UINTVAL (info.value));
15682     }
15683   return templ;
15684 }
15685
15686 char*
15687 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15688 {
15689
15690   /* If a floating point number was passed and we desire to use it in an
15691      integer mode do the conversion to integer.  */
15692   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15693     {
15694       unsigned HOST_WIDE_INT ival;
15695       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15696           gcc_unreachable ();
15697       immediate = gen_int_mode (ival, mode);
15698     }
15699
15700   machine_mode vmode;
15701   /* use a 64 bit mode for everything except for DI/DF mode, where we use
15702      a 128 bit vector mode.  */
15703   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15704
15705   vmode = aarch64_simd_container_mode (mode, width);
15706   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15707   return aarch64_output_simd_mov_immediate (v_op, width);
15708 }
15709
15710 /* Return the output string to use for moving immediate CONST_VECTOR
15711    into an SVE register.  */
15712
15713 char *
15714 aarch64_output_sve_mov_immediate (rtx const_vector)
15715 {
15716   static char templ[40];
15717   struct simd_immediate_info info;
15718   char element_char;
15719
15720   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15721   gcc_assert (is_valid);
15722
15723   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15724
15725   if (info.step)
15726     {
15727       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15728                 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15729                 element_char, INTVAL (info.value), INTVAL (info.step));
15730       return templ;
15731     }
15732
15733   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15734     {
15735       if (aarch64_float_const_zero_rtx_p (info.value))
15736         info.value = GEN_INT (0);
15737       else
15738         {
15739           const int buf_size = 20;
15740           char float_buf[buf_size] = {};
15741           real_to_decimal_for_mode (float_buf,
15742                                     CONST_DOUBLE_REAL_VALUE (info.value),
15743                                     buf_size, buf_size, 1, info.elt_mode);
15744
15745           snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15746                     element_char, float_buf);
15747           return templ;
15748         }
15749     }
15750
15751   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15752             element_char, INTVAL (info.value));
15753   return templ;
15754 }
15755
15756 /* Return the asm format for a PTRUE instruction whose destination has
15757    mode MODE.  SUFFIX is the element size suffix.  */
15758
15759 char *
15760 aarch64_output_ptrue (machine_mode mode, char suffix)
15761 {
15762   unsigned int nunits;
15763   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15764   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15765     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15766   else
15767     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15768   return buf;
15769 }
15770
15771 /* Split operands into moves from op[1] + op[2] into op[0].  */
15772
15773 void
15774 aarch64_split_combinev16qi (rtx operands[3])
15775 {
15776   unsigned int dest = REGNO (operands[0]);
15777   unsigned int src1 = REGNO (operands[1]);
15778   unsigned int src2 = REGNO (operands[2]);
15779   machine_mode halfmode = GET_MODE (operands[1]);
15780   unsigned int halfregs = REG_NREGS (operands[1]);
15781   rtx destlo, desthi;
15782
15783   gcc_assert (halfmode == V16QImode);
15784
15785   if (src1 == dest && src2 == dest + halfregs)
15786     {
15787       /* No-op move.  Can't split to nothing; emit something.  */
15788       emit_note (NOTE_INSN_DELETED);
15789       return;
15790     }
15791
15792   /* Preserve register attributes for variable tracking.  */
15793   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15794   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15795                                GET_MODE_SIZE (halfmode));
15796
15797   /* Special case of reversed high/low parts.  */
15798   if (reg_overlap_mentioned_p (operands[2], destlo)
15799       && reg_overlap_mentioned_p (operands[1], desthi))
15800     {
15801       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15802       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15803       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15804     }
15805   else if (!reg_overlap_mentioned_p (operands[2], destlo))
15806     {
15807       /* Try to avoid unnecessary moves if part of the result
15808          is in the right place already.  */
15809       if (src1 != dest)
15810         emit_move_insn (destlo, operands[1]);
15811       if (src2 != dest + halfregs)
15812         emit_move_insn (desthi, operands[2]);
15813     }
15814   else
15815     {
15816       if (src2 != dest + halfregs)
15817         emit_move_insn (desthi, operands[2]);
15818       if (src1 != dest)
15819         emit_move_insn (destlo, operands[1]);
15820     }
15821 }
15822
15823 /* vec_perm support.  */
15824
15825 struct expand_vec_perm_d
15826 {
15827   rtx target, op0, op1;
15828   vec_perm_indices perm;
15829   machine_mode vmode;
15830   unsigned int vec_flags;
15831   bool one_vector_p;
15832   bool testing_p;
15833 };
15834
15835 /* Generate a variable permutation.  */
15836
15837 static void
15838 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15839 {
15840   machine_mode vmode = GET_MODE (target);
15841   bool one_vector_p = rtx_equal_p (op0, op1);
15842
15843   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15844   gcc_checking_assert (GET_MODE (op0) == vmode);
15845   gcc_checking_assert (GET_MODE (op1) == vmode);
15846   gcc_checking_assert (GET_MODE (sel) == vmode);
15847   gcc_checking_assert (TARGET_SIMD);
15848
15849   if (one_vector_p)
15850     {
15851       if (vmode == V8QImode)
15852         {
15853           /* Expand the argument to a V16QI mode by duplicating it.  */
15854           rtx pair = gen_reg_rtx (V16QImode);
15855           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15856           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15857         }
15858       else
15859         {
15860           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15861         }
15862     }
15863   else
15864     {
15865       rtx pair;
15866
15867       if (vmode == V8QImode)
15868         {
15869           pair = gen_reg_rtx (V16QImode);
15870           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15871           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15872         }
15873       else
15874         {
15875           pair = gen_reg_rtx (OImode);
15876           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15877           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15878         }
15879     }
15880 }
15881
15882 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15883    NELT is the number of elements in the vector.  */
15884
15885 void
15886 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15887                          unsigned int nelt)
15888 {
15889   machine_mode vmode = GET_MODE (target);
15890   bool one_vector_p = rtx_equal_p (op0, op1);
15891   rtx mask;
15892
15893   /* The TBL instruction does not use a modulo index, so we must take care
15894      of that ourselves.  */
15895   mask = aarch64_simd_gen_const_vector_dup (vmode,
15896       one_vector_p ? nelt - 1 : 2 * nelt - 1);
15897   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15898
15899   /* For big-endian, we also need to reverse the index within the vector
15900      (but not which vector).  */
15901   if (BYTES_BIG_ENDIAN)
15902     {
15903       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
15904       if (!one_vector_p)
15905         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15906       sel = expand_simple_binop (vmode, XOR, sel, mask,
15907                                  NULL, 0, OPTAB_LIB_WIDEN);
15908     }
15909   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15910 }
15911
15912 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
15913
15914 static void
15915 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15916 {
15917   emit_insn (gen_rtx_SET (target,
15918                           gen_rtx_UNSPEC (GET_MODE (target),
15919                                           gen_rtvec (2, op0, op1), code)));
15920 }
15921
15922 /* Expand an SVE vec_perm with the given operands.  */
15923
15924 void
15925 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15926 {
15927   machine_mode data_mode = GET_MODE (target);
15928   machine_mode sel_mode = GET_MODE (sel);
15929   /* Enforced by the pattern condition.  */
15930   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15931
15932   /* Note: vec_perm indices are supposed to wrap when they go beyond the
15933      size of the two value vectors, i.e. the upper bits of the indices
15934      are effectively ignored.  SVE TBL instead produces 0 for any
15935      out-of-range indices, so we need to modulo all the vec_perm indices
15936      to ensure they are all in range.  */
15937   rtx sel_reg = force_reg (sel_mode, sel);
15938
15939   /* Check if the sel only references the first values vector.  */
15940   if (GET_CODE (sel) == CONST_VECTOR
15941       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15942     {
15943       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15944       return;
15945     }
15946
15947   /* Check if the two values vectors are the same.  */
15948   if (rtx_equal_p (op0, op1))
15949     {
15950       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15951       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15952                                          NULL, 0, OPTAB_DIRECT);
15953       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15954       return;
15955     }
15956
15957   /* Run TBL on for each value vector and combine the results.  */
15958
15959   rtx res0 = gen_reg_rtx (data_mode);
15960   rtx res1 = gen_reg_rtx (data_mode);
15961   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15962   if (GET_CODE (sel) != CONST_VECTOR
15963       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15964     {
15965       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15966                                                        2 * nunits - 1);
15967       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15968                                      NULL, 0, OPTAB_DIRECT);
15969     }
15970   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15971   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15972                                      NULL, 0, OPTAB_DIRECT);
15973   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15974   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15975     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15976   else
15977     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15978 }
15979
15980 /* Recognize patterns suitable for the TRN instructions.  */
15981 static bool
15982 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15983 {
15984   HOST_WIDE_INT odd;
15985   poly_uint64 nelt = d->perm.length ();
15986   rtx out, in0, in1, x;
15987   machine_mode vmode = d->vmode;
15988
15989   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15990     return false;
15991
15992   /* Note that these are little-endian tests.
15993      We correct for big-endian later.  */
15994   if (!d->perm[0].is_constant (&odd)
15995       || (odd != 0 && odd != 1)
15996       || !d->perm.series_p (0, 2, odd, 2)
15997       || !d->perm.series_p (1, 2, nelt + odd, 2))
15998     return false;
15999
16000   /* Success!  */
16001   if (d->testing_p)
16002     return true;
16003
16004   in0 = d->op0;
16005   in1 = d->op1;
16006   /* We don't need a big-endian lane correction for SVE; see the comment
16007      at the head of aarch64-sve.md for details.  */
16008   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16009     {
16010       x = in0, in0 = in1, in1 = x;
16011       odd = !odd;
16012     }
16013   out = d->target;
16014
16015   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16016                                       odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16017   return true;
16018 }
16019
16020 /* Recognize patterns suitable for the UZP instructions.  */
16021 static bool
16022 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16023 {
16024   HOST_WIDE_INT odd;
16025   rtx out, in0, in1, x;
16026   machine_mode vmode = d->vmode;
16027
16028   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16029     return false;
16030
16031   /* Note that these are little-endian tests.
16032      We correct for big-endian later.  */
16033   if (!d->perm[0].is_constant (&odd)
16034       || (odd != 0 && odd != 1)
16035       || !d->perm.series_p (0, 1, odd, 2))
16036     return false;
16037
16038   /* Success!  */
16039   if (d->testing_p)
16040     return true;
16041
16042   in0 = d->op0;
16043   in1 = d->op1;
16044   /* We don't need a big-endian lane correction for SVE; see the comment
16045      at the head of aarch64-sve.md for details.  */
16046   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16047     {
16048       x = in0, in0 = in1, in1 = x;
16049       odd = !odd;
16050     }
16051   out = d->target;
16052
16053   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16054                                       odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16055   return true;
16056 }
16057
16058 /* Recognize patterns suitable for the ZIP instructions.  */
16059 static bool
16060 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16061 {
16062   unsigned int high;
16063   poly_uint64 nelt = d->perm.length ();
16064   rtx out, in0, in1, x;
16065   machine_mode vmode = d->vmode;
16066
16067   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16068     return false;
16069
16070   /* Note that these are little-endian tests.
16071      We correct for big-endian later.  */
16072   poly_uint64 first = d->perm[0];
16073   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16074       || !d->perm.series_p (0, 2, first, 1)
16075       || !d->perm.series_p (1, 2, first + nelt, 1))
16076     return false;
16077   high = maybe_ne (first, 0U);
16078
16079   /* Success!  */
16080   if (d->testing_p)
16081     return true;
16082
16083   in0 = d->op0;
16084   in1 = d->op1;
16085   /* We don't need a big-endian lane correction for SVE; see the comment
16086      at the head of aarch64-sve.md for details.  */
16087   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16088     {
16089       x = in0, in0 = in1, in1 = x;
16090       high = !high;
16091     }
16092   out = d->target;
16093
16094   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16095                                       high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16096   return true;
16097 }
16098
16099 /* Recognize patterns for the EXT insn.  */
16100
16101 static bool
16102 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16103 {
16104   HOST_WIDE_INT location;
16105   rtx offset;
16106
16107   /* The first element always refers to the first vector.
16108      Check if the extracted indices are increasing by one.  */
16109   if (d->vec_flags == VEC_SVE_PRED
16110       || !d->perm[0].is_constant (&location)
16111       || !d->perm.series_p (0, 1, location, 1))
16112     return false;
16113
16114   /* Success! */
16115   if (d->testing_p)
16116     return true;
16117
16118   /* The case where (location == 0) is a no-op for both big- and little-endian,
16119      and is removed by the mid-end at optimization levels -O1 and higher.
16120
16121      We don't need a big-endian lane correction for SVE; see the comment
16122      at the head of aarch64-sve.md for details.  */
16123   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16124     {
16125       /* After setup, we want the high elements of the first vector (stored
16126          at the LSB end of the register), and the low elements of the second
16127          vector (stored at the MSB end of the register). So swap.  */
16128       std::swap (d->op0, d->op1);
16129       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16130          to_constant () is safe since this is restricted to Advanced SIMD
16131          vectors.  */
16132       location = d->perm.length ().to_constant () - location;
16133     }
16134
16135   offset = GEN_INT (location);
16136   emit_set_insn (d->target,
16137                  gen_rtx_UNSPEC (d->vmode,
16138                                  gen_rtvec (3, d->op0, d->op1, offset),
16139                                  UNSPEC_EXT));
16140   return true;
16141 }
16142
16143 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16144    within each 64-bit, 32-bit or 16-bit granule.  */
16145
16146 static bool
16147 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16148 {
16149   HOST_WIDE_INT diff;
16150   unsigned int i, size, unspec;
16151   machine_mode pred_mode;
16152
16153   if (d->vec_flags == VEC_SVE_PRED
16154       || !d->one_vector_p
16155       || !d->perm[0].is_constant (&diff))
16156     return false;
16157
16158   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16159   if (size == 8)
16160     {
16161       unspec = UNSPEC_REV64;
16162       pred_mode = VNx2BImode;
16163     }
16164   else if (size == 4)
16165     {
16166       unspec = UNSPEC_REV32;
16167       pred_mode = VNx4BImode;
16168     }
16169   else if (size == 2)
16170     {
16171       unspec = UNSPEC_REV16;
16172       pred_mode = VNx8BImode;
16173     }
16174   else
16175     return false;
16176
16177   unsigned int step = diff + 1;
16178   for (i = 0; i < step; ++i)
16179     if (!d->perm.series_p (i, step, diff - i, step))
16180       return false;
16181
16182   /* Success! */
16183   if (d->testing_p)
16184     return true;
16185
16186   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16187   if (d->vec_flags == VEC_SVE_DATA)
16188     {
16189       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16190       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16191                             UNSPEC_MERGE_PTRUE);
16192     }
16193   emit_set_insn (d->target, src);
16194   return true;
16195 }
16196
16197 /* Recognize patterns for the REV insn, which reverses elements within
16198    a full vector.  */
16199
16200 static bool
16201 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16202 {
16203   poly_uint64 nelt = d->perm.length ();
16204
16205   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16206     return false;
16207
16208   if (!d->perm.series_p (0, 1, nelt - 1, -1))
16209     return false;
16210
16211   /* Success! */
16212   if (d->testing_p)
16213     return true;
16214
16215   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16216   emit_set_insn (d->target, src);
16217   return true;
16218 }
16219
16220 static bool
16221 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16222 {
16223   rtx out = d->target;
16224   rtx in0;
16225   HOST_WIDE_INT elt;
16226   machine_mode vmode = d->vmode;
16227   rtx lane;
16228
16229   if (d->vec_flags == VEC_SVE_PRED
16230       || d->perm.encoding ().encoded_nelts () != 1
16231       || !d->perm[0].is_constant (&elt))
16232     return false;
16233
16234   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16235     return false;
16236
16237   /* Success! */
16238   if (d->testing_p)
16239     return true;
16240
16241   /* The generic preparation in aarch64_expand_vec_perm_const_1
16242      swaps the operand order and the permute indices if it finds
16243      d->perm[0] to be in the second operand.  Thus, we can always
16244      use d->op0 and need not do any extra arithmetic to get the
16245      correct lane number.  */
16246   in0 = d->op0;
16247   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
16248
16249   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16250   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16251   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16252   return true;
16253 }
16254
16255 static bool
16256 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16257 {
16258   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16259   machine_mode vmode = d->vmode;
16260
16261   /* Make sure that the indices are constant.  */
16262   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16263   for (unsigned int i = 0; i < encoded_nelts; ++i)
16264     if (!d->perm[i].is_constant ())
16265       return false;
16266
16267   if (d->testing_p)
16268     return true;
16269
16270   /* Generic code will try constant permutation twice.  Once with the
16271      original mode and again with the elements lowered to QImode.
16272      So wait and don't do the selector expansion ourselves.  */
16273   if (vmode != V8QImode && vmode != V16QImode)
16274     return false;
16275
16276   /* to_constant is safe since this routine is specific to Advanced SIMD
16277      vectors.  */
16278   unsigned int nelt = d->perm.length ().to_constant ();
16279   for (unsigned int i = 0; i < nelt; ++i)
16280     /* If big-endian and two vectors we end up with a weird mixed-endian
16281        mode on NEON.  Reverse the index within each word but not the word
16282        itself.  to_constant is safe because we checked is_constant above.  */
16283     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16284                         ? d->perm[i].to_constant () ^ (nelt - 1)
16285                         : d->perm[i].to_constant ());
16286
16287   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16288   sel = force_reg (vmode, sel);
16289
16290   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16291   return true;
16292 }
16293
16294 /* Try to implement D using an SVE TBL instruction.  */
16295
16296 static bool
16297 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16298 {
16299   unsigned HOST_WIDE_INT nelt;
16300
16301   /* Permuting two variable-length vectors could overflow the
16302      index range.  */
16303   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16304     return false;
16305
16306   if (d->testing_p)
16307     return true;
16308
16309   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16310   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16311   if (d->one_vector_p)
16312     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16313   else
16314     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16315   return true;
16316 }
16317
16318 static bool
16319 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16320 {
16321   /* The pattern matching functions above are written to look for a small
16322      number to begin the sequence (0, 1, N/2).  If we begin with an index
16323      from the second operand, we can swap the operands.  */
16324   poly_int64 nelt = d->perm.length ();
16325   if (known_ge (d->perm[0], nelt))
16326     {
16327       d->perm.rotate_inputs (1);
16328       std::swap (d->op0, d->op1);
16329     }
16330
16331   if ((d->vec_flags == VEC_ADVSIMD
16332        || d->vec_flags == VEC_SVE_DATA
16333        || d->vec_flags == VEC_SVE_PRED)
16334       && known_gt (nelt, 1))
16335     {
16336       if (aarch64_evpc_rev_local (d))
16337         return true;
16338       else if (aarch64_evpc_rev_global (d))
16339         return true;
16340       else if (aarch64_evpc_ext (d))
16341         return true;
16342       else if (aarch64_evpc_dup (d))
16343         return true;
16344       else if (aarch64_evpc_zip (d))
16345         return true;
16346       else if (aarch64_evpc_uzp (d))
16347         return true;
16348       else if (aarch64_evpc_trn (d))
16349         return true;
16350       if (d->vec_flags == VEC_SVE_DATA)
16351         return aarch64_evpc_sve_tbl (d);
16352       else if (d->vec_flags == VEC_ADVSIMD)
16353         return aarch64_evpc_tbl (d);
16354     }
16355   return false;
16356 }
16357
16358 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
16359
16360 static bool
16361 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16362                                   rtx op1, const vec_perm_indices &sel)
16363 {
16364   struct expand_vec_perm_d d;
16365
16366   /* Check whether the mask can be applied to a single vector.  */
16367   if (sel.ninputs () == 1
16368       || (op0 && rtx_equal_p (op0, op1)))
16369     d.one_vector_p = true;
16370   else if (sel.all_from_input_p (0))
16371     {
16372       d.one_vector_p = true;
16373       op1 = op0;
16374     }
16375   else if (sel.all_from_input_p (1))
16376     {
16377       d.one_vector_p = true;
16378       op0 = op1;
16379     }
16380   else
16381     d.one_vector_p = false;
16382
16383   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16384                      sel.nelts_per_input ());
16385   d.vmode = vmode;
16386   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16387   d.target = target;
16388   d.op0 = op0;
16389   d.op1 = op1;
16390   d.testing_p = !target;
16391
16392   if (!d.testing_p)
16393     return aarch64_expand_vec_perm_const_1 (&d);
16394
16395   rtx_insn *last = get_last_insn ();
16396   bool ret = aarch64_expand_vec_perm_const_1 (&d);
16397   gcc_assert (last == get_last_insn ());
16398
16399   return ret;
16400 }
16401
16402 /* Generate a byte permute mask for a register of mode MODE,
16403    which has NUNITS units.  */
16404
16405 rtx
16406 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16407 {
16408   /* We have to reverse each vector because we dont have
16409      a permuted load that can reverse-load according to ABI rules.  */
16410   rtx mask;
16411   rtvec v = rtvec_alloc (16);
16412   unsigned int i, j;
16413   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16414
16415   gcc_assert (BYTES_BIG_ENDIAN);
16416   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16417
16418   for (i = 0; i < nunits; i++)
16419     for (j = 0; j < usize; j++)
16420       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16421   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16422   return force_reg (V16QImode, mask);
16423 }
16424
16425 /* Return true if X is a valid second operand for the SVE instruction
16426    that implements integer comparison OP_CODE.  */
16427
16428 static bool
16429 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16430 {
16431   if (register_operand (x, VOIDmode))
16432     return true;
16433
16434   switch (op_code)
16435     {
16436     case LTU:
16437     case LEU:
16438     case GEU:
16439     case GTU:
16440       return aarch64_sve_cmp_immediate_p (x, false);
16441     case LT:
16442     case LE:
16443     case GE:
16444     case GT:
16445     case NE:
16446     case EQ:
16447       return aarch64_sve_cmp_immediate_p (x, true);
16448     default:
16449       gcc_unreachable ();
16450     }
16451 }
16452
16453 /* Use predicated SVE instructions to implement the equivalent of:
16454
16455      (set TARGET OP)
16456
16457    given that PTRUE is an all-true predicate of the appropriate mode.  */
16458
16459 static void
16460 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16461 {
16462   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16463                                gen_rtvec (2, ptrue, op),
16464                                UNSPEC_MERGE_PTRUE);
16465   rtx_insn *insn = emit_set_insn (target, unspec);
16466   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16467 }
16468
16469 /* Likewise, but also clobber the condition codes.  */
16470
16471 static void
16472 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16473 {
16474   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16475                                gen_rtvec (2, ptrue, op),
16476                                UNSPEC_MERGE_PTRUE);
16477   rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16478   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16479 }
16480
16481 /* Return the UNSPEC_COND_* code for comparison CODE.  */
16482
16483 static unsigned int
16484 aarch64_unspec_cond_code (rtx_code code)
16485 {
16486   switch (code)
16487     {
16488     case NE:
16489       return UNSPEC_COND_NE;
16490     case EQ:
16491       return UNSPEC_COND_EQ;
16492     case LT:
16493       return UNSPEC_COND_LT;
16494     case GT:
16495       return UNSPEC_COND_GT;
16496     case LE:
16497       return UNSPEC_COND_LE;
16498     case GE:
16499       return UNSPEC_COND_GE;
16500     default:
16501       gcc_unreachable ();
16502     }
16503 }
16504
16505 /* Emit:
16506
16507       (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16508
16509    where <X> is the operation associated with comparison CODE.  This form
16510    of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16511    semantics, such as when PRED might not be all-true and when comparing
16512    inactive lanes could have side effects.  */
16513
16514 static void
16515 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16516                                   rtx pred, rtx op0, rtx op1)
16517 {
16518   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16519                                gen_rtvec (3, pred, op0, op1),
16520                                aarch64_unspec_cond_code (code));
16521   emit_set_insn (target, unspec);
16522 }
16523
16524 /* Expand an SVE integer comparison using the SVE equivalent of:
16525
16526      (set TARGET (CODE OP0 OP1)).  */
16527
16528 void
16529 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16530 {
16531   machine_mode pred_mode = GET_MODE (target);
16532   machine_mode data_mode = GET_MODE (op0);
16533
16534   if (!aarch64_sve_cmp_operand_p (code, op1))
16535     op1 = force_reg (data_mode, op1);
16536
16537   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16538   rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16539   aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16540 }
16541
16542 /* Emit the SVE equivalent of:
16543
16544       (set TMP1 (CODE1 OP0 OP1))
16545       (set TMP2 (CODE2 OP0 OP1))
16546       (set TARGET (ior:PRED_MODE TMP1 TMP2))
16547
16548    PTRUE is an all-true predicate with the same mode as TARGET.  */
16549
16550 static void
16551 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16552                            rtx ptrue, rtx op0, rtx op1)
16553 {
16554   machine_mode pred_mode = GET_MODE (ptrue);
16555   rtx tmp1 = gen_reg_rtx (pred_mode);
16556   aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16557                              gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16558   rtx tmp2 = gen_reg_rtx (pred_mode);
16559   aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16560                              gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16561   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16562 }
16563
16564 /* Emit the SVE equivalent of:
16565
16566       (set TMP (CODE OP0 OP1))
16567       (set TARGET (not TMP))
16568
16569    PTRUE is an all-true predicate with the same mode as TARGET.  */
16570
16571 static void
16572 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16573                                 rtx op0, rtx op1)
16574 {
16575   machine_mode pred_mode = GET_MODE (ptrue);
16576   rtx tmp = gen_reg_rtx (pred_mode);
16577   aarch64_emit_sve_ptrue_op (tmp, ptrue,
16578                              gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16579   aarch64_emit_unop (target, one_cmpl_optab, tmp);
16580 }
16581
16582 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16583
16584      (set TARGET (CODE OP0 OP1))
16585
16586    If CAN_INVERT_P is true, the caller can also handle inverted results;
16587    return true if the result is in fact inverted.  */
16588
16589 bool
16590 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16591                                   rtx op0, rtx op1, bool can_invert_p)
16592 {
16593   machine_mode pred_mode = GET_MODE (target);
16594   machine_mode data_mode = GET_MODE (op0);
16595
16596   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16597   switch (code)
16598     {
16599     case UNORDERED:
16600       /* UNORDERED has no immediate form.  */
16601       op1 = force_reg (data_mode, op1);
16602       /* fall through */
16603     case LT:
16604     case LE:
16605     case GT:
16606     case GE:
16607     case EQ:
16608     case NE:
16609       {
16610         /* There is native support for the comparison.  */
16611         rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16612         aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16613         return false;
16614       }
16615
16616     case LTGT:
16617       /* This is a trapping operation (LT or GT).  */
16618       aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16619       return false;
16620
16621     case UNEQ:
16622       if (!flag_trapping_math)
16623         {
16624           /* This would trap for signaling NaNs.  */
16625           op1 = force_reg (data_mode, op1);
16626           aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16627           return false;
16628         }
16629       /* fall through */
16630     case UNLT:
16631     case UNLE:
16632     case UNGT:
16633     case UNGE:
16634       if (flag_trapping_math)
16635         {
16636           /* Work out which elements are ordered.  */
16637           rtx ordered = gen_reg_rtx (pred_mode);
16638           op1 = force_reg (data_mode, op1);
16639           aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16640
16641           /* Test the opposite condition for the ordered elements,
16642              then invert the result.  */
16643           if (code == UNEQ)
16644             code = NE;
16645           else
16646             code = reverse_condition_maybe_unordered (code);
16647           if (can_invert_p)
16648             {
16649               aarch64_emit_sve_predicated_cond (target, code,
16650                                                 ordered, op0, op1);
16651               return true;
16652             }
16653           rtx tmp = gen_reg_rtx (pred_mode);
16654           aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16655           aarch64_emit_unop (target, one_cmpl_optab, tmp);
16656           return false;
16657         }
16658       break;
16659
16660     case ORDERED:
16661       /* ORDERED has no immediate form.  */
16662       op1 = force_reg (data_mode, op1);
16663       break;
16664
16665     default:
16666       gcc_unreachable ();
16667     }
16668
16669   /* There is native support for the inverse comparison.  */
16670   code = reverse_condition_maybe_unordered (code);
16671   if (can_invert_p)
16672     {
16673       rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16674       aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16675       return true;
16676     }
16677   aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16678   return false;
16679 }
16680
16681 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
16682    of the data being selected and CMP_MODE is the mode of the values being
16683    compared.  */
16684
16685 void
16686 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16687                           rtx *ops)
16688 {
16689   machine_mode pred_mode
16690     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16691                              GET_MODE_SIZE (cmp_mode)).require ();
16692   rtx pred = gen_reg_rtx (pred_mode);
16693   if (FLOAT_MODE_P (cmp_mode))
16694     {
16695       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16696                                             ops[4], ops[5], true))
16697         std::swap (ops[1], ops[2]);
16698     }
16699   else
16700     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16701
16702   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16703   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16704 }
16705
16706 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
16707    true.  However due to issues with register allocation it is preferable
16708    to avoid tieing integer scalar and FP scalar modes.  Executing integer
16709    operations in general registers is better than treating them as scalar
16710    vector operations.  This reduces latency and avoids redundant int<->FP
16711    moves.  So tie modes if they are either the same class, or vector modes
16712    with other vector modes, vector structs or any scalar mode.  */
16713
16714 static bool
16715 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16716 {
16717   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16718     return true;
16719
16720   /* We specifically want to allow elements of "structure" modes to
16721      be tieable to the structure.  This more general condition allows
16722      other rarer situations too.  The reason we don't extend this to
16723      predicate modes is that there are no predicate structure modes
16724      nor any specific instructions for extracting part of a predicate
16725      register.  */
16726   if (aarch64_vector_data_mode_p (mode1)
16727       && aarch64_vector_data_mode_p (mode2))
16728     return true;
16729
16730   /* Also allow any scalar modes with vectors.  */
16731   if (aarch64_vector_mode_supported_p (mode1)
16732       || aarch64_vector_mode_supported_p (mode2))
16733     return true;
16734
16735   return false;
16736 }
16737
16738 /* Return a new RTX holding the result of moving POINTER forward by
16739    AMOUNT bytes.  */
16740
16741 static rtx
16742 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16743 {
16744   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16745
16746   return adjust_automodify_address (pointer, GET_MODE (pointer),
16747                                     next, amount);
16748 }
16749
16750 /* Return a new RTX holding the result of moving POINTER forward by the
16751    size of the mode it points to.  */
16752
16753 static rtx
16754 aarch64_progress_pointer (rtx pointer)
16755 {
16756   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16757 }
16758
16759 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16760    MODE bytes.  */
16761
16762 static void
16763 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16764                                               machine_mode mode)
16765 {
16766   rtx reg = gen_reg_rtx (mode);
16767
16768   /* "Cast" the pointers to the correct mode.  */
16769   *src = adjust_address (*src, mode, 0);
16770   *dst = adjust_address (*dst, mode, 0);
16771   /* Emit the memcpy.  */
16772   emit_move_insn (reg, *src);
16773   emit_move_insn (*dst, reg);
16774   /* Move the pointers forward.  */
16775   *src = aarch64_progress_pointer (*src);
16776   *dst = aarch64_progress_pointer (*dst);
16777 }
16778
16779 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
16780    we succeed, otherwise return false.  */
16781
16782 bool
16783 aarch64_expand_movmem (rtx *operands)
16784 {
16785   int n, mode_bits;
16786   rtx dst = operands[0];
16787   rtx src = operands[1];
16788   rtx base;
16789   machine_mode cur_mode = BLKmode, next_mode;
16790   bool speed_p = !optimize_function_for_size_p (cfun);
16791
16792   /* When optimizing for size, give a better estimate of the length of a
16793      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
16794      will always require an even number of instructions to do now.  And each
16795      operation requires both a load+store, so devide the max number by 2.  */
16796   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16797
16798   /* We can't do anything smart if the amount to copy is not constant.  */
16799   if (!CONST_INT_P (operands[2]))
16800     return false;
16801
16802   n = INTVAL (operands[2]);
16803
16804   /* Try to keep the number of instructions low.  For all cases we will do at
16805      most two moves for the residual amount, since we'll always overlap the
16806      remainder.  */
16807   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16808     return false;
16809
16810   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16811   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16812
16813   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16814   src = adjust_automodify_address (src, VOIDmode, base, 0);
16815
16816   /* Convert n to bits to make the rest of the code simpler.  */
16817   n = n * BITS_PER_UNIT;
16818
16819   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
16820      larger than TImode, but we should not use them for loads/stores here.  */
16821   const int copy_limit = GET_MODE_BITSIZE (TImode);
16822
16823   while (n > 0)
16824     {
16825       /* Find the largest mode in which to do the copy in without over reading
16826          or writing.  */
16827       opt_scalar_int_mode mode_iter;
16828       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16829         if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16830           cur_mode = mode_iter.require ();
16831
16832       gcc_assert (cur_mode != BLKmode);
16833
16834       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16835       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16836
16837       n -= mode_bits;
16838
16839       /* Do certain trailing copies as overlapping if it's going to be
16840          cheaper.  i.e. less instructions to do so.  For instance doing a 15
16841          byte copy it's more efficient to do two overlapping 8 byte copies than
16842          8 + 6 + 1.  */
16843       if (n > 0 && n <= 8 * BITS_PER_UNIT)
16844         {
16845           next_mode = smallest_mode_for_size (n, MODE_INT);
16846           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16847           src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16848           dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16849           n = n_bits;
16850         }
16851     }
16852
16853   return true;
16854 }
16855
16856 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16857    SImode stores.  Handle the case when the constant has identical
16858    bottom and top halves.  This is beneficial when the two stores can be
16859    merged into an STP and we avoid synthesising potentially expensive
16860    immediates twice.  Return true if such a split is possible.  */
16861
16862 bool
16863 aarch64_split_dimode_const_store (rtx dst, rtx src)
16864 {
16865   rtx lo = gen_lowpart (SImode, src);
16866   rtx hi = gen_highpart_mode (SImode, DImode, src);
16867
16868   bool size_p = optimize_function_for_size_p (cfun);
16869
16870   if (!rtx_equal_p (lo, hi))
16871     return false;
16872
16873   unsigned int orig_cost
16874     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16875   unsigned int lo_cost
16876     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16877
16878   /* We want to transform:
16879      MOV        x1, 49370
16880      MOVK       x1, 0x140, lsl 16
16881      MOVK       x1, 0xc0da, lsl 32
16882      MOVK       x1, 0x140, lsl 48
16883      STR        x1, [x0]
16884    into:
16885      MOV        w1, 49370
16886      MOVK       w1, 0x140, lsl 16
16887      STP        w1, w1, [x0]
16888    So we want to perform this only when we save two instructions
16889    or more.  When optimizing for size, however, accept any code size
16890    savings we can.  */
16891   if (size_p && orig_cost <= lo_cost)
16892     return false;
16893
16894   if (!size_p
16895       && (orig_cost <= lo_cost + 1))
16896     return false;
16897
16898   rtx mem_lo = adjust_address (dst, SImode, 0);
16899   if (!aarch64_mem_pair_operand (mem_lo, SImode))
16900     return false;
16901
16902   rtx tmp_reg = gen_reg_rtx (SImode);
16903   aarch64_expand_mov_immediate (tmp_reg, lo);
16904   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16905   /* Don't emit an explicit store pair as this may not be always profitable.
16906      Let the sched-fusion logic decide whether to merge them.  */
16907   emit_move_insn (mem_lo, tmp_reg);
16908   emit_move_insn (mem_hi, tmp_reg);
16909
16910   return true;
16911 }
16912
16913 /* Generate RTL for a conditional branch with rtx comparison CODE in
16914    mode CC_MODE.  The destination of the unlikely conditional branch
16915    is LABEL_REF.  */
16916
16917 void
16918 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16919                               rtx label_ref)
16920 {
16921   rtx x;
16922   x = gen_rtx_fmt_ee (code, VOIDmode,
16923                       gen_rtx_REG (cc_mode, CC_REGNUM),
16924                       const0_rtx);
16925
16926   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16927                             gen_rtx_LABEL_REF (VOIDmode, label_ref),
16928                             pc_rtx);
16929   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16930 }
16931
16932 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16933
16934    OP1 represents the TImode destination operand 1
16935    OP2 represents the TImode destination operand 2
16936    LOW_DEST represents the low half (DImode) of TImode operand 0
16937    LOW_IN1 represents the low half (DImode) of TImode operand 1
16938    LOW_IN2 represents the low half (DImode) of TImode operand 2
16939    HIGH_DEST represents the high half (DImode) of TImode operand 0
16940    HIGH_IN1 represents the high half (DImode) of TImode operand 1
16941    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
16942
16943 void
16944 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16945                             rtx *low_in1, rtx *low_in2,
16946                             rtx *high_dest, rtx *high_in1,
16947                             rtx *high_in2)
16948 {
16949   *low_dest = gen_reg_rtx (DImode);
16950   *low_in1 = gen_lowpart (DImode, op1);
16951   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16952                                   subreg_lowpart_offset (DImode, TImode));
16953   *high_dest = gen_reg_rtx (DImode);
16954   *high_in1 = gen_highpart (DImode, op1);
16955   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16956                                    subreg_highpart_offset (DImode, TImode));
16957 }
16958
16959 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16960
16961    This function differs from 'arch64_addti_scratch_regs' in that
16962    OP1 can be an immediate constant (zero). We must call
16963    subreg_highpart_offset with DImode and TImode arguments, otherwise
16964    VOIDmode will be used for the const_int which generates an internal
16965    error from subreg_size_highpart_offset which does not expect a size of zero.
16966
16967    OP1 represents the TImode destination operand 1
16968    OP2 represents the TImode destination operand 2
16969    LOW_DEST represents the low half (DImode) of TImode operand 0
16970    LOW_IN1 represents the low half (DImode) of TImode operand 1
16971    LOW_IN2 represents the low half (DImode) of TImode operand 2
16972    HIGH_DEST represents the high half (DImode) of TImode operand 0
16973    HIGH_IN1 represents the high half (DImode) of TImode operand 1
16974    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
16975
16976
16977 void
16978 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16979                              rtx *low_in1, rtx *low_in2,
16980                              rtx *high_dest, rtx *high_in1,
16981                              rtx *high_in2)
16982 {
16983   *low_dest = gen_reg_rtx (DImode);
16984   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16985                                   subreg_lowpart_offset (DImode, TImode));
16986
16987   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16988                                   subreg_lowpart_offset (DImode, TImode));
16989   *high_dest = gen_reg_rtx (DImode);
16990
16991   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16992                                    subreg_highpart_offset (DImode, TImode));
16993   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16994                                    subreg_highpart_offset (DImode, TImode));
16995 }
16996
16997 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16998
16999    OP0 represents the TImode destination operand 0
17000    LOW_DEST represents the low half (DImode) of TImode operand 0
17001    LOW_IN1 represents the low half (DImode) of TImode operand 1
17002    LOW_IN2 represents the low half (DImode) of TImode operand 2
17003    HIGH_DEST represents the high half (DImode) of TImode operand 0
17004    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17005    HIGH_IN2 represents the high half (DImode) of TImode operand 2
17006    UNSIGNED_P is true if the operation is being performed on unsigned
17007    values.  */
17008 void
17009 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17010                        rtx low_in2, rtx high_dest, rtx high_in1,
17011                        rtx high_in2, bool unsigned_p)
17012 {
17013   if (low_in2 == const0_rtx)
17014     {
17015       low_dest = low_in1;
17016       high_in2 = force_reg (DImode, high_in2);
17017       if (unsigned_p)
17018         emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17019       else
17020         emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17021     }
17022   else
17023     {
17024       if (CONST_INT_P (low_in2))
17025         {
17026           high_in2 = force_reg (DImode, high_in2);
17027           emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17028                                               GEN_INT (-INTVAL (low_in2))));
17029         }
17030       else
17031         emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17032
17033       if (unsigned_p)
17034         emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17035       else
17036         emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17037     }
17038
17039   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17040   emit_move_insn (gen_highpart (DImode, op0), high_dest);
17041
17042 }
17043
17044 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
17045
17046 static unsigned HOST_WIDE_INT
17047 aarch64_asan_shadow_offset (void)
17048 {
17049   return (HOST_WIDE_INT_1 << 36);
17050 }
17051
17052 static rtx
17053 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17054                         int code, tree treeop0, tree treeop1)
17055 {
17056   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17057   rtx op0, op1;
17058   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17059   insn_code icode;
17060   struct expand_operand ops[4];
17061
17062   start_sequence ();
17063   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17064
17065   op_mode = GET_MODE (op0);
17066   if (op_mode == VOIDmode)
17067     op_mode = GET_MODE (op1);
17068
17069   switch (op_mode)
17070     {
17071     case E_QImode:
17072     case E_HImode:
17073     case E_SImode:
17074       cmp_mode = SImode;
17075       icode = CODE_FOR_cmpsi;
17076       break;
17077
17078     case E_DImode:
17079       cmp_mode = DImode;
17080       icode = CODE_FOR_cmpdi;
17081       break;
17082
17083     case E_SFmode:
17084       cmp_mode = SFmode;
17085       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17086       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17087       break;
17088
17089     case E_DFmode:
17090       cmp_mode = DFmode;
17091       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17092       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17093       break;
17094
17095     default:
17096       end_sequence ();
17097       return NULL_RTX;
17098     }
17099
17100   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17101   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17102   if (!op0 || !op1)
17103     {
17104       end_sequence ();
17105       return NULL_RTX;
17106     }
17107   *prep_seq = get_insns ();
17108   end_sequence ();
17109
17110   create_fixed_operand (&ops[0], op0);
17111   create_fixed_operand (&ops[1], op1);
17112
17113   start_sequence ();
17114   if (!maybe_expand_insn (icode, 2, ops))
17115     {
17116       end_sequence ();
17117       return NULL_RTX;
17118     }
17119   *gen_seq = get_insns ();
17120   end_sequence ();
17121
17122   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17123                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17124 }
17125
17126 static rtx
17127 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17128                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
17129 {
17130   rtx op0, op1, target;
17131   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17132   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17133   insn_code icode;
17134   struct expand_operand ops[6];
17135   int aarch64_cond;
17136
17137   push_to_sequence (*prep_seq);
17138   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17139
17140   op_mode = GET_MODE (op0);
17141   if (op_mode == VOIDmode)
17142     op_mode = GET_MODE (op1);
17143
17144   switch (op_mode)
17145     {
17146     case E_QImode:
17147     case E_HImode:
17148     case E_SImode:
17149       cmp_mode = SImode;
17150       icode = CODE_FOR_ccmpsi;
17151       break;
17152
17153     case E_DImode:
17154       cmp_mode = DImode;
17155       icode = CODE_FOR_ccmpdi;
17156       break;
17157
17158     case E_SFmode:
17159       cmp_mode = SFmode;
17160       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17161       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17162       break;
17163
17164     case E_DFmode:
17165       cmp_mode = DFmode;
17166       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17167       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17168       break;
17169
17170     default:
17171       end_sequence ();
17172       return NULL_RTX;
17173     }
17174
17175   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17176   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17177   if (!op0 || !op1)
17178     {
17179       end_sequence ();
17180       return NULL_RTX;
17181     }
17182   *prep_seq = get_insns ();
17183   end_sequence ();
17184
17185   target = gen_rtx_REG (cc_mode, CC_REGNUM);
17186   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17187
17188   if (bit_code != AND)
17189     {
17190       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17191                                                 GET_MODE (XEXP (prev, 0))),
17192                              VOIDmode, XEXP (prev, 0), const0_rtx);
17193       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17194     }
17195
17196   create_fixed_operand (&ops[0], XEXP (prev, 0));
17197   create_fixed_operand (&ops[1], target);
17198   create_fixed_operand (&ops[2], op0);
17199   create_fixed_operand (&ops[3], op1);
17200   create_fixed_operand (&ops[4], prev);
17201   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17202
17203   push_to_sequence (*gen_seq);
17204   if (!maybe_expand_insn (icode, 6, ops))
17205     {
17206       end_sequence ();
17207       return NULL_RTX;
17208     }
17209
17210   *gen_seq = get_insns ();
17211   end_sequence ();
17212
17213   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17214 }
17215
17216 #undef TARGET_GEN_CCMP_FIRST
17217 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17218
17219 #undef TARGET_GEN_CCMP_NEXT
17220 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17221
17222 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
17223    instruction fusion of some sort.  */
17224
17225 static bool
17226 aarch64_macro_fusion_p (void)
17227 {
17228   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17229 }
17230
17231
17232 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
17233    should be kept together during scheduling.  */
17234
17235 static bool
17236 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17237 {
17238   rtx set_dest;
17239   rtx prev_set = single_set (prev);
17240   rtx curr_set = single_set (curr);
17241   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
17242   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17243
17244   if (!aarch64_macro_fusion_p ())
17245     return false;
17246
17247   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17248     {
17249       /* We are trying to match:
17250          prev (mov)  == (set (reg r0) (const_int imm16))
17251          curr (movk) == (set (zero_extract (reg r0)
17252                                            (const_int 16)
17253                                            (const_int 16))
17254                              (const_int imm16_1))  */
17255
17256       set_dest = SET_DEST (curr_set);
17257
17258       if (GET_CODE (set_dest) == ZERO_EXTRACT
17259           && CONST_INT_P (SET_SRC (curr_set))
17260           && CONST_INT_P (SET_SRC (prev_set))
17261           && CONST_INT_P (XEXP (set_dest, 2))
17262           && INTVAL (XEXP (set_dest, 2)) == 16
17263           && REG_P (XEXP (set_dest, 0))
17264           && REG_P (SET_DEST (prev_set))
17265           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17266         {
17267           return true;
17268         }
17269     }
17270
17271   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17272     {
17273
17274       /*  We're trying to match:
17275           prev (adrp) == (set (reg r1)
17276                               (high (symbol_ref ("SYM"))))
17277           curr (add) == (set (reg r0)
17278                              (lo_sum (reg r1)
17279                                      (symbol_ref ("SYM"))))
17280           Note that r0 need not necessarily be the same as r1, especially
17281           during pre-regalloc scheduling.  */
17282
17283       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17284           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17285         {
17286           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17287               && REG_P (XEXP (SET_SRC (curr_set), 0))
17288               && REGNO (XEXP (SET_SRC (curr_set), 0))
17289                  == REGNO (SET_DEST (prev_set))
17290               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17291                               XEXP (SET_SRC (curr_set), 1)))
17292             return true;
17293         }
17294     }
17295
17296   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17297     {
17298
17299       /* We're trying to match:
17300          prev (movk) == (set (zero_extract (reg r0)
17301                                            (const_int 16)
17302                                            (const_int 32))
17303                              (const_int imm16_1))
17304          curr (movk) == (set (zero_extract (reg r0)
17305                                            (const_int 16)
17306                                            (const_int 48))
17307                              (const_int imm16_2))  */
17308
17309       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17310           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17311           && REG_P (XEXP (SET_DEST (prev_set), 0))
17312           && REG_P (XEXP (SET_DEST (curr_set), 0))
17313           && REGNO (XEXP (SET_DEST (prev_set), 0))
17314              == REGNO (XEXP (SET_DEST (curr_set), 0))
17315           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17316           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17317           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17318           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17319           && CONST_INT_P (SET_SRC (prev_set))
17320           && CONST_INT_P (SET_SRC (curr_set)))
17321         return true;
17322
17323     }
17324   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17325     {
17326       /* We're trying to match:
17327           prev (adrp) == (set (reg r0)
17328                               (high (symbol_ref ("SYM"))))
17329           curr (ldr) == (set (reg r1)
17330                              (mem (lo_sum (reg r0)
17331                                              (symbol_ref ("SYM")))))
17332                  or
17333           curr (ldr) == (set (reg r1)
17334                              (zero_extend (mem
17335                                            (lo_sum (reg r0)
17336                                                    (symbol_ref ("SYM"))))))  */
17337       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17338           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17339         {
17340           rtx curr_src = SET_SRC (curr_set);
17341
17342           if (GET_CODE (curr_src) == ZERO_EXTEND)
17343             curr_src = XEXP (curr_src, 0);
17344
17345           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17346               && REG_P (XEXP (XEXP (curr_src, 0), 0))
17347               && REGNO (XEXP (XEXP (curr_src, 0), 0))
17348                  == REGNO (SET_DEST (prev_set))
17349               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17350                               XEXP (SET_SRC (prev_set), 0)))
17351               return true;
17352         }
17353     }
17354
17355   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17356        && aarch_crypto_can_dual_issue (prev, curr))
17357     return true;
17358
17359   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17360       && any_condjump_p (curr))
17361     {
17362       unsigned int condreg1, condreg2;
17363       rtx cc_reg_1;
17364       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17365       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17366
17367       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17368           && prev
17369           && modified_in_p (cc_reg_1, prev))
17370         {
17371           enum attr_type prev_type = get_attr_type (prev);
17372
17373           /* FIXME: this misses some which is considered simple arthematic
17374              instructions for ThunderX.  Simple shifts are missed here.  */
17375           if (prev_type == TYPE_ALUS_SREG
17376               || prev_type == TYPE_ALUS_IMM
17377               || prev_type == TYPE_LOGICS_REG
17378               || prev_type == TYPE_LOGICS_IMM)
17379             return true;
17380         }
17381     }
17382
17383   if (prev_set
17384       && curr_set
17385       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17386       && any_condjump_p (curr))
17387     {
17388       /* We're trying to match:
17389           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17390           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
17391                                                          (const_int 0))
17392                                                  (label_ref ("SYM"))
17393                                                  (pc))  */
17394       if (SET_DEST (curr_set) == (pc_rtx)
17395           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17396           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17397           && REG_P (SET_DEST (prev_set))
17398           && REGNO (SET_DEST (prev_set))
17399              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17400         {
17401           /* Fuse ALU operations followed by conditional branch instruction.  */
17402           switch (get_attr_type (prev))
17403             {
17404             case TYPE_ALU_IMM:
17405             case TYPE_ALU_SREG:
17406             case TYPE_ADC_REG:
17407             case TYPE_ADC_IMM:
17408             case TYPE_ADCS_REG:
17409             case TYPE_ADCS_IMM:
17410             case TYPE_LOGIC_REG:
17411             case TYPE_LOGIC_IMM:
17412             case TYPE_CSEL:
17413             case TYPE_ADR:
17414             case TYPE_MOV_IMM:
17415             case TYPE_SHIFT_REG:
17416             case TYPE_SHIFT_IMM:
17417             case TYPE_BFM:
17418             case TYPE_RBIT:
17419             case TYPE_REV:
17420             case TYPE_EXTEND:
17421               return true;
17422
17423             default:;
17424             }
17425         }
17426     }
17427
17428   return false;
17429 }
17430
17431 /* Return true iff the instruction fusion described by OP is enabled.  */
17432
17433 bool
17434 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17435 {
17436   return (aarch64_tune_params.fusible_ops & op) != 0;
17437 }
17438
17439 /* If MEM is in the form of [base+offset], extract the two parts
17440    of address and set to BASE and OFFSET, otherwise return false
17441    after clearing BASE and OFFSET.  */
17442
17443 bool
17444 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17445 {
17446   rtx addr;
17447
17448   gcc_assert (MEM_P (mem));
17449
17450   addr = XEXP (mem, 0);
17451
17452   if (REG_P (addr))
17453     {
17454       *base = addr;
17455       *offset = const0_rtx;
17456       return true;
17457     }
17458
17459   if (GET_CODE (addr) == PLUS
17460       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17461     {
17462       *base = XEXP (addr, 0);
17463       *offset = XEXP (addr, 1);
17464       return true;
17465     }
17466
17467   *base = NULL_RTX;
17468   *offset = NULL_RTX;
17469
17470   return false;
17471 }
17472
17473 /* Types for scheduling fusion.  */
17474 enum sched_fusion_type
17475 {
17476   SCHED_FUSION_NONE = 0,
17477   SCHED_FUSION_LD_SIGN_EXTEND,
17478   SCHED_FUSION_LD_ZERO_EXTEND,
17479   SCHED_FUSION_LD,
17480   SCHED_FUSION_ST,
17481   SCHED_FUSION_NUM
17482 };
17483
17484 /* If INSN is a load or store of address in the form of [base+offset],
17485    extract the two parts and set to BASE and OFFSET.  Return scheduling
17486    fusion type this INSN is.  */
17487
17488 static enum sched_fusion_type
17489 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17490 {
17491   rtx x, dest, src;
17492   enum sched_fusion_type fusion = SCHED_FUSION_LD;
17493
17494   gcc_assert (INSN_P (insn));
17495   x = PATTERN (insn);
17496   if (GET_CODE (x) != SET)
17497     return SCHED_FUSION_NONE;
17498
17499   src = SET_SRC (x);
17500   dest = SET_DEST (x);
17501
17502   machine_mode dest_mode = GET_MODE (dest);
17503
17504   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17505     return SCHED_FUSION_NONE;
17506
17507   if (GET_CODE (src) == SIGN_EXTEND)
17508     {
17509       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17510       src = XEXP (src, 0);
17511       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17512         return SCHED_FUSION_NONE;
17513     }
17514   else if (GET_CODE (src) == ZERO_EXTEND)
17515     {
17516       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17517       src = XEXP (src, 0);
17518       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17519         return SCHED_FUSION_NONE;
17520     }
17521
17522   if (GET_CODE (src) == MEM && REG_P (dest))
17523     extract_base_offset_in_addr (src, base, offset);
17524   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17525     {
17526       fusion = SCHED_FUSION_ST;
17527       extract_base_offset_in_addr (dest, base, offset);
17528     }
17529   else
17530     return SCHED_FUSION_NONE;
17531
17532   if (*base == NULL_RTX || *offset == NULL_RTX)
17533     fusion = SCHED_FUSION_NONE;
17534
17535   return fusion;
17536 }
17537
17538 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17539
17540    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17541    and PRI are only calculated for these instructions.  For other instruction,
17542    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
17543    type instruction fusion can be added by returning different priorities.
17544
17545    It's important that irrelevant instructions get the largest FUSION_PRI.  */
17546
17547 static void
17548 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17549                                int *fusion_pri, int *pri)
17550 {
17551   int tmp, off_val;
17552   rtx base, offset;
17553   enum sched_fusion_type fusion;
17554
17555   gcc_assert (INSN_P (insn));
17556
17557   tmp = max_pri - 1;
17558   fusion = fusion_load_store (insn, &base, &offset);
17559   if (fusion == SCHED_FUSION_NONE)
17560     {
17561       *pri = tmp;
17562       *fusion_pri = tmp;
17563       return;
17564     }
17565
17566   /* Set FUSION_PRI according to fusion type and base register.  */
17567   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17568
17569   /* Calculate PRI.  */
17570   tmp /= 2;
17571
17572   /* INSN with smaller offset goes first.  */
17573   off_val = (int)(INTVAL (offset));
17574   if (off_val >= 0)
17575     tmp -= (off_val & 0xfffff);
17576   else
17577     tmp += ((- off_val) & 0xfffff);
17578
17579   *pri = tmp;
17580   return;
17581 }
17582
17583 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17584    Adjust priority of sha1h instructions so they are scheduled before
17585    other SHA1 instructions.  */
17586
17587 static int
17588 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17589 {
17590   rtx x = PATTERN (insn);
17591
17592   if (GET_CODE (x) == SET)
17593     {
17594       x = SET_SRC (x);
17595
17596       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17597         return priority + 10;
17598     }
17599
17600   return priority;
17601 }
17602
17603 /* Given OPERANDS of consecutive load/store, check if we can merge
17604    them into ldp/stp.  LOAD is true if they are load instructions.
17605    MODE is the mode of memory operands.  */
17606
17607 bool
17608 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17609                                 machine_mode mode)
17610 {
17611   HOST_WIDE_INT offval_1, offval_2, msize;
17612   enum reg_class rclass_1, rclass_2;
17613   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17614
17615   if (load)
17616     {
17617       mem_1 = operands[1];
17618       mem_2 = operands[3];
17619       reg_1 = operands[0];
17620       reg_2 = operands[2];
17621       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17622       if (REGNO (reg_1) == REGNO (reg_2))
17623         return false;
17624     }
17625   else
17626     {
17627       mem_1 = operands[0];
17628       mem_2 = operands[2];
17629       reg_1 = operands[1];
17630       reg_2 = operands[3];
17631     }
17632
17633   /* The mems cannot be volatile.  */
17634   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17635     return false;
17636
17637   /* If we have SImode and slow unaligned ldp,
17638      check the alignment to be at least 8 byte. */
17639   if (mode == SImode
17640       && (aarch64_tune_params.extra_tuning_flags
17641           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17642       && !optimize_size
17643       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17644     return false;
17645
17646   /* Check if the addresses are in the form of [base+offset].  */
17647   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17648   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17649     return false;
17650   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17651   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17652     return false;
17653
17654   /* Check if the bases are same.  */
17655   if (!rtx_equal_p (base_1, base_2))
17656     return false;
17657
17658   /* The operands must be of the same size.  */
17659   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17660                          GET_MODE_SIZE (GET_MODE (mem_2))));
17661
17662   offval_1 = INTVAL (offset_1);
17663   offval_2 = INTVAL (offset_2);
17664   /* We should only be trying this for fixed-sized modes.  There is no
17665      SVE LDP/STP instruction.  */
17666   msize = GET_MODE_SIZE (mode).to_constant ();
17667   /* Check if the offsets are consecutive.  */
17668   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17669     return false;
17670
17671   /* Check if the addresses are clobbered by load.  */
17672   if (load)
17673     {
17674       if (reg_mentioned_p (reg_1, mem_1))
17675         return false;
17676
17677       /* In increasing order, the last load can clobber the address.  */
17678       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17679         return false;
17680     }
17681
17682   /* One of the memory accesses must be a mempair operand.
17683      If it is not the first one, they need to be swapped by the
17684      peephole.  */
17685   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17686        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17687     return false;
17688
17689   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17690     rclass_1 = FP_REGS;
17691   else
17692     rclass_1 = GENERAL_REGS;
17693
17694   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17695     rclass_2 = FP_REGS;
17696   else
17697     rclass_2 = GENERAL_REGS;
17698
17699   /* Check if the registers are of same class.  */
17700   if (rclass_1 != rclass_2)
17701     return false;
17702
17703   return true;
17704 }
17705
17706 /* Given OPERANDS of consecutive load/store that can be merged,
17707    swap them if they are not in ascending order.  */
17708 void
17709 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17710 {
17711   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17712   HOST_WIDE_INT offval_1, offval_2;
17713
17714   if (load)
17715     {
17716       mem_1 = operands[1];
17717       mem_2 = operands[3];
17718     }
17719   else
17720     {
17721       mem_1 = operands[0];
17722       mem_2 = operands[2];
17723     }
17724
17725   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17726   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17727
17728   offval_1 = INTVAL (offset_1);
17729   offval_2 = INTVAL (offset_2);
17730
17731   if (offval_1 > offval_2)
17732     {
17733       /* Irrespective of whether this is a load or a store,
17734          we do the same swap.  */
17735       std::swap (operands[0], operands[2]);
17736       std::swap (operands[1], operands[3]);
17737     }
17738 }
17739
17740 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17741    comparison between the two.  */
17742 int
17743 aarch64_host_wide_int_compare (const void *x, const void *y)
17744 {
17745   return wi::cmps (* ((const HOST_WIDE_INT *) x),
17746                    * ((const HOST_WIDE_INT *) y));
17747 }
17748
17749 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17750    other pointing to a REG rtx containing an offset, compare the offsets
17751    of the two pairs.
17752
17753    Return:
17754
17755         1 iff offset (X) > offset (Y)
17756         0 iff offset (X) == offset (Y)
17757         -1 iff offset (X) < offset (Y)  */
17758 int
17759 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17760 {
17761   const rtx * operands_1 = (const rtx *) x;
17762   const rtx * operands_2 = (const rtx *) y;
17763   rtx mem_1, mem_2, base, offset_1, offset_2;
17764
17765   if (MEM_P (operands_1[0]))
17766     mem_1 = operands_1[0];
17767   else
17768     mem_1 = operands_1[1];
17769
17770   if (MEM_P (operands_2[0]))
17771     mem_2 = operands_2[0];
17772   else
17773     mem_2 = operands_2[1];
17774
17775   /* Extract the offsets.  */
17776   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17777   extract_base_offset_in_addr (mem_2, &base, &offset_2);
17778
17779   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17780
17781   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17782 }
17783
17784 /* Given OPERANDS of consecutive load/store, check if we can merge
17785    them into ldp/stp by adjusting the offset.  LOAD is true if they
17786    are load instructions.  MODE is the mode of memory operands.
17787
17788    Given below consecutive stores:
17789
17790      str  w1, [xb, 0x100]
17791      str  w1, [xb, 0x104]
17792      str  w1, [xb, 0x108]
17793      str  w1, [xb, 0x10c]
17794
17795    Though the offsets are out of the range supported by stp, we can
17796    still pair them after adjusting the offset, like:
17797
17798      add  scratch, xb, 0x100
17799      stp  w1, w1, [scratch]
17800      stp  w1, w1, [scratch, 0x8]
17801
17802    The peephole patterns detecting this opportunity should guarantee
17803    the scratch register is avaliable.  */
17804
17805 bool
17806 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17807                                        scalar_mode mode)
17808 {
17809   const int num_insns = 4;
17810   enum reg_class rclass;
17811   HOST_WIDE_INT offvals[num_insns], msize;
17812   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17813
17814   if (load)
17815     {
17816       for (int i = 0; i < num_insns; i++)
17817         {
17818           reg[i] = operands[2 * i];
17819           mem[i] = operands[2 * i + 1];
17820
17821           gcc_assert (REG_P (reg[i]));
17822         }
17823
17824       /* Do not attempt to merge the loads if the loads clobber each other.  */
17825       for (int i = 0; i < 8; i += 2)
17826         for (int j = i + 2; j < 8; j += 2)
17827           if (reg_overlap_mentioned_p (operands[i], operands[j]))
17828             return false;
17829     }
17830   else
17831     for (int i = 0; i < num_insns; i++)
17832       {
17833         mem[i] = operands[2 * i];
17834         reg[i] = operands[2 * i + 1];
17835       }
17836
17837   /* Skip if memory operand is by itself valid for ldp/stp.  */
17838   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17839     return false;
17840
17841   for (int i = 0; i < num_insns; i++)
17842     {
17843       /* The mems cannot be volatile.  */
17844       if (MEM_VOLATILE_P (mem[i]))
17845         return false;
17846
17847       /* Check if the addresses are in the form of [base+offset].  */
17848       extract_base_offset_in_addr (mem[i], base + i, offset + i);
17849       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17850         return false;
17851     }
17852
17853   /* Check if the registers are of same class.  */
17854   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17855     ? FP_REGS : GENERAL_REGS;
17856
17857   for (int i = 1; i < num_insns; i++)
17858     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17859       {
17860         if (rclass != FP_REGS)
17861           return false;
17862       }
17863     else
17864       {
17865         if (rclass != GENERAL_REGS)
17866           return false;
17867       }
17868
17869   /* Only the last register in the order in which they occur
17870      may be clobbered by the load.  */
17871   if (rclass == GENERAL_REGS && load)
17872     for (int i = 0; i < num_insns - 1; i++)
17873       if (reg_mentioned_p (reg[i], mem[i]))
17874         return false;
17875
17876   /* Check if the bases are same.  */
17877   for (int i = 0; i < num_insns - 1; i++)
17878     if (!rtx_equal_p (base[i], base[i + 1]))
17879       return false;
17880
17881   for (int i = 0; i < num_insns; i++)
17882     offvals[i] = INTVAL (offset[i]);
17883
17884   msize = GET_MODE_SIZE (mode);
17885
17886   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
17887   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17888          aarch64_host_wide_int_compare);
17889
17890   if (!(offvals[1] == offvals[0] + msize
17891         && offvals[3] == offvals[2] + msize))
17892     return false;
17893
17894   /* Check that offsets are within range of each other.  The ldp/stp
17895      instructions have 7 bit immediate offsets, so use 0x80.  */
17896   if (offvals[2] - offvals[0] >= msize * 0x80)
17897     return false;
17898
17899   /* The offsets must be aligned with respect to each other.  */
17900   if (offvals[0] % msize != offvals[2] % msize)
17901     return false;
17902
17903   /* If we have SImode and slow unaligned ldp,
17904      check the alignment to be at least 8 byte. */
17905   if (mode == SImode
17906       && (aarch64_tune_params.extra_tuning_flags
17907           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17908       && !optimize_size
17909       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17910     return false;
17911
17912   return true;
17913 }
17914
17915 /* Given OPERANDS of consecutive load/store, this function pairs them
17916    into LDP/STP after adjusting the offset.  It depends on the fact
17917    that the operands can be sorted so the offsets are correct for STP.
17918    MODE is the mode of memory operands.  CODE is the rtl operator
17919    which should be applied to all memory operands, it's SIGN_EXTEND,
17920    ZERO_EXTEND or UNKNOWN.  */
17921
17922 bool
17923 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17924                              scalar_mode mode, RTX_CODE code)
17925 {
17926   rtx base, offset_1, offset_3, t1, t2;
17927   rtx mem_1, mem_2, mem_3, mem_4;
17928   rtx temp_operands[8];
17929   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17930                 stp_off_upper_limit, stp_off_lower_limit, msize;
17931
17932   /* We make changes on a copy as we may still bail out.  */
17933   for (int i = 0; i < 8; i ++)
17934     temp_operands[i] = operands[i];
17935
17936   /* Sort the operands.  */
17937   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17938
17939   if (load)
17940     {
17941       mem_1 = temp_operands[1];
17942       mem_2 = temp_operands[3];
17943       mem_3 = temp_operands[5];
17944       mem_4 = temp_operands[7];
17945     }
17946   else
17947     {
17948       mem_1 = temp_operands[0];
17949       mem_2 = temp_operands[2];
17950       mem_3 = temp_operands[4];
17951       mem_4 = temp_operands[6];
17952       gcc_assert (code == UNKNOWN);
17953     }
17954
17955   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17956   extract_base_offset_in_addr (mem_3, &base, &offset_3);
17957   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17958               && offset_3 != NULL_RTX);
17959
17960   /* Adjust offset so it can fit in LDP/STP instruction.  */
17961   msize = GET_MODE_SIZE (mode);
17962   stp_off_upper_limit = msize * (0x40 - 1);
17963   stp_off_lower_limit = - msize * 0x40;
17964
17965   off_val_1 = INTVAL (offset_1);
17966   off_val_3 = INTVAL (offset_3);
17967
17968   /* The base offset is optimally half way between the two STP/LDP offsets.  */
17969   if (msize <= 4)
17970     base_off = (off_val_1 + off_val_3) / 2;
17971   else
17972     /* However, due to issues with negative LDP/STP offset generation for
17973        larger modes, for DF, DI and vector modes. we must not use negative
17974        addresses smaller than 9 signed unadjusted bits can store.  This
17975        provides the most range in this case.  */
17976     base_off = off_val_1;
17977
17978   /* Adjust the base so that it is aligned with the addresses but still
17979      optimal.  */
17980   if (base_off % msize != off_val_1 % msize)
17981     /* Fix the offset, bearing in mind we want to make it bigger not
17982        smaller.  */
17983     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17984   else if (msize <= 4)
17985     /* The negative range of LDP/STP is one larger than the positive range.  */
17986     base_off += msize;
17987
17988   /* Check if base offset is too big or too small.  We can attempt to resolve
17989      this issue by setting it to the maximum value and seeing if the offsets
17990      still fit.  */
17991   if (base_off >= 0x1000)
17992     {
17993       base_off = 0x1000 - 1;
17994       /* We must still make sure that the base offset is aligned with respect
17995          to the address.  But it may may not be made any bigger.  */
17996       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17997     }
17998
17999   /* Likewise for the case where the base is too small.  */
18000   if (base_off <= -0x1000)
18001     {
18002       base_off = -0x1000 + 1;
18003       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18004     }
18005
18006   /* Offset of the first STP/LDP.  */
18007   new_off_1 = off_val_1 - base_off;
18008
18009   /* Offset of the second STP/LDP.  */
18010   new_off_3 = off_val_3 - base_off;
18011
18012   /* The offsets must be within the range of the LDP/STP instructions.  */
18013   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18014       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18015     return false;
18016
18017   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18018                                                   new_off_1), true);
18019   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18020                                                   new_off_1 + msize), true);
18021   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18022                                                   new_off_3), true);
18023   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18024                                                   new_off_3 + msize), true);
18025
18026   if (!aarch64_mem_pair_operand (mem_1, mode)
18027       || !aarch64_mem_pair_operand (mem_3, mode))
18028     return false;
18029
18030   if (code == ZERO_EXTEND)
18031     {
18032       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18033       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18034       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18035       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18036     }
18037   else if (code == SIGN_EXTEND)
18038     {
18039       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18040       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18041       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18042       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18043     }
18044
18045   if (load)
18046     {
18047       operands[0] = temp_operands[0];
18048       operands[1] = mem_1;
18049       operands[2] = temp_operands[2];
18050       operands[3] = mem_2;
18051       operands[4] = temp_operands[4];
18052       operands[5] = mem_3;
18053       operands[6] = temp_operands[6];
18054       operands[7] = mem_4;
18055     }
18056   else
18057     {
18058       operands[0] = mem_1;
18059       operands[1] = temp_operands[1];
18060       operands[2] = mem_2;
18061       operands[3] = temp_operands[3];
18062       operands[4] = mem_3;
18063       operands[5] = temp_operands[5];
18064       operands[6] = mem_4;
18065       operands[7] = temp_operands[7];
18066     }
18067
18068   /* Emit adjusting instruction.  */
18069   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18070   /* Emit ldp/stp instructions.  */
18071   t1 = gen_rtx_SET (operands[0], operands[1]);
18072   t2 = gen_rtx_SET (operands[2], operands[3]);
18073   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18074   t1 = gen_rtx_SET (operands[4], operands[5]);
18075   t2 = gen_rtx_SET (operands[6], operands[7]);
18076   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18077   return true;
18078 }
18079
18080 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
18081    it isn't worth branching around empty masked ops (including masked
18082    stores).  */
18083
18084 static bool
18085 aarch64_empty_mask_is_expensive (unsigned)
18086 {
18087   return false;
18088 }
18089
18090 /* Return 1 if pseudo register should be created and used to hold
18091    GOT address for PIC code.  */
18092
18093 bool
18094 aarch64_use_pseudo_pic_reg (void)
18095 {
18096   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18097 }
18098
18099 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
18100
18101 static int
18102 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18103 {
18104   switch (XINT (x, 1))
18105     {
18106     case UNSPEC_GOTSMALLPIC:
18107     case UNSPEC_GOTSMALLPIC28K:
18108     case UNSPEC_GOTTINYPIC:
18109       return 0;
18110     default:
18111       break;
18112     }
18113
18114   return default_unspec_may_trap_p (x, flags);
18115 }
18116
18117
18118 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18119    return the log2 of that value.  Otherwise return -1.  */
18120
18121 int
18122 aarch64_fpconst_pow_of_2 (rtx x)
18123 {
18124   const REAL_VALUE_TYPE *r;
18125
18126   if (!CONST_DOUBLE_P (x))
18127     return -1;
18128
18129   r = CONST_DOUBLE_REAL_VALUE (x);
18130
18131   if (REAL_VALUE_NEGATIVE (*r)
18132       || REAL_VALUE_ISNAN (*r)
18133       || REAL_VALUE_ISINF (*r)
18134       || !real_isinteger (r, DFmode))
18135     return -1;
18136
18137   return exact_log2 (real_to_integer (r));
18138 }
18139
18140 /* If X is a vector of equal CONST_DOUBLE values and that value is
18141    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
18142
18143 int
18144 aarch64_vec_fpconst_pow_of_2 (rtx x)
18145 {
18146   int nelts;
18147   if (GET_CODE (x) != CONST_VECTOR
18148       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18149     return -1;
18150
18151   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18152     return -1;
18153
18154   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18155   if (firstval <= 0)
18156     return -1;
18157
18158   for (int i = 1; i < nelts; i++)
18159     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18160       return -1;
18161
18162   return firstval;
18163 }
18164
18165 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18166    to float.
18167
18168    __fp16 always promotes through this hook.
18169    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18170    through the generic excess precision logic rather than here.  */
18171
18172 static tree
18173 aarch64_promoted_type (const_tree t)
18174 {
18175   if (SCALAR_FLOAT_TYPE_P (t)
18176       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18177     return float_type_node;
18178
18179   return NULL_TREE;
18180 }
18181
18182 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
18183
18184 static bool
18185 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18186                            optimization_type opt_type)
18187 {
18188   switch (op)
18189     {
18190     case rsqrt_optab:
18191       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18192
18193     default:
18194       return true;
18195     }
18196 }
18197
18198 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
18199
18200 static unsigned int
18201 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18202                                         int *offset)
18203 {
18204   /* Polynomial invariant 1 == (VG / 2) - 1.  */
18205   gcc_assert (i == 1);
18206   *factor = 2;
18207   *offset = 1;
18208   return AARCH64_DWARF_VG;
18209 }
18210
18211 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18212    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18213
18214 static bool
18215 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18216 {
18217   return (mode == HFmode
18218           ? true
18219           : default_libgcc_floating_mode_supported_p (mode));
18220 }
18221
18222 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18223    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18224
18225 static bool
18226 aarch64_scalar_mode_supported_p (scalar_mode mode)
18227 {
18228   return (mode == HFmode
18229           ? true
18230           : default_scalar_mode_supported_p (mode));
18231 }
18232
18233 /* Set the value of FLT_EVAL_METHOD.
18234    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18235
18236     0: evaluate all operations and constants, whose semantic type has at
18237        most the range and precision of type float, to the range and
18238        precision of float; evaluate all other operations and constants to
18239        the range and precision of the semantic type;
18240
18241     N, where _FloatN is a supported interchange floating type
18242        evaluate all operations and constants, whose semantic type has at
18243        most the range and precision of _FloatN type, to the range and
18244        precision of the _FloatN type; evaluate all other operations and
18245        constants to the range and precision of the semantic type;
18246
18247    If we have the ARMv8.2-A extensions then we support _Float16 in native
18248    precision, so we should set this to 16.  Otherwise, we support the type,
18249    but want to evaluate expressions in float precision, so set this to
18250    0.  */
18251
18252 static enum flt_eval_method
18253 aarch64_excess_precision (enum excess_precision_type type)
18254 {
18255   switch (type)
18256     {
18257       case EXCESS_PRECISION_TYPE_FAST:
18258       case EXCESS_PRECISION_TYPE_STANDARD:
18259         /* We can calculate either in 16-bit range and precision or
18260            32-bit range and precision.  Make that decision based on whether
18261            we have native support for the ARMv8.2-A 16-bit floating-point
18262            instructions or not.  */
18263         return (TARGET_FP_F16INST
18264                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18265                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18266       case EXCESS_PRECISION_TYPE_IMPLICIT:
18267         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18268       default:
18269         gcc_unreachable ();
18270     }
18271   return FLT_EVAL_METHOD_UNPREDICTABLE;
18272 }
18273
18274 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
18275    scheduled for speculative execution.  Reject the long-running division
18276    and square-root instructions.  */
18277
18278 static bool
18279 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18280 {
18281   switch (get_attr_type (insn))
18282     {
18283       case TYPE_SDIV:
18284       case TYPE_UDIV:
18285       case TYPE_FDIVS:
18286       case TYPE_FDIVD:
18287       case TYPE_FSQRTS:
18288       case TYPE_FSQRTD:
18289       case TYPE_NEON_FP_SQRT_S:
18290       case TYPE_NEON_FP_SQRT_D:
18291       case TYPE_NEON_FP_SQRT_S_Q:
18292       case TYPE_NEON_FP_SQRT_D_Q:
18293       case TYPE_NEON_FP_DIV_S:
18294       case TYPE_NEON_FP_DIV_D:
18295       case TYPE_NEON_FP_DIV_S_Q:
18296       case TYPE_NEON_FP_DIV_D_Q:
18297         return false;
18298       default:
18299         return true;
18300     }
18301 }
18302
18303 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
18304
18305 static int
18306 aarch64_compute_pressure_classes (reg_class *classes)
18307 {
18308   int i = 0;
18309   classes[i++] = GENERAL_REGS;
18310   classes[i++] = FP_REGS;
18311   /* PR_REGS isn't a useful pressure class because many predicate pseudo
18312      registers need to go in PR_LO_REGS at some point during their
18313      lifetime.  Splitting it into two halves has the effect of making
18314      all predicates count against PR_LO_REGS, so that we try whenever
18315      possible to restrict the number of live predicates to 8.  This
18316      greatly reduces the amount of spilling in certain loops.  */
18317   classes[i++] = PR_LO_REGS;
18318   classes[i++] = PR_HI_REGS;
18319   return i;
18320 }
18321
18322 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
18323
18324 static bool
18325 aarch64_can_change_mode_class (machine_mode from,
18326                                machine_mode to, reg_class_t)
18327 {
18328   if (BYTES_BIG_ENDIAN)
18329     {
18330       bool from_sve_p = aarch64_sve_data_mode_p (from);
18331       bool to_sve_p = aarch64_sve_data_mode_p (to);
18332
18333       /* Don't allow changes between SVE data modes and non-SVE modes.
18334          See the comment at the head of aarch64-sve.md for details.  */
18335       if (from_sve_p != to_sve_p)
18336         return false;
18337
18338       /* Don't allow changes in element size: lane 0 of the new vector
18339          would not then be lane 0 of the old vector.  See the comment
18340          above aarch64_maybe_expand_sve_subreg_move for a more detailed
18341          description.
18342
18343          In the worst case, this forces a register to be spilled in
18344          one mode and reloaded in the other, which handles the
18345          endianness correctly.  */
18346       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18347         return false;
18348     }
18349   return true;
18350 }
18351
18352 /* Implement TARGET_EARLY_REMAT_MODES.  */
18353
18354 static void
18355 aarch64_select_early_remat_modes (sbitmap modes)
18356 {
18357   /* SVE values are not normally live across a call, so it should be
18358      worth doing early rematerialization even in VL-specific mode.  */
18359   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18360     {
18361       machine_mode mode = (machine_mode) i;
18362       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18363       if (vec_flags & VEC_ANY_SVE)
18364         bitmap_set_bit (modes, i);
18365     }
18366 }
18367
18368 /* Override the default target speculation_safe_value.  */
18369 static rtx
18370 aarch64_speculation_safe_value (machine_mode mode,
18371                                 rtx result, rtx val, rtx failval)
18372 {
18373   /* Maybe we should warn if falling back to hard barriers.  They are
18374      likely to be noticably more expensive than the alternative below.  */
18375   if (!aarch64_track_speculation)
18376     return default_speculation_safe_value (mode, result, val, failval);
18377
18378   if (!REG_P (val))
18379     val = copy_to_mode_reg (mode, val);
18380
18381   if (!aarch64_reg_or_zero (failval, mode))
18382     failval = copy_to_mode_reg (mode, failval);
18383
18384   emit_insn (gen_despeculate_copy (mode, result, val, failval));
18385   return result;
18386 }
18387
18388 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18389    Look into the tuning structure for an estimate.
18390    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18391    Advanced SIMD 128 bits.  */
18392
18393 static HOST_WIDE_INT
18394 aarch64_estimated_poly_value (poly_int64 val)
18395 {
18396   enum aarch64_sve_vector_bits_enum width_source
18397     = aarch64_tune_params.sve_width;
18398
18399   /* If we still don't have an estimate, use the default.  */
18400   if (width_source == SVE_SCALABLE)
18401     return default_estimated_poly_value (val);
18402
18403   HOST_WIDE_INT over_128 = width_source - 128;
18404   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18405 }
18406
18407 /* Target-specific selftests.  */
18408
18409 #if CHECKING_P
18410
18411 namespace selftest {
18412
18413 /* Selftest for the RTL loader.
18414    Verify that the RTL loader copes with a dump from
18415    print_rtx_function.  This is essentially just a test that class
18416    function_reader can handle a real dump, but it also verifies
18417    that lookup_reg_by_dump_name correctly handles hard regs.
18418    The presence of hard reg names in the dump means that the test is
18419    target-specific, hence it is in this file.  */
18420
18421 static void
18422 aarch64_test_loading_full_dump ()
18423 {
18424   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18425
18426   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18427
18428   rtx_insn *insn_1 = get_insn_by_uid (1);
18429   ASSERT_EQ (NOTE, GET_CODE (insn_1));
18430
18431   rtx_insn *insn_15 = get_insn_by_uid (15);
18432   ASSERT_EQ (INSN, GET_CODE (insn_15));
18433   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18434
18435   /* Verify crtl->return_rtx.  */
18436   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18437   ASSERT_EQ (0, REGNO (crtl->return_rtx));
18438   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18439 }
18440
18441 /* Run all target-specific selftests.  */
18442
18443 static void
18444 aarch64_run_selftests (void)
18445 {
18446   aarch64_test_loading_full_dump ();
18447 }
18448
18449 } // namespace selftest
18450
18451 #endif /* #if CHECKING_P */
18452
18453 #undef TARGET_ADDRESS_COST
18454 #define TARGET_ADDRESS_COST aarch64_address_cost
18455
18456 /* This hook will determines whether unnamed bitfields affect the alignment
18457    of the containing structure.  The hook returns true if the structure
18458    should inherit the alignment requirements of an unnamed bitfield's
18459    type.  */
18460 #undef TARGET_ALIGN_ANON_BITFIELD
18461 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18462
18463 #undef TARGET_ASM_ALIGNED_DI_OP
18464 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18465
18466 #undef TARGET_ASM_ALIGNED_HI_OP
18467 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18468
18469 #undef TARGET_ASM_ALIGNED_SI_OP
18470 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18471
18472 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18473 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18474   hook_bool_const_tree_hwi_hwi_const_tree_true
18475
18476 #undef TARGET_ASM_FILE_START
18477 #define TARGET_ASM_FILE_START aarch64_start_file
18478
18479 #undef TARGET_ASM_OUTPUT_MI_THUNK
18480 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18481
18482 #undef TARGET_ASM_SELECT_RTX_SECTION
18483 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18484
18485 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18486 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18487
18488 #undef TARGET_BUILD_BUILTIN_VA_LIST
18489 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18490
18491 #undef TARGET_CALLEE_COPIES
18492 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18493
18494 #undef TARGET_CAN_ELIMINATE
18495 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18496
18497 #undef TARGET_CAN_INLINE_P
18498 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18499
18500 #undef TARGET_CANNOT_FORCE_CONST_MEM
18501 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18502
18503 #undef TARGET_CASE_VALUES_THRESHOLD
18504 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18505
18506 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18507 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18508
18509 /* Only the least significant bit is used for initialization guard
18510    variables.  */
18511 #undef TARGET_CXX_GUARD_MASK_BIT
18512 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18513
18514 #undef TARGET_C_MODE_FOR_SUFFIX
18515 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18516
18517 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18518 #undef  TARGET_DEFAULT_TARGET_FLAGS
18519 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18520 #endif
18521
18522 #undef TARGET_CLASS_MAX_NREGS
18523 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18524
18525 #undef TARGET_BUILTIN_DECL
18526 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18527
18528 #undef TARGET_BUILTIN_RECIPROCAL
18529 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18530
18531 #undef TARGET_C_EXCESS_PRECISION
18532 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18533
18534 #undef  TARGET_EXPAND_BUILTIN
18535 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18536
18537 #undef TARGET_EXPAND_BUILTIN_VA_START
18538 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18539
18540 #undef TARGET_FOLD_BUILTIN
18541 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18542
18543 #undef TARGET_FUNCTION_ARG
18544 #define TARGET_FUNCTION_ARG aarch64_function_arg
18545
18546 #undef TARGET_FUNCTION_ARG_ADVANCE
18547 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18548
18549 #undef TARGET_FUNCTION_ARG_BOUNDARY
18550 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18551
18552 #undef TARGET_FUNCTION_ARG_PADDING
18553 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18554
18555 #undef TARGET_GET_RAW_RESULT_MODE
18556 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18557 #undef TARGET_GET_RAW_ARG_MODE
18558 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18559
18560 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18561 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18562
18563 #undef TARGET_FUNCTION_VALUE
18564 #define TARGET_FUNCTION_VALUE aarch64_function_value
18565
18566 #undef TARGET_FUNCTION_VALUE_REGNO_P
18567 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18568
18569 #undef TARGET_GIMPLE_FOLD_BUILTIN
18570 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18571
18572 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18573 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18574
18575 #undef  TARGET_INIT_BUILTINS
18576 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
18577
18578 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18579 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18580   aarch64_ira_change_pseudo_allocno_class
18581
18582 #undef TARGET_LEGITIMATE_ADDRESS_P
18583 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18584
18585 #undef TARGET_LEGITIMATE_CONSTANT_P
18586 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18587
18588 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18589 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18590   aarch64_legitimize_address_displacement
18591
18592 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18593 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18594
18595 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18596 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18597 aarch64_libgcc_floating_mode_supported_p
18598
18599 #undef TARGET_MANGLE_TYPE
18600 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18601
18602 #undef TARGET_MEMORY_MOVE_COST
18603 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18604
18605 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18606 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18607
18608 #undef TARGET_MUST_PASS_IN_STACK
18609 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18610
18611 /* This target hook should return true if accesses to volatile bitfields
18612    should use the narrowest mode possible.  It should return false if these
18613    accesses should use the bitfield container type.  */
18614 #undef TARGET_NARROW_VOLATILE_BITFIELD
18615 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18616
18617 #undef  TARGET_OPTION_OVERRIDE
18618 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18619
18620 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18621 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18622   aarch64_override_options_after_change
18623
18624 #undef TARGET_OPTION_SAVE
18625 #define TARGET_OPTION_SAVE aarch64_option_save
18626
18627 #undef TARGET_OPTION_RESTORE
18628 #define TARGET_OPTION_RESTORE aarch64_option_restore
18629
18630 #undef TARGET_OPTION_PRINT
18631 #define TARGET_OPTION_PRINT aarch64_option_print
18632
18633 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18634 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18635
18636 #undef TARGET_SET_CURRENT_FUNCTION
18637 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18638
18639 #undef TARGET_PASS_BY_REFERENCE
18640 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18641
18642 #undef TARGET_PREFERRED_RELOAD_CLASS
18643 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18644
18645 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18646 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18647
18648 #undef TARGET_PROMOTED_TYPE
18649 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18650
18651 #undef TARGET_SECONDARY_RELOAD
18652 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18653
18654 #undef TARGET_SHIFT_TRUNCATION_MASK
18655 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18656
18657 #undef TARGET_SETUP_INCOMING_VARARGS
18658 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18659
18660 #undef TARGET_STRUCT_VALUE_RTX
18661 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
18662
18663 #undef TARGET_REGISTER_MOVE_COST
18664 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18665
18666 #undef TARGET_RETURN_IN_MEMORY
18667 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18668
18669 #undef TARGET_RETURN_IN_MSB
18670 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18671
18672 #undef TARGET_RTX_COSTS
18673 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18674
18675 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18676 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18677
18678 #undef TARGET_SCHED_ISSUE_RATE
18679 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18680
18681 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18682 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18683   aarch64_sched_first_cycle_multipass_dfa_lookahead
18684
18685 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18686 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18687   aarch64_first_cycle_multipass_dfa_lookahead_guard
18688
18689 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18690 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18691   aarch64_get_separate_components
18692
18693 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18694 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18695   aarch64_components_for_bb
18696
18697 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18698 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18699   aarch64_disqualify_components
18700
18701 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18702 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18703   aarch64_emit_prologue_components
18704
18705 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18706 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18707   aarch64_emit_epilogue_components
18708
18709 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18710 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18711   aarch64_set_handled_components
18712
18713 #undef TARGET_TRAMPOLINE_INIT
18714 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18715
18716 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18717 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18718
18719 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18720 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18721
18722 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18723 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18724   aarch64_builtin_support_vector_misalignment
18725
18726 #undef TARGET_ARRAY_MODE
18727 #define TARGET_ARRAY_MODE aarch64_array_mode
18728
18729 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18730 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18731
18732 #undef TARGET_VECTORIZE_ADD_STMT_COST
18733 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18734
18735 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18736 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18737   aarch64_builtin_vectorization_cost
18738
18739 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18740 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18741
18742 #undef TARGET_VECTORIZE_BUILTINS
18743 #define TARGET_VECTORIZE_BUILTINS
18744
18745 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18746 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18747   aarch64_builtin_vectorized_function
18748
18749 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18750 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18751   aarch64_autovectorize_vector_sizes
18752
18753 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18754 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18755   aarch64_atomic_assign_expand_fenv
18756
18757 /* Section anchor support.  */
18758
18759 #undef TARGET_MIN_ANCHOR_OFFSET
18760 #define TARGET_MIN_ANCHOR_OFFSET -256
18761
18762 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18763    byte offset; we can do much more for larger data types, but have no way
18764    to determine the size of the access.  We assume accesses are aligned.  */
18765 #undef TARGET_MAX_ANCHOR_OFFSET
18766 #define TARGET_MAX_ANCHOR_OFFSET 4095
18767
18768 #undef TARGET_VECTOR_ALIGNMENT
18769 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18770
18771 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18772 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18773   aarch64_vectorize_preferred_vector_alignment
18774 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18775 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18776   aarch64_simd_vector_alignment_reachable
18777
18778 /* vec_perm support.  */
18779
18780 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18781 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18782   aarch64_vectorize_vec_perm_const
18783
18784 #undef TARGET_VECTORIZE_GET_MASK_MODE
18785 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18786 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18787 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18788   aarch64_empty_mask_is_expensive
18789 #undef TARGET_PREFERRED_ELSE_VALUE
18790 #define TARGET_PREFERRED_ELSE_VALUE \
18791   aarch64_preferred_else_value
18792
18793 #undef TARGET_INIT_LIBFUNCS
18794 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18795
18796 #undef TARGET_FIXED_CONDITION_CODE_REGS
18797 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18798
18799 #undef TARGET_FLAGS_REGNUM
18800 #define TARGET_FLAGS_REGNUM CC_REGNUM
18801
18802 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18803 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18804
18805 #undef TARGET_ASAN_SHADOW_OFFSET
18806 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18807
18808 #undef TARGET_LEGITIMIZE_ADDRESS
18809 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18810
18811 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18812 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18813
18814 #undef TARGET_CAN_USE_DOLOOP_P
18815 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18816
18817 #undef TARGET_SCHED_ADJUST_PRIORITY
18818 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18819
18820 #undef TARGET_SCHED_MACRO_FUSION_P
18821 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18822
18823 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18824 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18825
18826 #undef TARGET_SCHED_FUSION_PRIORITY
18827 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18828
18829 #undef TARGET_UNSPEC_MAY_TRAP_P
18830 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18831
18832 #undef TARGET_USE_PSEUDO_PIC_REG
18833 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18834
18835 #undef TARGET_PRINT_OPERAND
18836 #define TARGET_PRINT_OPERAND aarch64_print_operand
18837
18838 #undef TARGET_PRINT_OPERAND_ADDRESS
18839 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18840
18841 #undef TARGET_OPTAB_SUPPORTED_P
18842 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18843
18844 #undef TARGET_OMIT_STRUCT_RETURN_REG
18845 #define TARGET_OMIT_STRUCT_RETURN_REG true
18846
18847 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18848 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18849   aarch64_dwarf_poly_indeterminate_value
18850
18851 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
18852 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18853 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18854
18855 #undef TARGET_HARD_REGNO_NREGS
18856 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18857 #undef TARGET_HARD_REGNO_MODE_OK
18858 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18859
18860 #undef TARGET_MODES_TIEABLE_P
18861 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18862
18863 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18864 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18865   aarch64_hard_regno_call_part_clobbered
18866
18867 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
18868 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
18869   aarch64_remove_extra_call_preserved_regs
18870
18871 #undef TARGET_CONSTANT_ALIGNMENT
18872 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18873
18874 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
18875 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
18876   aarch64_stack_clash_protection_alloca_probe_range
18877
18878 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18879 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18880
18881 #undef TARGET_CAN_CHANGE_MODE_CLASS
18882 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18883
18884 #undef TARGET_SELECT_EARLY_REMAT_MODES
18885 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18886
18887 #undef TARGET_SPECULATION_SAFE_VALUE
18888 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18889
18890 #undef TARGET_ESTIMATED_POLY_VALUE
18891 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
18892
18893 #undef TARGET_ATTRIBUTE_TABLE
18894 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18895
18896 #if CHECKING_P
18897 #undef TARGET_RUN_TARGET_SELFTESTS
18898 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18899 #endif /* #if CHECKING_P */
18900
18901 struct gcc_target targetm = TARGET_INITIALIZER;
18902
18903 #include "gt-aarch64.h"