gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2019 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #define IN_TARGET_CODE 1
  22
  23 #include "config.h"
  24 #define INCLUDE_STRING
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "memmodel.h"
  32 #include "gimple.h"
  33 #include "cfghooks.h"
  34 #include "cfgloop.h"
  35 #include "df.h"
  36 #include "tm_p.h"
  37 #include "stringpool.h"
  38 #include "attribs.h"
  39 #include "optabs.h"
  40 #include "regs.h"
  41 #include "emit-rtl.h"
  42 #include "recog.h"
  43 #include "diagnostic.h"
  44 #include "insn-attr.h"
  45 #include "alias.h"
  46 #include "fold-const.h"
  47 #include "stor-layout.h"
  48 #include "calls.h"
  49 #include "varasm.h"
  50 #include "output.h"
  51 #include "flags.h"
  52 #include "explow.h"
  53 #include "expr.h"
  54 #include "reload.h"
  55 #include "langhooks.h"
  56 #include "opts.h"
  57 #include "params.h"
  58 #include "gimplify.h"
  59 #include "dwarf2.h"
  60 #include "gimple-iterator.h"
  61 #include "tree-vectorizer.h"
  62 #include "aarch64-cost-tables.h"
  63 #include "dumpfile.h"
  64 #include "builtins.h"
  65 #include "rtl-iter.h"
  66 #include "tm-constrs.h"
  67 #include "sched-int.h"
  68 #include "target-globals.h"
  69 #include "common/common-target.h"
  70 #include "cfgrtl.h"
  71 #include "selftest.h"
  72 #include "selftest-rtl.h"
  73 #include "rtx-vector-builder.h"
  74
  75 /* This file should be included last.  */
  76 #include "target-def.h"
  77
  78 /* Defined for convenience.  */
  79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  80
  81 /* Information about a legitimate vector immediate operand.  */
  82 struct simd_immediate_info
  83 {
  84   enum insn_type { MOV, MVN };
  85   enum modifier_type { LSL, MSL };
  86
  87   simd_immediate_info () {}
  88   simd_immediate_info (scalar_float_mode, rtx);
  89   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
  90                        insn_type = MOV, modifier_type = LSL,
  91                        unsigned int = 0);
  92   simd_immediate_info (scalar_mode, rtx, rtx);
  93
  94   /* The mode of the elements.  */
  95   scalar_mode elt_mode;
  96
  97   /* The value of each element if all elements are the same, or the
  98      first value if the constant is a series.  */
  99   rtx value;
 100
 101   /* The value of the step if the constant is a series, null otherwise.  */
 102   rtx step;
 103
 104   /* The instruction to use to move the immediate into a vector.  */
 105   insn_type insn;
 106
 107   /* The kind of shift modifier to use, and the number of bits to shift.
 108      This is (LSL, 0) if no shift is needed.  */
 109   modifier_type modifier;
 110   unsigned int shift;
 111 };
 112
 113 /* Construct a floating-point immediate in which each element has mode
 114    ELT_MODE_IN and value VALUE_IN.  */
 115 inline simd_immediate_info
 116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
 117   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
 118     modifier (LSL), shift (0)
 119 {}
 120
 121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 122    and value VALUE_IN.  The other parameters are as for the structure
 123    fields.  */
 124 inline simd_immediate_info
 125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 126                        unsigned HOST_WIDE_INT value_in,
 127                        insn_type insn_in, modifier_type modifier_in,
 128                        unsigned int shift_in)
 129   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
 130     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
 131 {}
 132
 133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 134    and where element I is equal to VALUE_IN + I * STEP_IN.  */
 135 inline simd_immediate_info
 136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
 137   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
 138     modifier (LSL), shift (0)
 139 {}
 140
 141 /* The current code model.  */
 142 enum aarch64_code_model aarch64_cmodel;
 143
 144 /* The number of 64-bit elements in an SVE vector.  */
 145 poly_uint16 aarch64_sve_vg;
 146
 147 #ifdef HAVE_AS_TLS
 148 #undef TARGET_HAVE_TLS
 149 #define TARGET_HAVE_TLS 1
 150 #endif
 151
 152 static bool aarch64_composite_type_p (const_tree, machine_mode);
 153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 154                                                      const_tree,
 155                                                      machine_mode *, int *,
 156                                                      bool *);
 157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 159 static void aarch64_override_options_after_change (void);
 160 static bool aarch64_vector_mode_supported_p (machine_mode);
 161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 163                                                          const_tree type,
 164                                                          int misalignment,
 165                                                          bool is_packed);
 166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 168                                             aarch64_addr_query_type);
 169 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 170
 171 /* Major revision number of the ARM Architecture implemented by the target.  */
 172 unsigned aarch64_architecture_version;
 173
 174 /* The processor for which instructions should be scheduled.  */
 175 enum aarch64_processor aarch64_tune = cortexa53;
 176
 177 /* Mask to specify which instruction scheduling options should be used.  */
 178 unsigned long aarch64_tune_flags = 0;
 179
 180 /* Global flag for PC relative loads.  */
 181 bool aarch64_pcrelative_literal_loads;
 182
 183 /* Global flag for whether frame pointer is enabled.  */
 184 bool aarch64_use_frame_pointer;
 185
 186 #define BRANCH_PROTECT_STR_MAX 255
 187 char *accepted_branch_protection_string = NULL;
 188
 189 static enum aarch64_parse_opt_result
 190 aarch64_parse_branch_protection (const char*, char**);
 191
 192 /* Support for command line parsing of boolean flags in the tuning
 193    structures.  */
 194 struct aarch64_flag_desc
 195 {
 196   const char* name;
 197   unsigned int flag;
 198 };
 199
 200 #define AARCH64_FUSION_PAIR(name, internal_name) \
 201   { name, AARCH64_FUSE_##internal_name },
 202 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 203 {
 204   { "none", AARCH64_FUSE_NOTHING },
 205 #include "aarch64-fusion-pairs.def"
 206   { "all", AARCH64_FUSE_ALL },
 207   { NULL, AARCH64_FUSE_NOTHING }
 208 };
 209
 210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 211   { name, AARCH64_EXTRA_TUNE_##internal_name },
 212 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 213 {
 214   { "none", AARCH64_EXTRA_TUNE_NONE },
 215 #include "aarch64-tuning-flags.def"
 216   { "all", AARCH64_EXTRA_TUNE_ALL },
 217   { NULL, AARCH64_EXTRA_TUNE_NONE }
 218 };
 219
 220 /* Tuning parameters.  */
 221
 222 static const struct cpu_addrcost_table generic_addrcost_table =
 223 {
 224     {
 225       1, /* hi  */
 226       0, /* si  */
 227       0, /* di  */
 228       1, /* ti  */
 229     },
 230   0, /* pre_modify  */
 231   0, /* post_modify  */
 232   0, /* register_offset  */
 233   0, /* register_sextend  */
 234   0, /* register_zextend  */
 235   0 /* imm_offset  */
 236 };
 237
 238 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 239 {
 240     {
 241       0, /* hi  */
 242       0, /* si  */
 243       0, /* di  */
 244       2, /* ti  */
 245     },
 246   0, /* pre_modify  */
 247   0, /* post_modify  */
 248   1, /* register_offset  */
 249   1, /* register_sextend  */
 250   2, /* register_zextend  */
 251   0, /* imm_offset  */
 252 };
 253
 254 static const struct cpu_addrcost_table xgene1_addrcost_table =
 255 {
 256     {
 257       1, /* hi  */
 258       0, /* si  */
 259       0, /* di  */
 260       1, /* ti  */
 261     },
 262   1, /* pre_modify  */
 263   1, /* post_modify  */
 264   0, /* register_offset  */
 265   1, /* register_sextend  */
 266   1, /* register_zextend  */
 267   0, /* imm_offset  */
 268 };
 269
 270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 271 {
 272     {
 273       1, /* hi  */
 274       1, /* si  */
 275       1, /* di  */
 276       2, /* ti  */
 277     },
 278   0, /* pre_modify  */
 279   0, /* post_modify  */
 280   2, /* register_offset  */
 281   3, /* register_sextend  */
 282   3, /* register_zextend  */
 283   0, /* imm_offset  */
 284 };
 285
 286 static const struct cpu_addrcost_table tsv110_addrcost_table =
 287 {
 288     {
 289       1, /* hi  */
 290       0, /* si  */
 291       0, /* di  */
 292       1, /* ti  */
 293     },
 294   0, /* pre_modify  */
 295   0, /* post_modify  */
 296   0, /* register_offset  */
 297   1, /* register_sextend  */
 298   1, /* register_zextend  */
 299   0, /* imm_offset  */
 300 };
 301
 302 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 303 {
 304     {
 305       1, /* hi  */
 306       1, /* si  */
 307       1, /* di  */
 308       2, /* ti  */
 309     },
 310   1, /* pre_modify  */
 311   1, /* post_modify  */
 312   3, /* register_offset  */
 313   3, /* register_sextend  */
 314   3, /* register_zextend  */
 315   2, /* imm_offset  */
 316 };
 317
 318 static const struct cpu_regmove_cost generic_regmove_cost =
 319 {
 320   1, /* GP2GP  */
 321   /* Avoid the use of slow int<->fp moves for spilling by setting
 322      their cost higher than memmov_cost.  */
 323   5, /* GP2FP  */
 324   5, /* FP2GP  */
 325   2 /* FP2FP  */
 326 };
 327
 328 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 329 {
 330   1, /* GP2GP  */
 331   /* Avoid the use of slow int<->fp moves for spilling by setting
 332      their cost higher than memmov_cost.  */
 333   5, /* GP2FP  */
 334   5, /* FP2GP  */
 335   2 /* FP2FP  */
 336 };
 337
 338 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 339 {
 340   1, /* GP2GP  */
 341   /* Avoid the use of slow int<->fp moves for spilling by setting
 342      their cost higher than memmov_cost.  */
 343   5, /* GP2FP  */
 344   5, /* FP2GP  */
 345   2 /* FP2FP  */
 346 };
 347
 348 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 349 {
 350   1, /* GP2GP  */
 351   /* Avoid the use of slow int<->fp moves for spilling by setting
 352      their cost higher than memmov_cost (actual, 4 and 9).  */
 353   9, /* GP2FP  */
 354   9, /* FP2GP  */
 355   1 /* FP2FP  */
 356 };
 357
 358 static const struct cpu_regmove_cost thunderx_regmove_cost =
 359 {
 360   2, /* GP2GP  */
 361   2, /* GP2FP  */
 362   6, /* FP2GP  */
 363   4 /* FP2FP  */
 364 };
 365
 366 static const struct cpu_regmove_cost xgene1_regmove_cost =
 367 {
 368   1, /* GP2GP  */
 369   /* Avoid the use of slow int<->fp moves for spilling by setting
 370      their cost higher than memmov_cost.  */
 371   8, /* GP2FP  */
 372   8, /* FP2GP  */
 373   2 /* FP2FP  */
 374 };
 375
 376 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 377 {
 378   2, /* GP2GP  */
 379   /* Avoid the use of int<->fp moves for spilling.  */
 380   6, /* GP2FP  */
 381   6, /* FP2GP  */
 382   4 /* FP2FP  */
 383 };
 384
 385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 386 {
 387   1, /* GP2GP  */
 388   /* Avoid the use of int<->fp moves for spilling.  */
 389   8, /* GP2FP  */
 390   8, /* FP2GP  */
 391   4  /* FP2FP  */
 392 };
 393
 394 static const struct cpu_regmove_cost tsv110_regmove_cost =
 395 {
 396   1, /* GP2GP  */
 397   /* Avoid the use of slow int<->fp moves for spilling by setting
 398      their cost higher than memmov_cost.  */
 399   2, /* GP2FP  */
 400   3, /* FP2GP  */
 401   2  /* FP2FP  */
 402 };
 403
 404 /* Generic costs for vector insn classes.  */
 405 static const struct cpu_vector_cost generic_vector_cost =
 406 {
 407   1, /* scalar_int_stmt_cost  */
 408   1, /* scalar_fp_stmt_cost  */
 409   1, /* scalar_load_cost  */
 410   1, /* scalar_store_cost  */
 411   1, /* vec_int_stmt_cost  */
 412   1, /* vec_fp_stmt_cost  */
 413   2, /* vec_permute_cost  */
 414   1, /* vec_to_scalar_cost  */
 415   1, /* scalar_to_vec_cost  */
 416   1, /* vec_align_load_cost  */
 417   1, /* vec_unalign_load_cost  */
 418   1, /* vec_unalign_store_cost  */
 419   1, /* vec_store_cost  */
 420   3, /* cond_taken_branch_cost  */
 421   1 /* cond_not_taken_branch_cost  */
 422 };
 423
 424 /* QDF24XX costs for vector insn classes.  */
 425 static const struct cpu_vector_cost qdf24xx_vector_cost =
 426 {
 427   1, /* scalar_int_stmt_cost  */
 428   1, /* scalar_fp_stmt_cost  */
 429   1, /* scalar_load_cost  */
 430   1, /* scalar_store_cost  */
 431   1, /* vec_int_stmt_cost  */
 432   3, /* vec_fp_stmt_cost  */
 433   2, /* vec_permute_cost  */
 434   1, /* vec_to_scalar_cost  */
 435   1, /* scalar_to_vec_cost  */
 436   1, /* vec_align_load_cost  */
 437   1, /* vec_unalign_load_cost  */
 438   1, /* vec_unalign_store_cost  */
 439   1, /* vec_store_cost  */
 440   3, /* cond_taken_branch_cost  */
 441   1 /* cond_not_taken_branch_cost  */
 442 };
 443
 444 /* ThunderX costs for vector insn classes.  */
 445 static const struct cpu_vector_cost thunderx_vector_cost =
 446 {
 447   1, /* scalar_int_stmt_cost  */
 448   1, /* scalar_fp_stmt_cost  */
 449   3, /* scalar_load_cost  */
 450   1, /* scalar_store_cost  */
 451   4, /* vec_int_stmt_cost  */
 452   1, /* vec_fp_stmt_cost  */
 453   4, /* vec_permute_cost  */
 454   2, /* vec_to_scalar_cost  */
 455   2, /* scalar_to_vec_cost  */
 456   3, /* vec_align_load_cost  */
 457   5, /* vec_unalign_load_cost  */
 458   5, /* vec_unalign_store_cost  */
 459   1, /* vec_store_cost  */
 460   3, /* cond_taken_branch_cost  */
 461   3 /* cond_not_taken_branch_cost  */
 462 };
 463
 464 static const struct cpu_vector_cost tsv110_vector_cost =
 465 {
 466   1, /* scalar_int_stmt_cost  */
 467   1, /* scalar_fp_stmt_cost  */
 468   5, /* scalar_load_cost  */
 469   1, /* scalar_store_cost  */
 470   2, /* vec_int_stmt_cost  */
 471   2, /* vec_fp_stmt_cost  */
 472   2, /* vec_permute_cost  */
 473   3, /* vec_to_scalar_cost  */
 474   2, /* scalar_to_vec_cost  */
 475   5, /* vec_align_load_cost  */
 476   5, /* vec_unalign_load_cost  */
 477   1, /* vec_unalign_store_cost  */
 478   1, /* vec_store_cost  */
 479   1, /* cond_taken_branch_cost  */
 480   1 /* cond_not_taken_branch_cost  */
 481 };
 482
 483 /* Generic costs for vector insn classes.  */
 484 static const struct cpu_vector_cost cortexa57_vector_cost =
 485 {
 486   1, /* scalar_int_stmt_cost  */
 487   1, /* scalar_fp_stmt_cost  */
 488   4, /* scalar_load_cost  */
 489   1, /* scalar_store_cost  */
 490   2, /* vec_int_stmt_cost  */
 491   2, /* vec_fp_stmt_cost  */
 492   3, /* vec_permute_cost  */
 493   8, /* vec_to_scalar_cost  */
 494   8, /* scalar_to_vec_cost  */
 495   4, /* vec_align_load_cost  */
 496   4, /* vec_unalign_load_cost  */
 497   1, /* vec_unalign_store_cost  */
 498   1, /* vec_store_cost  */
 499   1, /* cond_taken_branch_cost  */
 500   1 /* cond_not_taken_branch_cost  */
 501 };
 502
 503 static const struct cpu_vector_cost exynosm1_vector_cost =
 504 {
 505   1, /* scalar_int_stmt_cost  */
 506   1, /* scalar_fp_stmt_cost  */
 507   5, /* scalar_load_cost  */
 508   1, /* scalar_store_cost  */
 509   3, /* vec_int_stmt_cost  */
 510   3, /* vec_fp_stmt_cost  */
 511   3, /* vec_permute_cost  */
 512   3, /* vec_to_scalar_cost  */
 513   3, /* scalar_to_vec_cost  */
 514   5, /* vec_align_load_cost  */
 515   5, /* vec_unalign_load_cost  */
 516   1, /* vec_unalign_store_cost  */
 517   1, /* vec_store_cost  */
 518   1, /* cond_taken_branch_cost  */
 519   1 /* cond_not_taken_branch_cost  */
 520 };
 521
 522 /* Generic costs for vector insn classes.  */
 523 static const struct cpu_vector_cost xgene1_vector_cost =
 524 {
 525   1, /* scalar_int_stmt_cost  */
 526   1, /* scalar_fp_stmt_cost  */
 527   5, /* scalar_load_cost  */
 528   1, /* scalar_store_cost  */
 529   2, /* vec_int_stmt_cost  */
 530   2, /* vec_fp_stmt_cost  */
 531   2, /* vec_permute_cost  */
 532   4, /* vec_to_scalar_cost  */
 533   4, /* scalar_to_vec_cost  */
 534   10, /* vec_align_load_cost  */
 535   10, /* vec_unalign_load_cost  */
 536   2, /* vec_unalign_store_cost  */
 537   2, /* vec_store_cost  */
 538   2, /* cond_taken_branch_cost  */
 539   1 /* cond_not_taken_branch_cost  */
 540 };
 541
 542 /* Costs for vector insn classes for Vulcan.  */
 543 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 544 {
 545   1, /* scalar_int_stmt_cost  */
 546   6, /* scalar_fp_stmt_cost  */
 547   4, /* scalar_load_cost  */
 548   1, /* scalar_store_cost  */
 549   5, /* vec_int_stmt_cost  */
 550   6, /* vec_fp_stmt_cost  */
 551   3, /* vec_permute_cost  */
 552   6, /* vec_to_scalar_cost  */
 553   5, /* scalar_to_vec_cost  */
 554   8, /* vec_align_load_cost  */
 555   8, /* vec_unalign_load_cost  */
 556   4, /* vec_unalign_store_cost  */
 557   4, /* vec_store_cost  */
 558   2, /* cond_taken_branch_cost  */
 559   1  /* cond_not_taken_branch_cost  */
 560 };
 561
 562 /* Generic costs for branch instructions.  */
 563 static const struct cpu_branch_cost generic_branch_cost =
 564 {
 565   1,  /* Predictable.  */
 566   3   /* Unpredictable.  */
 567 };
 568
 569 /* Generic approximation modes.  */
 570 static const cpu_approx_modes generic_approx_modes =
 571 {
 572   AARCH64_APPROX_NONE,  /* division  */
 573   AARCH64_APPROX_NONE,  /* sqrt  */
 574   AARCH64_APPROX_NONE   /* recip_sqrt  */
 575 };
 576
 577 /* Approximation modes for Exynos M1.  */
 578 static const cpu_approx_modes exynosm1_approx_modes =
 579 {
 580   AARCH64_APPROX_NONE,  /* division  */
 581   AARCH64_APPROX_ALL,   /* sqrt  */
 582   AARCH64_APPROX_ALL    /* recip_sqrt  */
 583 };
 584
 585 /* Approximation modes for X-Gene 1.  */
 586 static const cpu_approx_modes xgene1_approx_modes =
 587 {
 588   AARCH64_APPROX_NONE,  /* division  */
 589   AARCH64_APPROX_NONE,  /* sqrt  */
 590   AARCH64_APPROX_ALL    /* recip_sqrt  */
 591 };
 592
 593 /* Generic prefetch settings (which disable prefetch).  */
 594 static const cpu_prefetch_tune generic_prefetch_tune =
 595 {
 596   0,                    /* num_slots  */
 597   -1,                   /* l1_cache_size  */
 598   -1,                   /* l1_cache_line_size  */
 599   -1,                   /* l2_cache_size  */
 600   true,                 /* prefetch_dynamic_strides */
 601   -1,                   /* minimum_stride */
 602   -1                    /* default_opt_level  */
 603 };
 604
 605 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 606 {
 607   0,                    /* num_slots  */
 608   -1,                   /* l1_cache_size  */
 609   64,                   /* l1_cache_line_size  */
 610   -1,                   /* l2_cache_size  */
 611   true,                 /* prefetch_dynamic_strides */
 612   -1,                   /* minimum_stride */
 613   -1                    /* default_opt_level  */
 614 };
 615
 616 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 617 {
 618   4,                    /* num_slots  */
 619   32,                   /* l1_cache_size  */
 620   64,                   /* l1_cache_line_size  */
 621   512,                  /* l2_cache_size  */
 622   false,                /* prefetch_dynamic_strides */
 623   2048,                 /* minimum_stride */
 624   3                     /* default_opt_level  */
 625 };
 626
 627 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 628 {
 629   8,                    /* num_slots  */
 630   32,                   /* l1_cache_size  */
 631   128,                  /* l1_cache_line_size  */
 632   16*1024,              /* l2_cache_size  */
 633   true,                 /* prefetch_dynamic_strides */
 634   -1,                   /* minimum_stride */
 635   3                     /* default_opt_level  */
 636 };
 637
 638 static const cpu_prefetch_tune thunderx_prefetch_tune =
 639 {
 640   8,                    /* num_slots  */
 641   32,                   /* l1_cache_size  */
 642   128,                  /* l1_cache_line_size  */
 643   -1,                   /* l2_cache_size  */
 644   true,                 /* prefetch_dynamic_strides */
 645   -1,                   /* minimum_stride */
 646   -1                    /* default_opt_level  */
 647 };
 648
 649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 650 {
 651   8,                    /* num_slots  */
 652   32,                   /* l1_cache_size  */
 653   64,                   /* l1_cache_line_size  */
 654   256,                  /* l2_cache_size  */
 655   true,                 /* prefetch_dynamic_strides */
 656   -1,                   /* minimum_stride */
 657   -1                    /* default_opt_level  */
 658 };
 659
 660 static const cpu_prefetch_tune tsv110_prefetch_tune =
 661 {
 662   0,                    /* num_slots  */
 663   64,                   /* l1_cache_size  */
 664   64,                   /* l1_cache_line_size  */
 665   512,                  /* l2_cache_size  */
 666   true,                 /* prefetch_dynamic_strides */
 667   -1,                   /* minimum_stride */
 668   -1                    /* default_opt_level  */
 669 };
 670
 671 static const cpu_prefetch_tune xgene1_prefetch_tune =
 672 {
 673   8,                    /* num_slots  */
 674   32,                   /* l1_cache_size  */
 675   64,                   /* l1_cache_line_size  */
 676   256,                  /* l2_cache_size  */
 677   true,                 /* prefetch_dynamic_strides */
 678   -1,                   /* minimum_stride */
 679   -1                    /* default_opt_level  */
 680 };
 681
 682 static const struct tune_params generic_tunings =
 683 {
 684   &cortexa57_extra_costs,
 685   &generic_addrcost_table,
 686   &generic_regmove_cost,
 687   &generic_vector_cost,
 688   &generic_branch_cost,
 689   &generic_approx_modes,
 690   SVE_NOT_IMPLEMENTED, /* sve_width  */
 691   4, /* memmov_cost  */
 692   2, /* issue_rate  */
 693   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 694   "8",  /* function_align.  */
 695   "4",  /* jump_align.  */
 696   "8",  /* loop_align.  */
 697   2,    /* int_reassoc_width.  */
 698   4,    /* fp_reassoc_width.  */
 699   1,    /* vec_reassoc_width.  */
 700   2,    /* min_div_recip_mul_sf.  */
 701   2,    /* min_div_recip_mul_df.  */
 702   0,    /* max_case_values.  */
 703   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 704   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 705   &generic_prefetch_tune
 706 };
 707
 708 static const struct tune_params cortexa35_tunings =
 709 {
 710   &cortexa53_extra_costs,
 711   &generic_addrcost_table,
 712   &cortexa53_regmove_cost,
 713   &generic_vector_cost,
 714   &generic_branch_cost,
 715   &generic_approx_modes,
 716   SVE_NOT_IMPLEMENTED, /* sve_width  */
 717   4, /* memmov_cost  */
 718   1, /* issue_rate  */
 719   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 720    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 721   "16", /* function_align.  */
 722   "4",  /* jump_align.  */
 723   "8",  /* loop_align.  */
 724   2,    /* int_reassoc_width.  */
 725   4,    /* fp_reassoc_width.  */
 726   1,    /* vec_reassoc_width.  */
 727   2,    /* min_div_recip_mul_sf.  */
 728   2,    /* min_div_recip_mul_df.  */
 729   0,    /* max_case_values.  */
 730   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 731   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 732   &generic_prefetch_tune
 733 };
 734
 735 static const struct tune_params cortexa53_tunings =
 736 {
 737   &cortexa53_extra_costs,
 738   &generic_addrcost_table,
 739   &cortexa53_regmove_cost,
 740   &generic_vector_cost,
 741   &generic_branch_cost,
 742   &generic_approx_modes,
 743   SVE_NOT_IMPLEMENTED, /* sve_width  */
 744   4, /* memmov_cost  */
 745   2, /* issue_rate  */
 746   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 747    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 748   "16", /* function_align.  */
 749   "4",  /* jump_align.  */
 750   "8",  /* loop_align.  */
 751   2,    /* int_reassoc_width.  */
 752   4,    /* fp_reassoc_width.  */
 753   1,    /* vec_reassoc_width.  */
 754   2,    /* min_div_recip_mul_sf.  */
 755   2,    /* min_div_recip_mul_df.  */
 756   0,    /* max_case_values.  */
 757   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 758   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 759   &generic_prefetch_tune
 760 };
 761
 762 static const struct tune_params cortexa57_tunings =
 763 {
 764   &cortexa57_extra_costs,
 765   &generic_addrcost_table,
 766   &cortexa57_regmove_cost,
 767   &cortexa57_vector_cost,
 768   &generic_branch_cost,
 769   &generic_approx_modes,
 770   SVE_NOT_IMPLEMENTED, /* sve_width  */
 771   4, /* memmov_cost  */
 772   3, /* issue_rate  */
 773   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 774    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 775   "16", /* function_align.  */
 776   "4",  /* jump_align.  */
 777   "8",  /* loop_align.  */
 778   2,    /* int_reassoc_width.  */
 779   4,    /* fp_reassoc_width.  */
 780   1,    /* vec_reassoc_width.  */
 781   2,    /* min_div_recip_mul_sf.  */
 782   2,    /* min_div_recip_mul_df.  */
 783   0,    /* max_case_values.  */
 784   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 785   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 786   &generic_prefetch_tune
 787 };
 788
 789 static const struct tune_params cortexa72_tunings =
 790 {
 791   &cortexa57_extra_costs,
 792   &generic_addrcost_table,
 793   &cortexa57_regmove_cost,
 794   &cortexa57_vector_cost,
 795   &generic_branch_cost,
 796   &generic_approx_modes,
 797   SVE_NOT_IMPLEMENTED, /* sve_width  */
 798   4, /* memmov_cost  */
 799   3, /* issue_rate  */
 800   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 801    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 802   "16", /* function_align.  */
 803   "4",  /* jump_align.  */
 804   "8",  /* loop_align.  */
 805   2,    /* int_reassoc_width.  */
 806   4,    /* fp_reassoc_width.  */
 807   1,    /* vec_reassoc_width.  */
 808   2,    /* min_div_recip_mul_sf.  */
 809   2,    /* min_div_recip_mul_df.  */
 810   0,    /* max_case_values.  */
 811   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 812   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 813   &generic_prefetch_tune
 814 };
 815
 816 static const struct tune_params cortexa73_tunings =
 817 {
 818   &cortexa57_extra_costs,
 819   &generic_addrcost_table,
 820   &cortexa57_regmove_cost,
 821   &cortexa57_vector_cost,
 822   &generic_branch_cost,
 823   &generic_approx_modes,
 824   SVE_NOT_IMPLEMENTED, /* sve_width  */
 825   4, /* memmov_cost.  */
 826   2, /* issue_rate.  */
 827   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 828    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 829   "16", /* function_align.  */
 830   "4",  /* jump_align.  */
 831   "8",  /* loop_align.  */
 832   2,    /* int_reassoc_width.  */
 833   4,    /* fp_reassoc_width.  */
 834   1,    /* vec_reassoc_width.  */
 835   2,    /* min_div_recip_mul_sf.  */
 836   2,    /* min_div_recip_mul_df.  */
 837   0,    /* max_case_values.  */
 838   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 839   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 840   &generic_prefetch_tune
 841 };
 842
 843
 844
 845 static const struct tune_params exynosm1_tunings =
 846 {
 847   &exynosm1_extra_costs,
 848   &exynosm1_addrcost_table,
 849   &exynosm1_regmove_cost,
 850   &exynosm1_vector_cost,
 851   &generic_branch_cost,
 852   &exynosm1_approx_modes,
 853   SVE_NOT_IMPLEMENTED, /* sve_width  */
 854   4,    /* memmov_cost  */
 855   3,    /* issue_rate  */
 856   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 857   "4",  /* function_align.  */
 858   "4",  /* jump_align.  */
 859   "4",  /* loop_align.  */
 860   2,    /* int_reassoc_width.  */
 861   4,    /* fp_reassoc_width.  */
 862   1,    /* vec_reassoc_width.  */
 863   2,    /* min_div_recip_mul_sf.  */
 864   2,    /* min_div_recip_mul_df.  */
 865   48,   /* max_case_values.  */
 866   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 867   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 868   &exynosm1_prefetch_tune
 869 };
 870
 871 static const struct tune_params thunderxt88_tunings =
 872 {
 873   &thunderx_extra_costs,
 874   &generic_addrcost_table,
 875   &thunderx_regmove_cost,
 876   &thunderx_vector_cost,
 877   &generic_branch_cost,
 878   &generic_approx_modes,
 879   SVE_NOT_IMPLEMENTED, /* sve_width  */
 880   6, /* memmov_cost  */
 881   2, /* issue_rate  */
 882   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 883   "8",  /* function_align.  */
 884   "8",  /* jump_align.  */
 885   "8",  /* loop_align.  */
 886   2,    /* int_reassoc_width.  */
 887   4,    /* fp_reassoc_width.  */
 888   1,    /* vec_reassoc_width.  */
 889   2,    /* min_div_recip_mul_sf.  */
 890   2,    /* min_div_recip_mul_df.  */
 891   0,    /* max_case_values.  */
 892   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 893   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 894   &thunderxt88_prefetch_tune
 895 };
 896
 897 static const struct tune_params thunderx_tunings =
 898 {
 899   &thunderx_extra_costs,
 900   &generic_addrcost_table,
 901   &thunderx_regmove_cost,
 902   &thunderx_vector_cost,
 903   &generic_branch_cost,
 904   &generic_approx_modes,
 905   SVE_NOT_IMPLEMENTED, /* sve_width  */
 906   6, /* memmov_cost  */
 907   2, /* issue_rate  */
 908   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 909   "8",  /* function_align.  */
 910   "8",  /* jump_align.  */
 911   "8",  /* loop_align.  */
 912   2,    /* int_reassoc_width.  */
 913   4,    /* fp_reassoc_width.  */
 914   1,    /* vec_reassoc_width.  */
 915   2,    /* min_div_recip_mul_sf.  */
 916   2,    /* min_div_recip_mul_df.  */
 917   0,    /* max_case_values.  */
 918   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 919   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 920    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 921   &thunderx_prefetch_tune
 922 };
 923
 924 static const struct tune_params tsv110_tunings =
 925 {
 926   &tsv110_extra_costs,
 927   &tsv110_addrcost_table,
 928   &tsv110_regmove_cost,
 929   &tsv110_vector_cost,
 930   &generic_branch_cost,
 931   &generic_approx_modes,
 932   SVE_NOT_IMPLEMENTED, /* sve_width  */
 933   4,    /* memmov_cost  */
 934   4,    /* issue_rate  */
 935   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
 936    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 937   "16", /* function_align.  */
 938   "4",  /* jump_align.  */
 939   "8",  /* loop_align.  */
 940   2,    /* int_reassoc_width.  */
 941   4,    /* fp_reassoc_width.  */
 942   1,    /* vec_reassoc_width.  */
 943   2,    /* min_div_recip_mul_sf.  */
 944   2,    /* min_div_recip_mul_df.  */
 945   0,    /* max_case_values.  */
 946   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 947   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
 948   &tsv110_prefetch_tune
 949 };
 950
 951 static const struct tune_params xgene1_tunings =
 952 {
 953   &xgene1_extra_costs,
 954   &xgene1_addrcost_table,
 955   &xgene1_regmove_cost,
 956   &xgene1_vector_cost,
 957   &generic_branch_cost,
 958   &xgene1_approx_modes,
 959   SVE_NOT_IMPLEMENTED, /* sve_width  */
 960   6, /* memmov_cost  */
 961   4, /* issue_rate  */
 962   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 963   "16", /* function_align.  */
 964   "16", /* jump_align.  */
 965   "16", /* loop_align.  */
 966   2,    /* int_reassoc_width.  */
 967   4,    /* fp_reassoc_width.  */
 968   1,    /* vec_reassoc_width.  */
 969   2,    /* min_div_recip_mul_sf.  */
 970   2,    /* min_div_recip_mul_df.  */
 971   17,   /* max_case_values.  */
 972   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 973   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
 974   &xgene1_prefetch_tune
 975 };
 976
 977 static const struct tune_params emag_tunings =
 978 {
 979   &xgene1_extra_costs,
 980   &xgene1_addrcost_table,
 981   &xgene1_regmove_cost,
 982   &xgene1_vector_cost,
 983   &generic_branch_cost,
 984   &xgene1_approx_modes,
 985   SVE_NOT_IMPLEMENTED,
 986   6, /* memmov_cost  */
 987   4, /* issue_rate  */
 988   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 989   "16", /* function_align.  */
 990   "16", /* jump_align.  */
 991   "16", /* loop_align.  */
 992   2,    /* int_reassoc_width.  */
 993   4,    /* fp_reassoc_width.  */
 994   1,    /* vec_reassoc_width.  */
 995   2,    /* min_div_recip_mul_sf.  */
 996   2,    /* min_div_recip_mul_df.  */
 997   17,   /* max_case_values.  */
 998   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 999   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
1000   &xgene1_prefetch_tune
1001 };
1002
1003 static const struct tune_params qdf24xx_tunings =
1004 {
1005   &qdf24xx_extra_costs,
1006   &qdf24xx_addrcost_table,
1007   &qdf24xx_regmove_cost,
1008   &qdf24xx_vector_cost,
1009   &generic_branch_cost,
1010   &generic_approx_modes,
1011   SVE_NOT_IMPLEMENTED, /* sve_width  */
1012   4, /* memmov_cost  */
1013   4, /* issue_rate  */
1014   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1015    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1016   "16", /* function_align.  */
1017   "8",  /* jump_align.  */
1018   "16", /* loop_align.  */
1019   2,    /* int_reassoc_width.  */
1020   4,    /* fp_reassoc_width.  */
1021   1,    /* vec_reassoc_width.  */
1022   2,    /* min_div_recip_mul_sf.  */
1023   2,    /* min_div_recip_mul_df.  */
1024   0,    /* max_case_values.  */
1025   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1026   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1027   &qdf24xx_prefetch_tune
1028 };
1029
1030 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
1031    for now.  */
1032 static const struct tune_params saphira_tunings =
1033 {
1034   &generic_extra_costs,
1035   &generic_addrcost_table,
1036   &generic_regmove_cost,
1037   &generic_vector_cost,
1038   &generic_branch_cost,
1039   &generic_approx_modes,
1040   SVE_NOT_IMPLEMENTED, /* sve_width  */
1041   4, /* memmov_cost  */
1042   4, /* issue_rate  */
1043   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1045   "16", /* function_align.  */
1046   "8",  /* jump_align.  */
1047   "16", /* loop_align.  */
1048   2,    /* int_reassoc_width.  */
1049   4,    /* fp_reassoc_width.  */
1050   1,    /* vec_reassoc_width.  */
1051   2,    /* min_div_recip_mul_sf.  */
1052   2,    /* min_div_recip_mul_df.  */
1053   0,    /* max_case_values.  */
1054   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1055   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
1056   &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params thunderx2t99_tunings =
1060 {
1061   &thunderx2t99_extra_costs,
1062   &thunderx2t99_addrcost_table,
1063   &thunderx2t99_regmove_cost,
1064   &thunderx2t99_vector_cost,
1065   &generic_branch_cost,
1066   &generic_approx_modes,
1067   SVE_NOT_IMPLEMENTED, /* sve_width  */
1068   4, /* memmov_cost.  */
1069   4, /* issue_rate.  */
1070   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1071    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
1072   "16", /* function_align.  */
1073   "8",  /* jump_align.  */
1074   "16", /* loop_align.  */
1075   3,    /* int_reassoc_width.  */
1076   2,    /* fp_reassoc_width.  */
1077   2,    /* vec_reassoc_width.  */
1078   2,    /* min_div_recip_mul_sf.  */
1079   2,    /* min_div_recip_mul_df.  */
1080   0,    /* max_case_values.  */
1081   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1082   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1083   &thunderx2t99_prefetch_tune
1084 };
1085
1086 /* Support for fine-grained override of the tuning structures.  */
1087 struct aarch64_tuning_override_function
1088 {
1089   const char* name;
1090   void (*parse_override)(const char*, struct tune_params*);
1091 };
1092
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1096
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions[] =
1099 {
1100   { "fuse", aarch64_parse_fuse_string },
1101   { "tune", aarch64_parse_tune_string },
1102   { "sve_width", aarch64_parse_sve_width_string },
1103   { NULL, NULL }
1104 };
1105
1106 /* A processor implementing AArch64.  */
1107 struct processor
1108 {
1109   const char *const name;
1110   enum aarch64_processor ident;
1111   enum aarch64_processor sched_core;
1112   enum aarch64_arch arch;
1113   unsigned architecture_version;
1114   const unsigned long flags;
1115   const struct tune_params *const tune;
1116 };
1117
1118 /* Architectures implementing AArch64.  */
1119 static const struct processor all_architectures[] =
1120 {
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1125 };
1126
1127 /* Processor cores implementing AArch64.  */
1128 static const struct processor all_cores[] =
1129 {
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
1132   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
1133   FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1136     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1137   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1138 };
1139
1140
1141 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1142    handling code or by target attributes.  */
1143 static const struct processor *selected_arch;
1144 static const struct processor *selected_cpu;
1145 static const struct processor *selected_tune;
1146
1147 /* The current tuning set.  */
1148 struct tune_params aarch64_tune_params = generic_tunings;
1149
1150 /* Table of machine attributes.  */
1151 static const struct attribute_spec aarch64_attribute_table[] =
1152 {
1153   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154        affects_type_identity, handler, exclude } */
1155   { "aarch64_vector_pcs", 0, 0, false, true,  true,  false, NULL, NULL },
1156   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1157 };
1158
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1160
1161 /* An ISA extension in the co-processor and main instruction set space.  */
1162 struct aarch64_option_extension
1163 {
1164   const char *const name;
1165   const unsigned long flags_on;
1166   const unsigned long flags_off;
1167 };
1168
1169 typedef enum aarch64_cond_code
1170 {
1171   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1172   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1173   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1174 }
1175 aarch64_cc;
1176
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1178
1179 struct aarch64_branch_protect_type
1180 {
1181   /* The type's name that the user passes to the branch-protection option
1182     string.  */
1183   const char* name;
1184   /* Function to handle the protection type and set global variables.
1185     First argument is the string token corresponding with this type and the
1186     second argument is the next token in the option string.
1187     Return values:
1188     * AARCH64_PARSE_OK: Handling was sucessful.
1189     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1190       should print an error.
1191     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1192       own error.  */
1193   enum aarch64_parse_opt_result (*handler)(char*, char*);
1194   /* A list of types that can follow this type in the option string.  */
1195   const aarch64_branch_protect_type* subtypes;
1196   unsigned int num_subtypes;
1197 };
1198
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str, char* rest)
1201 {
1202   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1203   aarch64_enable_bti = 0;
1204   if (rest)
1205     {
1206       error ("unexpected %<%s%> after %<%s%>", rest, str);
1207       return AARCH64_PARSE_INVALID_FEATURE;
1208     }
1209   return AARCH64_PARSE_OK;
1210 }
1211
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str, char* rest)
1214 {
1215   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1216   aarch64_enable_bti = 1;
1217   if (rest)
1218     {
1219       error ("unexpected %<%s%> after %<%s%>", rest, str);
1220       return AARCH64_PARSE_INVALID_FEATURE;
1221     }
1222   return AARCH64_PARSE_OK;
1223 }
1224
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1227                                     char* rest ATTRIBUTE_UNUSED)
1228 {
1229   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1230   return AARCH64_PARSE_OK;
1231 }
1232
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1235                               char* rest ATTRIBUTE_UNUSED)
1236 {
1237   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1238   return AARCH64_PARSE_OK;
1239 }
1240
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1243                                     char* rest ATTRIBUTE_UNUSED)
1244 {
1245   aarch64_enable_bti = 1;
1246   return AARCH64_PARSE_OK;
1247 }
1248
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1250   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1251   { NULL, NULL, NULL, 0 }
1252 };
1253
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1255   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1256   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1257   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1258     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1259   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1260   { NULL, NULL, NULL, 0 }
1261 };
1262
1263 /* The condition codes of the processor, and the inverse function.  */
1264 static const char * const aarch64_condition_codes[] =
1265 {
1266   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1267   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1268 };
1269
1270 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1271 const char *
1272 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1273                         const char * branch_format)
1274 {
1275     rtx_code_label * tmp_label = gen_label_rtx ();
1276     char label_buf[256];
1277     char buffer[128];
1278     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1279                                  CODE_LABEL_NUMBER (tmp_label));
1280     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1281     rtx dest_label = operands[pos_label];
1282     operands[pos_label] = tmp_label;
1283
1284     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1285     output_asm_insn (buffer, operands);
1286
1287     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1288     operands[pos_label] = dest_label;
1289     output_asm_insn (buffer, operands);
1290     return "";
1291 }
1292
1293 void
1294 aarch64_err_no_fpadvsimd (machine_mode mode)
1295 {
1296   if (TARGET_GENERAL_REGS_ONLY)
1297     if (FLOAT_MODE_P (mode))
1298       error ("%qs is incompatible with the use of floating-point types",
1299              "-mgeneral-regs-only");
1300     else
1301       error ("%qs is incompatible with the use of vector types",
1302              "-mgeneral-regs-only");
1303   else
1304     if (FLOAT_MODE_P (mode))
1305       error ("%qs feature modifier is incompatible with the use of"
1306              " floating-point types", "+nofp");
1307     else
1308       error ("%qs feature modifier is incompatible with the use of"
1309              " vector types", "+nofp");
1310 }
1311
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316    and GENERAL_REGS is lower than the memory cost (in this case the best class
1317    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
1318    cost results in bad allocations with many redundant int<->FP moves which
1319    are expensive on various cores.
1320    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1322    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1323    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1324    The result of this is that it is no longer inefficient to have a higher
1325    memory move cost than the register move cost.
1326 */
1327
1328 static reg_class_t
1329 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1330                                          reg_class_t best_class)
1331 {
1332   machine_mode mode;
1333
1334   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1335       || !reg_class_subset_p (FP_REGS, allocno_class))
1336     return allocno_class;
1337
1338   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1339       || !reg_class_subset_p (FP_REGS, best_class))
1340     return best_class;
1341
1342   mode = PSEUDO_REGNO_MODE (regno);
1343   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1344 }
1345
1346 static unsigned int
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1348 {
1349   if (GET_MODE_UNIT_SIZE (mode) == 4)
1350     return aarch64_tune_params.min_div_recip_mul_sf;
1351   return aarch64_tune_params.min_div_recip_mul_df;
1352 }
1353
1354 /* Return the reassociation width of treeop OPC with mode MODE.  */
1355 static int
1356 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1357 {
1358   if (VECTOR_MODE_P (mode))
1359     return aarch64_tune_params.vec_reassoc_width;
1360   if (INTEGRAL_MODE_P (mode))
1361     return aarch64_tune_params.int_reassoc_width;
1362   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1363   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1364     return aarch64_tune_params.fp_reassoc_width;
1365   return 1;
1366 }
1367
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1369 unsigned
1370 aarch64_dbx_register_number (unsigned regno)
1371 {
1372    if (GP_REGNUM_P (regno))
1373      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1374    else if (regno == SP_REGNUM)
1375      return AARCH64_DWARF_SP;
1376    else if (FP_REGNUM_P (regno))
1377      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1378    else if (PR_REGNUM_P (regno))
1379      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1380    else if (regno == VG_REGNUM)
1381      return AARCH64_DWARF_VG;
1382
1383    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384       equivalent DWARF register.  */
1385    return DWARF_FRAME_REGISTERS;
1386 }
1387
1388 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1389 static bool
1390 aarch64_advsimd_struct_mode_p (machine_mode mode)
1391 {
1392   return (TARGET_SIMD
1393           && (mode == OImode || mode == CImode || mode == XImode));
1394 }
1395
1396 /* Return true if MODE is an SVE predicate mode.  */
1397 static bool
1398 aarch64_sve_pred_mode_p (machine_mode mode)
1399 {
1400   return (TARGET_SVE
1401           && (mode == VNx16BImode
1402               || mode == VNx8BImode
1403               || mode == VNx4BImode
1404               || mode == VNx2BImode));
1405 }
1406
1407 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1408 const unsigned int VEC_ADVSIMD  = 1;
1409 const unsigned int VEC_SVE_DATA = 2;
1410 const unsigned int VEC_SVE_PRED = 4;
1411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1412    a structure of 2, 3 or 4 vectors.  */
1413 const unsigned int VEC_STRUCT   = 8;
1414 /* Useful combinations of the above.  */
1415 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1416 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1417
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419    Ignore modes that are not supported by the current target.  */
1420 static unsigned int
1421 aarch64_classify_vector_mode (machine_mode mode)
1422 {
1423   if (aarch64_advsimd_struct_mode_p (mode))
1424     return VEC_ADVSIMD | VEC_STRUCT;
1425
1426   if (aarch64_sve_pred_mode_p (mode))
1427     return VEC_SVE_PRED;
1428
1429   scalar_mode inner = GET_MODE_INNER (mode);
1430   if (VECTOR_MODE_P (mode)
1431       && (inner == QImode
1432           || inner == HImode
1433           || inner == HFmode
1434           || inner == SImode
1435           || inner == SFmode
1436           || inner == DImode
1437           || inner == DFmode))
1438     {
1439       if (TARGET_SVE)
1440         {
1441           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1442             return VEC_SVE_DATA;
1443           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1444               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1445               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1446             return VEC_SVE_DATA | VEC_STRUCT;
1447         }
1448
1449       /* This includes V1DF but not V1DI (which doesn't exist).  */
1450       if (TARGET_SIMD
1451           && (known_eq (GET_MODE_BITSIZE (mode), 64)
1452               || known_eq (GET_MODE_BITSIZE (mode), 128)))
1453         return VEC_ADVSIMD;
1454     }
1455
1456   return 0;
1457 }
1458
1459 /* Return true if MODE is any of the data vector modes, including
1460    structure modes.  */
1461 static bool
1462 aarch64_vector_data_mode_p (machine_mode mode)
1463 {
1464   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1465 }
1466
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468    or a structure of vectors.  */
1469 static bool
1470 aarch64_sve_data_mode_p (machine_mode mode)
1471 {
1472   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1473 }
1474
1475 /* Implement target hook TARGET_ARRAY_MODE.  */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1478 {
1479   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1480       && IN_RANGE (nelems, 2, 4))
1481     return mode_for_vector (GET_MODE_INNER (mode),
1482                             GET_MODE_NUNITS (mode) * nelems);
1483
1484   return opt_machine_mode ();
1485 }
1486
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1488 static bool
1489 aarch64_array_mode_supported_p (machine_mode mode,
1490                                 unsigned HOST_WIDE_INT nelems)
1491 {
1492   if (TARGET_SIMD
1493       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1494           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1495       && (nelems >= 2 && nelems <= 4))
1496     return true;
1497
1498   return false;
1499 }
1500
1501 /* Return the SVE predicate mode to use for elements that have
1502    ELEM_NBYTES bytes, if such a mode exists.  */
1503
1504 opt_machine_mode
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1506 {
1507   if (TARGET_SVE)
1508     {
1509       if (elem_nbytes == 1)
1510         return VNx16BImode;
1511       if (elem_nbytes == 2)
1512         return VNx8BImode;
1513       if (elem_nbytes == 4)
1514         return VNx4BImode;
1515       if (elem_nbytes == 8)
1516         return VNx2BImode;
1517     }
1518   return opt_machine_mode ();
1519 }
1520
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1522
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1525 {
1526   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1527     {
1528       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1529       machine_mode pred_mode;
1530       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1531         return pred_mode;
1532     }
1533
1534   return default_get_mask_mode (nunits, nbytes);
1535 }
1536
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
1538    prefer to use the first arithmetic operand as the else value if
1539    the else value doesn't matter, since that exactly matches the SVE
1540    destructive merging form.  For ternary operations we could either
1541    pick the first operand and use FMAD-like instructions or the last
1542    operand and use FMLA-like instructions; the latter seems more
1543    natural.  */
1544
1545 static tree
1546 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1547 {
1548   return nops == 3 ? ops[2] : ops[0];
1549 }
1550
1551 /* Implement TARGET_HARD_REGNO_NREGS.  */
1552
1553 static unsigned int
1554 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1555 {
1556   /* ??? Logically we should only need to provide a value when
1557      HARD_REGNO_MODE_OK says that the combination is valid,
1558      but at the moment we need to handle all modes.  Just ignore
1559      any runtime parts for registers that can't store them.  */
1560   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1561   switch (aarch64_regno_regclass (regno))
1562     {
1563     case FP_REGS:
1564     case FP_LO_REGS:
1565       if (aarch64_sve_data_mode_p (mode))
1566         return exact_div (GET_MODE_SIZE (mode),
1567                           BYTES_PER_SVE_VECTOR).to_constant ();
1568       return CEIL (lowest_size, UNITS_PER_VREG);
1569     case PR_REGS:
1570     case PR_LO_REGS:
1571     case PR_HI_REGS:
1572       return 1;
1573     default:
1574       return CEIL (lowest_size, UNITS_PER_WORD);
1575     }
1576   gcc_unreachable ();
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1580
1581 static bool
1582 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1583 {
1584   if (GET_MODE_CLASS (mode) == MODE_CC)
1585     return regno == CC_REGNUM;
1586
1587   if (regno == VG_REGNUM)
1588     /* This must have the same size as _Unwind_Word.  */
1589     return mode == DImode;
1590
1591   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1592   if (vec_flags & VEC_SVE_PRED)
1593     return PR_REGNUM_P (regno);
1594
1595   if (PR_REGNUM_P (regno))
1596     return 0;
1597
1598   if (regno == SP_REGNUM)
1599     /* The purpose of comparing with ptr_mode is to support the
1600        global register variable associated with the stack pointer
1601        register via the syntax of asm ("wsp") in ILP32.  */
1602     return mode == Pmode || mode == ptr_mode;
1603
1604   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1605     return mode == Pmode;
1606
1607   if (GP_REGNUM_P (regno))
1608     {
1609       if (known_le (GET_MODE_SIZE (mode), 8))
1610         return true;
1611       else if (known_le (GET_MODE_SIZE (mode), 16))
1612         return (regno & 1) == 0;
1613     }
1614   else if (FP_REGNUM_P (regno))
1615     {
1616       if (vec_flags & VEC_STRUCT)
1617         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1618       else
1619         return !VECTOR_MODE_P (mode) || vec_flags != 0;
1620     }
1621
1622   return false;
1623 }
1624
1625 /* Return true if this is a definition of a vectorized simd function.  */
1626
1627 static bool
1628 aarch64_simd_decl_p (tree fndecl)
1629 {
1630   tree fntype;
1631
1632   if (fndecl == NULL)
1633     return false;
1634   fntype = TREE_TYPE (fndecl);
1635   if (fntype == NULL)
1636     return false;
1637
1638   /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
1639   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1640     return true;
1641
1642   return false;
1643 }
1644
1645 /* Return the mode a register save/restore should use.  DImode for integer
1646    registers, DFmode for FP registers in non-SIMD functions (they only save
1647    the bottom half of a 128 bit register), or TFmode for FP registers in
1648    SIMD functions.  */
1649
1650 static machine_mode
1651 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1652 {
1653   return GP_REGNUM_P (regno)
1654            ? E_DImode
1655            : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1656 }
1657
1658 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1659    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1660    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1661
1662 static bool
1663 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1664 {
1665   return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1666 }
1667
1668 /* Implement REGMODE_NATURAL_SIZE.  */
1669 poly_uint64
1670 aarch64_regmode_natural_size (machine_mode mode)
1671 {
1672   /* The natural size for SVE data modes is one SVE data vector,
1673      and similarly for predicates.  We can't independently modify
1674      anything smaller than that.  */
1675   /* ??? For now, only do this for variable-width SVE registers.
1676      Doing it for constant-sized registers breaks lower-subreg.c.  */
1677   /* ??? And once that's fixed, we should probably have similar
1678      code for Advanced SIMD.  */
1679   if (!aarch64_sve_vg.is_constant ())
1680     {
1681       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1682       if (vec_flags & VEC_SVE_PRED)
1683         return BYTES_PER_SVE_PRED;
1684       if (vec_flags & VEC_SVE_DATA)
1685         return BYTES_PER_SVE_VECTOR;
1686     }
1687   return UNITS_PER_WORD;
1688 }
1689
1690 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1691 machine_mode
1692 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1693                                      machine_mode mode)
1694 {
1695   /* The predicate mode determines which bits are significant and
1696      which are "don't care".  Decreasing the number of lanes would
1697      lose data while increasing the number of lanes would make bits
1698      unnecessarily significant.  */
1699   if (PR_REGNUM_P (regno))
1700     return mode;
1701   if (known_ge (GET_MODE_SIZE (mode), 4))
1702     return mode;
1703   else
1704     return SImode;
1705 }
1706
1707 /* Return true if I's bits are consecutive ones from the MSB.  */
1708 bool
1709 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1710 {
1711   return exact_log2 (-i) != HOST_WIDE_INT_M1;
1712 }
1713
1714 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1715    that strcpy from constants will be faster.  */
1716
1717 static HOST_WIDE_INT
1718 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1719 {
1720   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1721     return MAX (align, BITS_PER_WORD);
1722   return align;
1723 }
1724
1725 /* Return true if calls to DECL should be treated as
1726    long-calls (ie called via a register).  */
1727 static bool
1728 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1729 {
1730   return false;
1731 }
1732
1733 /* Return true if calls to symbol-ref SYM should be treated as
1734    long-calls (ie called via a register).  */
1735 bool
1736 aarch64_is_long_call_p (rtx sym)
1737 {
1738   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1739 }
1740
1741 /* Return true if calls to symbol-ref SYM should not go through
1742    plt stubs.  */
1743
1744 bool
1745 aarch64_is_noplt_call_p (rtx sym)
1746 {
1747   const_tree decl = SYMBOL_REF_DECL (sym);
1748
1749   if (flag_pic
1750       && decl
1751       && (!flag_plt
1752           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1753       && !targetm.binds_local_p (decl))
1754     return true;
1755
1756   return false;
1757 }
1758
1759 /* Return true if the offsets to a zero/sign-extract operation
1760    represent an expression that matches an extend operation.  The
1761    operands represent the paramters from
1762
1763    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1764 bool
1765 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1766                                 rtx extract_imm)
1767 {
1768   HOST_WIDE_INT mult_val, extract_val;
1769
1770   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1771     return false;
1772
1773   mult_val = INTVAL (mult_imm);
1774   extract_val = INTVAL (extract_imm);
1775
1776   if (extract_val > 8
1777       && extract_val < GET_MODE_BITSIZE (mode)
1778       && exact_log2 (extract_val & ~7) > 0
1779       && (extract_val & 7) <= 4
1780       && mult_val == (1 << (extract_val & 7)))
1781     return true;
1782
1783   return false;
1784 }
1785
1786 /* Emit an insn that's a simple single-set.  Both the operands must be
1787    known to be valid.  */
1788 inline static rtx_insn *
1789 emit_set_insn (rtx x, rtx y)
1790 {
1791   return emit_insn (gen_rtx_SET (x, y));
1792 }
1793
1794 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1795    return the rtx for register 0 in the proper mode.  */
1796 rtx
1797 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1798 {
1799   machine_mode mode = SELECT_CC_MODE (code, x, y);
1800   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1801
1802   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1803   return cc_reg;
1804 }
1805
1806 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
1807
1808 static rtx
1809 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1810                                   machine_mode y_mode)
1811 {
1812   if (y_mode == E_QImode || y_mode == E_HImode)
1813     {
1814       if (CONST_INT_P (y))
1815         y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1816       else
1817         {
1818           rtx t, cc_reg;
1819           machine_mode cc_mode;
1820
1821           t = gen_rtx_ZERO_EXTEND (SImode, y);
1822           t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1823           cc_mode = CC_SWPmode;
1824           cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1825           emit_set_insn (cc_reg, t);
1826           return cc_reg;
1827         }
1828     }
1829
1830   return aarch64_gen_compare_reg (code, x, y);
1831 }
1832
1833 /* Build the SYMBOL_REF for __tls_get_addr.  */
1834
1835 static GTY(()) rtx tls_get_addr_libfunc;
1836
1837 rtx
1838 aarch64_tls_get_addr (void)
1839 {
1840   if (!tls_get_addr_libfunc)
1841     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1842   return tls_get_addr_libfunc;
1843 }
1844
1845 /* Return the TLS model to use for ADDR.  */
1846
1847 static enum tls_model
1848 tls_symbolic_operand_type (rtx addr)
1849 {
1850   enum tls_model tls_kind = TLS_MODEL_NONE;
1851   if (GET_CODE (addr) == CONST)
1852     {
1853       poly_int64 addend;
1854       rtx sym = strip_offset (addr, &addend);
1855       if (GET_CODE (sym) == SYMBOL_REF)
1856         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1857     }
1858   else if (GET_CODE (addr) == SYMBOL_REF)
1859     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1860
1861   return tls_kind;
1862 }
1863
1864 /* We'll allow lo_sum's in addresses in our legitimate addresses
1865    so that combine would take care of combining addresses where
1866    necessary, but for generation purposes, we'll generate the address
1867    as :
1868    RTL                               Absolute
1869    tmp = hi (symbol_ref);            adrp  x1, foo
1870    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1871                                      nop
1872
1873    PIC                               TLS
1874    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1875    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1876                                      bl   __tls_get_addr
1877                                      nop
1878
1879    Load TLS symbol, depending on TLS mechanism and TLS access model.
1880
1881    Global Dynamic - Traditional TLS:
1882    adrp tmp, :tlsgd:imm
1883    add  dest, tmp, #:tlsgd_lo12:imm
1884    bl   __tls_get_addr
1885
1886    Global Dynamic - TLS Descriptors:
1887    adrp dest, :tlsdesc:imm
1888    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1889    add  dest, dest, #:tlsdesc_lo12:imm
1890    blr  tmp
1891    mrs  tp, tpidr_el0
1892    add  dest, dest, tp
1893
1894    Initial Exec:
1895    mrs  tp, tpidr_el0
1896    adrp tmp, :gottprel:imm
1897    ldr  dest, [tmp, #:gottprel_lo12:imm]
1898    add  dest, dest, tp
1899
1900    Local Exec:
1901    mrs  tp, tpidr_el0
1902    add  t0, tp, #:tprel_hi12:imm, lsl #12
1903    add  t0, t0, #:tprel_lo12_nc:imm
1904 */
1905
1906 static void
1907 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1908                                    enum aarch64_symbol_type type)
1909 {
1910   switch (type)
1911     {
1912     case SYMBOL_SMALL_ABSOLUTE:
1913       {
1914         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1915         rtx tmp_reg = dest;
1916         machine_mode mode = GET_MODE (dest);
1917
1918         gcc_assert (mode == Pmode || mode == ptr_mode);
1919
1920         if (can_create_pseudo_p ())
1921           tmp_reg = gen_reg_rtx (mode);
1922
1923         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1924         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1925         return;
1926       }
1927
1928     case SYMBOL_TINY_ABSOLUTE:
1929       emit_insn (gen_rtx_SET (dest, imm));
1930       return;
1931
1932     case SYMBOL_SMALL_GOT_28K:
1933       {
1934         machine_mode mode = GET_MODE (dest);
1935         rtx gp_rtx = pic_offset_table_rtx;
1936         rtx insn;
1937         rtx mem;
1938
1939         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1940            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1941            decide rtx costs, in which case pic_offset_table_rtx is not
1942            initialized.  For that case no need to generate the first adrp
1943            instruction as the final cost for global variable access is
1944            one instruction.  */
1945         if (gp_rtx != NULL)
1946           {
1947             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1948                using the page base as GOT base, the first page may be wasted,
1949                in the worst scenario, there is only 28K space for GOT).
1950
1951                The generate instruction sequence for accessing global variable
1952                is:
1953
1954                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1955
1956                Only one instruction needed. But we must initialize
1957                pic_offset_table_rtx properly.  We generate initialize insn for
1958                every global access, and allow CSE to remove all redundant.
1959
1960                The final instruction sequences will look like the following
1961                for multiply global variables access.
1962
1963                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1964
1965                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1966                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1967                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1968                  ...  */
1969
1970             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1971             crtl->uses_pic_offset_table = 1;
1972             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1973
1974             if (mode != GET_MODE (gp_rtx))
1975              gp_rtx = gen_lowpart (mode, gp_rtx);
1976
1977           }
1978
1979         if (mode == ptr_mode)
1980           {
1981             if (mode == DImode)
1982               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1983             else
1984               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1985
1986             mem = XVECEXP (SET_SRC (insn), 0, 0);
1987           }
1988         else
1989           {
1990             gcc_assert (mode == Pmode);
1991
1992             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1993             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1994           }
1995
1996         /* The operand is expected to be MEM.  Whenever the related insn
1997            pattern changed, above code which calculate mem should be
1998            updated.  */
1999         gcc_assert (GET_CODE (mem) == MEM);
2000         MEM_READONLY_P (mem) = 1;
2001         MEM_NOTRAP_P (mem) = 1;
2002         emit_insn (insn);
2003         return;
2004       }
2005
2006     case SYMBOL_SMALL_GOT_4G:
2007       {
2008         /* In ILP32, the mode of dest can be either SImode or DImode,
2009            while the got entry is always of SImode size.  The mode of
2010            dest depends on how dest is used: if dest is assigned to a
2011            pointer (e.g. in the memory), it has SImode; it may have
2012            DImode if dest is dereferenced to access the memeory.
2013            This is why we have to handle three different ldr_got_small
2014            patterns here (two patterns for ILP32).  */
2015
2016         rtx insn;
2017         rtx mem;
2018         rtx tmp_reg = dest;
2019         machine_mode mode = GET_MODE (dest);
2020
2021         if (can_create_pseudo_p ())
2022           tmp_reg = gen_reg_rtx (mode);
2023
2024         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2025         if (mode == ptr_mode)
2026           {
2027             if (mode == DImode)
2028               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2029             else
2030               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2031
2032             mem = XVECEXP (SET_SRC (insn), 0, 0);
2033           }
2034         else
2035           {
2036             gcc_assert (mode == Pmode);
2037
2038             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2039             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2040           }
2041
2042         gcc_assert (GET_CODE (mem) == MEM);
2043         MEM_READONLY_P (mem) = 1;
2044         MEM_NOTRAP_P (mem) = 1;
2045         emit_insn (insn);
2046         return;
2047       }
2048
2049     case SYMBOL_SMALL_TLSGD:
2050       {
2051         rtx_insn *insns;
2052         machine_mode mode = GET_MODE (dest);
2053         rtx result = gen_rtx_REG (mode, R0_REGNUM);
2054
2055         start_sequence ();
2056         if (TARGET_ILP32)
2057           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2058         else
2059           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2060         insns = get_insns ();
2061         end_sequence ();
2062
2063         RTL_CONST_CALL_P (insns) = 1;
2064         emit_libcall_block (insns, dest, result, imm);
2065         return;
2066       }
2067
2068     case SYMBOL_SMALL_TLSDESC:
2069       {
2070         machine_mode mode = GET_MODE (dest);
2071         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2072         rtx tp;
2073
2074         gcc_assert (mode == Pmode || mode == ptr_mode);
2075
2076         /* In ILP32, the got entry is always of SImode size.  Unlike
2077            small GOT, the dest is fixed at reg 0.  */
2078         if (TARGET_ILP32)
2079           emit_insn (gen_tlsdesc_small_si (imm));
2080         else
2081           emit_insn (gen_tlsdesc_small_di (imm));
2082         tp = aarch64_load_tp (NULL);
2083
2084         if (mode != Pmode)
2085           tp = gen_lowpart (mode, tp);
2086
2087         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2088         if (REG_P (dest))
2089           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2090         return;
2091       }
2092
2093     case SYMBOL_SMALL_TLSIE:
2094       {
2095         /* In ILP32, the mode of dest can be either SImode or DImode,
2096            while the got entry is always of SImode size.  The mode of
2097            dest depends on how dest is used: if dest is assigned to a
2098            pointer (e.g. in the memory), it has SImode; it may have
2099            DImode if dest is dereferenced to access the memeory.
2100            This is why we have to handle three different tlsie_small
2101            patterns here (two patterns for ILP32).  */
2102         machine_mode mode = GET_MODE (dest);
2103         rtx tmp_reg = gen_reg_rtx (mode);
2104         rtx tp = aarch64_load_tp (NULL);
2105
2106         if (mode == ptr_mode)
2107           {
2108             if (mode == DImode)
2109               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2110             else
2111               {
2112                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2113                 tp = gen_lowpart (mode, tp);
2114               }
2115           }
2116         else
2117           {
2118             gcc_assert (mode == Pmode);
2119             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2120           }
2121
2122         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2123         if (REG_P (dest))
2124           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2125         return;
2126       }
2127
2128     case SYMBOL_TLSLE12:
2129     case SYMBOL_TLSLE24:
2130     case SYMBOL_TLSLE32:
2131     case SYMBOL_TLSLE48:
2132       {
2133         machine_mode mode = GET_MODE (dest);
2134         rtx tp = aarch64_load_tp (NULL);
2135
2136         if (mode != Pmode)
2137           tp = gen_lowpart (mode, tp);
2138
2139         switch (type)
2140           {
2141           case SYMBOL_TLSLE12:
2142             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2143                         (dest, tp, imm));
2144             break;
2145           case SYMBOL_TLSLE24:
2146             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2147                         (dest, tp, imm));
2148           break;
2149           case SYMBOL_TLSLE32:
2150             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2151                         (dest, imm));
2152             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2153                         (dest, dest, tp));
2154           break;
2155           case SYMBOL_TLSLE48:
2156             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2157                         (dest, imm));
2158             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2159                         (dest, dest, tp));
2160             break;
2161           default:
2162             gcc_unreachable ();
2163           }
2164
2165         if (REG_P (dest))
2166           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2167         return;
2168       }
2169
2170     case SYMBOL_TINY_GOT:
2171       emit_insn (gen_ldr_got_tiny (dest, imm));
2172       return;
2173
2174     case SYMBOL_TINY_TLSIE:
2175       {
2176         machine_mode mode = GET_MODE (dest);
2177         rtx tp = aarch64_load_tp (NULL);
2178
2179         if (mode == ptr_mode)
2180           {
2181             if (mode == DImode)
2182               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2183             else
2184               {
2185                 tp = gen_lowpart (mode, tp);
2186                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2187               }
2188           }
2189         else
2190           {
2191             gcc_assert (mode == Pmode);
2192             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2193           }
2194
2195         if (REG_P (dest))
2196           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2197         return;
2198       }
2199
2200     default:
2201       gcc_unreachable ();
2202     }
2203 }
2204
2205 /* Emit a move from SRC to DEST.  Assume that the move expanders can
2206    handle all moves if !can_create_pseudo_p ().  The distinction is
2207    important because, unlike emit_move_insn, the move expanders know
2208    how to force Pmode objects into the constant pool even when the
2209    constant pool address is not itself legitimate.  */
2210 static rtx
2211 aarch64_emit_move (rtx dest, rtx src)
2212 {
2213   return (can_create_pseudo_p ()
2214           ? emit_move_insn (dest, src)
2215           : emit_move_insn_1 (dest, src));
2216 }
2217
2218 /* Apply UNOPTAB to OP and store the result in DEST.  */
2219
2220 static void
2221 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2222 {
2223   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2224   if (dest != tmp)
2225     emit_move_insn (dest, tmp);
2226 }
2227
2228 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
2229
2230 static void
2231 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2232 {
2233   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2234                           OPTAB_DIRECT);
2235   if (dest != tmp)
2236     emit_move_insn (dest, tmp);
2237 }
2238
2239 /* Split a 128-bit move operation into two 64-bit move operations,
2240    taking care to handle partial overlap of register to register
2241    copies.  Special cases are needed when moving between GP regs and
2242    FP regs.  SRC can be a register, constant or memory; DST a register
2243    or memory.  If either operand is memory it must not have any side
2244    effects.  */
2245 void
2246 aarch64_split_128bit_move (rtx dst, rtx src)
2247 {
2248   rtx dst_lo, dst_hi;
2249   rtx src_lo, src_hi;
2250
2251   machine_mode mode = GET_MODE (dst);
2252
2253   gcc_assert (mode == TImode || mode == TFmode);
2254   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2255   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2256
2257   if (REG_P (dst) && REG_P (src))
2258     {
2259       int src_regno = REGNO (src);
2260       int dst_regno = REGNO (dst);
2261
2262       /* Handle FP <-> GP regs.  */
2263       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2264         {
2265           src_lo = gen_lowpart (word_mode, src);
2266           src_hi = gen_highpart (word_mode, src);
2267
2268           emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2269           emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2270           return;
2271         }
2272       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2273         {
2274           dst_lo = gen_lowpart (word_mode, dst);
2275           dst_hi = gen_highpart (word_mode, dst);
2276
2277           emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2278           emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2279           return;
2280         }
2281     }
2282
2283   dst_lo = gen_lowpart (word_mode, dst);
2284   dst_hi = gen_highpart (word_mode, dst);
2285   src_lo = gen_lowpart (word_mode, src);
2286   src_hi = gen_highpart_mode (word_mode, mode, src);
2287
2288   /* At most one pairing may overlap.  */
2289   if (reg_overlap_mentioned_p (dst_lo, src_hi))
2290     {
2291       aarch64_emit_move (dst_hi, src_hi);
2292       aarch64_emit_move (dst_lo, src_lo);
2293     }
2294   else
2295     {
2296       aarch64_emit_move (dst_lo, src_lo);
2297       aarch64_emit_move (dst_hi, src_hi);
2298     }
2299 }
2300
2301 bool
2302 aarch64_split_128bit_move_p (rtx dst, rtx src)
2303 {
2304   return (! REG_P (src)
2305           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2306 }
2307
2308 /* Split a complex SIMD combine.  */
2309
2310 void
2311 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2312 {
2313   machine_mode src_mode = GET_MODE (src1);
2314   machine_mode dst_mode = GET_MODE (dst);
2315
2316   gcc_assert (VECTOR_MODE_P (dst_mode));
2317   gcc_assert (register_operand (dst, dst_mode)
2318               && register_operand (src1, src_mode)
2319               && register_operand (src2, src_mode));
2320
2321   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2322   return;
2323 }
2324
2325 /* Split a complex SIMD move.  */
2326
2327 void
2328 aarch64_split_simd_move (rtx dst, rtx src)
2329 {
2330   machine_mode src_mode = GET_MODE (src);
2331   machine_mode dst_mode = GET_MODE (dst);
2332
2333   gcc_assert (VECTOR_MODE_P (dst_mode));
2334
2335   if (REG_P (dst) && REG_P (src))
2336     {
2337       gcc_assert (VECTOR_MODE_P (src_mode));
2338       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2339     }
2340 }
2341
2342 bool
2343 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2344                               machine_mode ymode, rtx y)
2345 {
2346   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2347   gcc_assert (r != NULL);
2348   return rtx_equal_p (x, r);
2349 }
2350
2351
2352 static rtx
2353 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2354 {
2355   if (can_create_pseudo_p ())
2356     return force_reg (mode, value);
2357   else
2358     {
2359       gcc_assert (x);
2360       aarch64_emit_move (x, value);
2361       return x;
2362     }
2363 }
2364
2365 /* Return true if we can move VALUE into a register using a single
2366    CNT[BHWD] instruction.  */
2367
2368 static bool
2369 aarch64_sve_cnt_immediate_p (poly_int64 value)
2370 {
2371   HOST_WIDE_INT factor = value.coeffs[0];
2372   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2373   return (value.coeffs[1] == factor
2374           && IN_RANGE (factor, 2, 16 * 16)
2375           && (factor & 1) == 0
2376           && factor <= 16 * (factor & -factor));
2377 }
2378
2379 /* Likewise for rtx X.  */
2380
2381 bool
2382 aarch64_sve_cnt_immediate_p (rtx x)
2383 {
2384   poly_int64 value;
2385   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2386 }
2387
2388 /* Return the asm string for an instruction with a CNT-like vector size
2389    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2390    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2391    first part of the operands template (the part that comes before the
2392    vector size itself).  FACTOR is the number of quadwords.
2393    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2394    If it is zero, we can use any element size.  */
2395
2396 static char *
2397 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2398                                   unsigned int factor,
2399                                   unsigned int nelts_per_vq)
2400 {
2401   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2402
2403   if (nelts_per_vq == 0)
2404     /* There is some overlap in the ranges of the four CNT instructions.
2405        Here we always use the smallest possible element size, so that the
2406        multiplier is 1 whereever possible.  */
2407     nelts_per_vq = factor & -factor;
2408   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2409   gcc_assert (IN_RANGE (shift, 1, 4));
2410   char suffix = "dwhb"[shift - 1];
2411
2412   factor >>= shift;
2413   unsigned int written;
2414   if (factor == 1)
2415     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2416                         prefix, suffix, operands);
2417   else
2418     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2419                         prefix, suffix, operands, factor);
2420   gcc_assert (written < sizeof (buffer));
2421   return buffer;
2422 }
2423
2424 /* Return the asm string for an instruction with a CNT-like vector size
2425    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2426    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2427    first part of the operands template (the part that comes before the
2428    vector size itself).  X is the value of the vector size operand,
2429    as a polynomial integer rtx.  */
2430
2431 char *
2432 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2433                                   rtx x)
2434 {
2435   poly_int64 value = rtx_to_poly_int64 (x);
2436   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2437   return aarch64_output_sve_cnt_immediate (prefix, operands,
2438                                            value.coeffs[1], 0);
2439 }
2440
2441 /* Return true if we can add VALUE to a register using a single ADDVL
2442    or ADDPL instruction.  */
2443
2444 static bool
2445 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2446 {
2447   HOST_WIDE_INT factor = value.coeffs[0];
2448   if (factor == 0 || value.coeffs[1] != factor)
2449     return false;
2450   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2451      and a value of 16 is one vector width.  */
2452   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2453           || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2454 }
2455
2456 /* Likewise for rtx X.  */
2457
2458 bool
2459 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2460 {
2461   poly_int64 value;
2462   return (poly_int_rtx_p (x, &value)
2463           && aarch64_sve_addvl_addpl_immediate_p (value));
2464 }
2465
2466 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2467    and storing the result in operand 0.  */
2468
2469 char *
2470 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2471 {
2472   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2473   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2474   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2475
2476   /* Use INC or DEC if possible.  */
2477   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2478     {
2479       if (aarch64_sve_cnt_immediate_p (offset_value))
2480         return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2481                                                  offset_value.coeffs[1], 0);
2482       if (aarch64_sve_cnt_immediate_p (-offset_value))
2483         return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2484                                                  -offset_value.coeffs[1], 0);
2485     }
2486
2487   int factor = offset_value.coeffs[1];
2488   if ((factor & 15) == 0)
2489     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2490   else
2491     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2492   return buffer;
2493 }
2494
2495 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2496    instruction.  If it is, store the number of elements in each vector
2497    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2498    factor in *FACTOR_OUT (if nonnull).  */
2499
2500 bool
2501 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2502                                  unsigned int *nelts_per_vq_out)
2503 {
2504   rtx elt;
2505   poly_int64 value;
2506
2507   if (!const_vec_duplicate_p (x, &elt)
2508       || !poly_int_rtx_p (elt, &value))
2509     return false;
2510
2511   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2512   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2513     /* There's no vector INCB.  */
2514     return false;
2515
2516   HOST_WIDE_INT factor = value.coeffs[0];
2517   if (value.coeffs[1] != factor)
2518     return false;
2519
2520   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2521   if ((factor % nelts_per_vq) != 0
2522       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2523     return false;
2524
2525   if (factor_out)
2526     *factor_out = factor;
2527   if (nelts_per_vq_out)
2528     *nelts_per_vq_out = nelts_per_vq;
2529   return true;
2530 }
2531
2532 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2533    instruction.  */
2534
2535 bool
2536 aarch64_sve_inc_dec_immediate_p (rtx x)
2537 {
2538   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2539 }
2540
2541 /* Return the asm template for an SVE vector INC or DEC instruction.
2542    OPERANDS gives the operands before the vector count and X is the
2543    value of the vector count operand itself.  */
2544
2545 char *
2546 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2547 {
2548   int factor;
2549   unsigned int nelts_per_vq;
2550   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2551     gcc_unreachable ();
2552   if (factor < 0)
2553     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2554                                              nelts_per_vq);
2555   else
2556     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2557                                              nelts_per_vq);
2558 }
2559
2560 static int
2561 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2562                                 scalar_int_mode mode)
2563 {
2564   int i;
2565   unsigned HOST_WIDE_INT val, val2, mask;
2566   int one_match, zero_match;
2567   int num_insns;
2568
2569   val = INTVAL (imm);
2570
2571   if (aarch64_move_imm (val, mode))
2572     {
2573       if (generate)
2574         emit_insn (gen_rtx_SET (dest, imm));
2575       return 1;
2576     }
2577
2578   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2579      (with XXXX non-zero). In that case check to see if the move can be done in
2580      a smaller mode.  */
2581   val2 = val & 0xffffffff;
2582   if (mode == DImode
2583       && aarch64_move_imm (val2, SImode)
2584       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2585     {
2586       if (generate)
2587         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2588
2589       /* Check if we have to emit a second instruction by checking to see
2590          if any of the upper 32 bits of the original DI mode value is set.  */
2591       if (val == val2)
2592         return 1;
2593
2594       i = (val >> 48) ? 48 : 32;
2595
2596       if (generate)
2597          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2598                                     GEN_INT ((val >> i) & 0xffff)));
2599
2600       return 2;
2601     }
2602
2603   if ((val >> 32) == 0 || mode == SImode)
2604     {
2605       if (generate)
2606         {
2607           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2608           if (mode == SImode)
2609             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2610                                        GEN_INT ((val >> 16) & 0xffff)));
2611           else
2612             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2613                                        GEN_INT ((val >> 16) & 0xffff)));
2614         }
2615       return 2;
2616     }
2617
2618   /* Remaining cases are all for DImode.  */
2619
2620   mask = 0xffff;
2621   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2622     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2623   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2624     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2625
2626   if (zero_match != 2 && one_match != 2)
2627     {
2628       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2629          For a 64-bit bitmask try whether changing 16 bits to all ones or
2630          zeroes creates a valid bitmask.  To check any repeated bitmask,
2631          try using 16 bits from the other 32-bit half of val.  */
2632
2633       for (i = 0; i < 64; i += 16, mask <<= 16)
2634         {
2635           val2 = val & ~mask;
2636           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2637             break;
2638           val2 = val | mask;
2639           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2640             break;
2641           val2 = val2 & ~mask;
2642           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2643           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2644             break;
2645         }
2646       if (i != 64)
2647         {
2648           if (generate)
2649             {
2650               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2651               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2652                                          GEN_INT ((val >> i) & 0xffff)));
2653             }
2654           return 2;
2655         }
2656     }
2657
2658   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2659      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2660      otherwise skip zero bits.  */
2661
2662   num_insns = 1;
2663   mask = 0xffff;
2664   val2 = one_match > zero_match ? ~val : val;
2665   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2666
2667   if (generate)
2668     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2669                                            ? (val | ~(mask << i))
2670                                            : (val & (mask << i)))));
2671   for (i += 16; i < 64; i += 16)
2672     {
2673       if ((val2 & (mask << i)) == 0)
2674         continue;
2675       if (generate)
2676         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2677                                    GEN_INT ((val >> i) & 0xffff)));
2678       num_insns ++;
2679     }
2680
2681   return num_insns;
2682 }
2683
2684 /* Return whether imm is a 128-bit immediate which is simple enough to
2685    expand inline.  */
2686 bool
2687 aarch64_mov128_immediate (rtx imm)
2688 {
2689   if (GET_CODE (imm) == CONST_INT)
2690     return true;
2691
2692   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2693
2694   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2695   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2696
2697   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2698          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2699 }
2700
2701
2702 /* Return the number of temporary registers that aarch64_add_offset_1
2703    would need to add OFFSET to a register.  */
2704
2705 static unsigned int
2706 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2707 {
2708   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2709 }
2710
2711 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2712    a non-polynomial OFFSET.  MODE is the mode of the addition.
2713    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2714    be set and CFA adjustments added to the generated instructions.
2715
2716    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2717    temporary if register allocation is already complete.  This temporary
2718    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2719    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2720    the immediate again.
2721
2722    Since this function may be used to adjust the stack pointer, we must
2723    ensure that it cannot cause transient stack deallocation (for example
2724    by first incrementing SP and then decrementing when adjusting by a
2725    large immediate).  */
2726
2727 static void
2728 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2729                       rtx src, HOST_WIDE_INT offset, rtx temp1,
2730                       bool frame_related_p, bool emit_move_imm)
2731 {
2732   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2733   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2734
2735   HOST_WIDE_INT moffset = abs_hwi (offset);
2736   rtx_insn *insn;
2737
2738   if (!moffset)
2739     {
2740       if (!rtx_equal_p (dest, src))
2741         {
2742           insn = emit_insn (gen_rtx_SET (dest, src));
2743           RTX_FRAME_RELATED_P (insn) = frame_related_p;
2744         }
2745       return;
2746     }
2747
2748   /* Single instruction adjustment.  */
2749   if (aarch64_uimm12_shift (moffset))
2750     {
2751       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2752       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2753       return;
2754     }
2755
2756   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2757      and either:
2758
2759      a) the offset cannot be loaded by a 16-bit move or
2760      b) there is no spare register into which we can move it.  */
2761   if (moffset < 0x1000000
2762       && ((!temp1 && !can_create_pseudo_p ())
2763           || !aarch64_move_imm (moffset, mode)))
2764     {
2765       HOST_WIDE_INT low_off = moffset & 0xfff;
2766
2767       low_off = offset < 0 ? -low_off : low_off;
2768       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2769       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2770       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2771       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2772       return;
2773     }
2774
2775   /* Emit a move immediate if required and an addition/subtraction.  */
2776   if (emit_move_imm)
2777     {
2778       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2779       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2780     }
2781   insn = emit_insn (offset < 0
2782                     ? gen_sub3_insn (dest, src, temp1)
2783                     : gen_add3_insn (dest, src, temp1));
2784   if (frame_related_p)
2785     {
2786       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2787       rtx adj = plus_constant (mode, src, offset);
2788       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2789     }
2790 }
2791
2792 /* Return the number of temporary registers that aarch64_add_offset
2793    would need to move OFFSET into a register or add OFFSET to a register;
2794    ADD_P is true if we want the latter rather than the former.  */
2795
2796 static unsigned int
2797 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2798 {
2799   /* This follows the same structure as aarch64_add_offset.  */
2800   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2801     return 0;
2802
2803   unsigned int count = 0;
2804   HOST_WIDE_INT factor = offset.coeffs[1];
2805   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2806   poly_int64 poly_offset (factor, factor);
2807   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2808     /* Need one register for the ADDVL/ADDPL result.  */
2809     count += 1;
2810   else if (factor != 0)
2811     {
2812       factor = abs (factor);
2813       if (factor > 16 * (factor & -factor))
2814         /* Need one register for the CNT result and one for the multiplication
2815            factor.  If necessary, the second temporary can be reused for the
2816            constant part of the offset.  */
2817         return 2;
2818       /* Need one register for the CNT result (which might then
2819          be shifted).  */
2820       count += 1;
2821     }
2822   return count + aarch64_add_offset_1_temporaries (constant);
2823 }
2824
2825 /* If X can be represented as a poly_int64, return the number
2826    of temporaries that are required to add it to a register.
2827    Return -1 otherwise.  */
2828
2829 int
2830 aarch64_add_offset_temporaries (rtx x)
2831 {
2832   poly_int64 offset;
2833   if (!poly_int_rtx_p (x, &offset))
2834     return -1;
2835   return aarch64_offset_temporaries (true, offset);
2836 }
2837
2838 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2839    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2840    be set and CFA adjustments added to the generated instructions.
2841
2842    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2843    temporary if register allocation is already complete.  This temporary
2844    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2845    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2846    false to avoid emitting the immediate again.
2847
2848    TEMP2, if nonnull, is a second temporary register that doesn't
2849    overlap either DEST or REG.
2850
2851    Since this function may be used to adjust the stack pointer, we must
2852    ensure that it cannot cause transient stack deallocation (for example
2853    by first incrementing SP and then decrementing when adjusting by a
2854    large immediate).  */
2855
2856 static void
2857 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2858                     poly_int64 offset, rtx temp1, rtx temp2,
2859                     bool frame_related_p, bool emit_move_imm = true)
2860 {
2861   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2862   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2863   gcc_assert (temp1 == NULL_RTX
2864               || !frame_related_p
2865               || !reg_overlap_mentioned_p (temp1, dest));
2866   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2867
2868   /* Try using ADDVL or ADDPL to add the whole value.  */
2869   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2870     {
2871       rtx offset_rtx = gen_int_mode (offset, mode);
2872       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2873       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2874       return;
2875     }
2876
2877   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2878      SVE vector register, over and above the minimum size of 128 bits.
2879      This is equivalent to half the value returned by CNTD with a
2880      vector shape of ALL.  */
2881   HOST_WIDE_INT factor = offset.coeffs[1];
2882   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2883
2884   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2885   poly_int64 poly_offset (factor, factor);
2886   if (src != const0_rtx
2887       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2888     {
2889       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2890       if (frame_related_p)
2891         {
2892           rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2893           RTX_FRAME_RELATED_P (insn) = true;
2894           src = dest;
2895         }
2896       else
2897         {
2898           rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2899           src = aarch64_force_temporary (mode, temp1, addr);
2900           temp1 = temp2;
2901           temp2 = NULL_RTX;
2902         }
2903     }
2904   /* Otherwise use a CNT-based sequence.  */
2905   else if (factor != 0)
2906     {
2907       /* Use a subtraction if we have a negative factor.  */
2908       rtx_code code = PLUS;
2909       if (factor < 0)
2910         {
2911           factor = -factor;
2912           code = MINUS;
2913         }
2914
2915       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
2916          into the multiplication.  */
2917       rtx val;
2918       int shift = 0;
2919       if (factor & 1)
2920         /* Use a right shift by 1.  */
2921         shift = -1;
2922       else
2923         factor /= 2;
2924       HOST_WIDE_INT low_bit = factor & -factor;
2925       if (factor <= 16 * low_bit)
2926         {
2927           if (factor > 16 * 8)
2928             {
2929               /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2930                  the value with the minimum multiplier and shift it into
2931                  position.  */
2932               int extra_shift = exact_log2 (low_bit);
2933               shift += extra_shift;
2934               factor >>= extra_shift;
2935             }
2936           val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2937         }
2938       else
2939         {
2940           /* Use CNTD, then multiply it by FACTOR.  */
2941           val = gen_int_mode (poly_int64 (2, 2), mode);
2942           val = aarch64_force_temporary (mode, temp1, val);
2943
2944           /* Go back to using a negative multiplication factor if we have
2945              no register from which to subtract.  */
2946           if (code == MINUS && src == const0_rtx)
2947             {
2948               factor = -factor;
2949               code = PLUS;
2950             }
2951           rtx coeff1 = gen_int_mode (factor, mode);
2952           coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2953           val = gen_rtx_MULT (mode, val, coeff1);
2954         }
2955
2956       if (shift > 0)
2957         {
2958           /* Multiply by 1 << SHIFT.  */
2959           val = aarch64_force_temporary (mode, temp1, val);
2960           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2961         }
2962       else if (shift == -1)
2963         {
2964           /* Divide by 2.  */
2965           val = aarch64_force_temporary (mode, temp1, val);
2966           val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2967         }
2968
2969       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
2970       if (src != const0_rtx)
2971         {
2972           val = aarch64_force_temporary (mode, temp1, val);
2973           val = gen_rtx_fmt_ee (code, mode, src, val);
2974         }
2975       else if (code == MINUS)
2976         {
2977           val = aarch64_force_temporary (mode, temp1, val);
2978           val = gen_rtx_NEG (mode, val);
2979         }
2980
2981       if (constant == 0 || frame_related_p)
2982         {
2983           rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2984           if (frame_related_p)
2985             {
2986               RTX_FRAME_RELATED_P (insn) = true;
2987               add_reg_note (insn, REG_CFA_ADJUST_CFA,
2988                             gen_rtx_SET (dest, plus_constant (Pmode, src,
2989                                                               poly_offset)));
2990             }
2991           src = dest;
2992           if (constant == 0)
2993             return;
2994         }
2995       else
2996         {
2997           src = aarch64_force_temporary (mode, temp1, val);
2998           temp1 = temp2;
2999           temp2 = NULL_RTX;
3000         }
3001
3002       emit_move_imm = true;
3003     }
3004
3005   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3006                         frame_related_p, emit_move_imm);
3007 }
3008
3009 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3010    than a poly_int64.  */
3011
3012 void
3013 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3014                           rtx offset_rtx, rtx temp1, rtx temp2)
3015 {
3016   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3017                       temp1, temp2, false);
3018 }
3019
3020 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3021    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
3022    if TEMP1 already contains abs (DELTA).  */
3023
3024 static inline void
3025 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3026 {
3027   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3028                       temp1, temp2, true, emit_move_imm);
3029 }
3030
3031 /* Subtract DELTA from the stack pointer, marking the instructions
3032    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
3033    if nonnull.  */
3034
3035 static inline void
3036 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3037                 bool emit_move_imm = true)
3038 {
3039   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3040                       temp1, temp2, frame_related_p, emit_move_imm);
3041 }
3042
3043 /* Set DEST to (vec_series BASE STEP).  */
3044
3045 static void
3046 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3047 {
3048   machine_mode mode = GET_MODE (dest);
3049   scalar_mode inner = GET_MODE_INNER (mode);
3050
3051   /* Each operand can be a register or an immediate in the range [-16, 15].  */
3052   if (!aarch64_sve_index_immediate_p (base))
3053     base = force_reg (inner, base);
3054   if (!aarch64_sve_index_immediate_p (step))
3055     step = force_reg (inner, step);
3056
3057   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3058 }
3059
3060 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3061    integer of mode INT_MODE.  Return true on success.  */
3062
3063 static bool
3064 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3065                                       rtx src)
3066 {
3067   /* If the constant is smaller than 128 bits, we can do the move
3068      using a vector of SRC_MODEs.  */
3069   if (src_mode != TImode)
3070     {
3071       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3072                                      GET_MODE_SIZE (src_mode));
3073       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3074       emit_move_insn (gen_lowpart (dup_mode, dest),
3075                       gen_const_vec_duplicate (dup_mode, src));
3076       return true;
3077     }
3078
3079   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
3080   src = force_const_mem (src_mode, src);
3081   if (!src)
3082     return false;
3083
3084   /* Make sure that the address is legitimate.  */
3085   if (!aarch64_sve_ld1r_operand_p (src))
3086     {
3087       rtx addr = force_reg (Pmode, XEXP (src, 0));
3088       src = replace_equiv_address (src, addr);
3089     }
3090
3091   machine_mode mode = GET_MODE (dest);
3092   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3093   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3094   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3095   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3096   emit_insn (gen_rtx_SET (dest, src));
3097   return true;
3098 }
3099
3100 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3101    isn't a simple duplicate or series.  */
3102
3103 static void
3104 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3105 {
3106   machine_mode mode = GET_MODE (src);
3107   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3108   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3109   gcc_assert (npatterns > 1);
3110
3111   if (nelts_per_pattern == 1)
3112     {
3113       /* The constant is a repeating seqeuence of at least two elements,
3114          where the repeating elements occupy no more than 128 bits.
3115          Get an integer representation of the replicated value.  */
3116       scalar_int_mode int_mode;
3117       if (BYTES_BIG_ENDIAN)
3118         /* For now, always use LD1RQ to load the value on big-endian
3119            targets, since the handling of smaller integers includes a
3120            subreg that is semantically an element reverse.  */
3121         int_mode = TImode;
3122       else
3123         {
3124           unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3125           gcc_assert (int_bits <= 128);
3126           int_mode = int_mode_for_size (int_bits, 0).require ();
3127         }
3128       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3129       if (int_value
3130           && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3131         return;
3132     }
3133
3134   /* Expand each pattern individually.  */
3135   rtx_vector_builder builder;
3136   auto_vec<rtx, 16> vectors (npatterns);
3137   for (unsigned int i = 0; i < npatterns; ++i)
3138     {
3139       builder.new_vector (mode, 1, nelts_per_pattern);
3140       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3141         builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3142       vectors.quick_push (force_reg (mode, builder.build ()));
3143     }
3144
3145   /* Use permutes to interleave the separate vectors.  */
3146   while (npatterns > 1)
3147     {
3148       npatterns /= 2;
3149       for (unsigned int i = 0; i < npatterns; ++i)
3150         {
3151           rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3152           rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3153           emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3154           vectors[i] = tmp;
3155         }
3156     }
3157   gcc_assert (vectors[0] == dest);
3158 }
3159
3160 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
3161    is a pattern that can be used to set DEST to a replicated scalar
3162    element.  */
3163
3164 void
3165 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3166                               rtx (*gen_vec_duplicate) (rtx, rtx))
3167 {
3168   machine_mode mode = GET_MODE (dest);
3169
3170   /* Check on what type of symbol it is.  */
3171   scalar_int_mode int_mode;
3172   if ((GET_CODE (imm) == SYMBOL_REF
3173        || GET_CODE (imm) == LABEL_REF
3174        || GET_CODE (imm) == CONST
3175        || GET_CODE (imm) == CONST_POLY_INT)
3176       && is_a <scalar_int_mode> (mode, &int_mode))
3177     {
3178       rtx mem;
3179       poly_int64 offset;
3180       HOST_WIDE_INT const_offset;
3181       enum aarch64_symbol_type sty;
3182
3183       /* If we have (const (plus symbol offset)), separate out the offset
3184          before we start classifying the symbol.  */
3185       rtx base = strip_offset (imm, &offset);
3186
3187       /* We must always add an offset involving VL separately, rather than
3188          folding it into the relocation.  */
3189       if (!offset.is_constant (&const_offset))
3190         {
3191           if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3192             emit_insn (gen_rtx_SET (dest, imm));
3193           else
3194             {
3195               /* Do arithmetic on 32-bit values if the result is smaller
3196                  than that.  */
3197               if (partial_subreg_p (int_mode, SImode))
3198                 {
3199                   /* It is invalid to do symbol calculations in modes
3200                      narrower than SImode.  */
3201                   gcc_assert (base == const0_rtx);
3202                   dest = gen_lowpart (SImode, dest);
3203                   int_mode = SImode;
3204                 }
3205               if (base != const0_rtx)
3206                 {
3207                   base = aarch64_force_temporary (int_mode, dest, base);
3208                   aarch64_add_offset (int_mode, dest, base, offset,
3209                                       NULL_RTX, NULL_RTX, false);
3210                 }
3211               else
3212                 aarch64_add_offset (int_mode, dest, base, offset,
3213                                     dest, NULL_RTX, false);
3214             }
3215           return;
3216         }
3217
3218       sty = aarch64_classify_symbol (base, const_offset);
3219       switch (sty)
3220         {
3221         case SYMBOL_FORCE_TO_MEM:
3222           if (const_offset != 0
3223               && targetm.cannot_force_const_mem (int_mode, imm))
3224             {
3225               gcc_assert (can_create_pseudo_p ());
3226               base = aarch64_force_temporary (int_mode, dest, base);
3227               aarch64_add_offset (int_mode, dest, base, const_offset,
3228                                   NULL_RTX, NULL_RTX, false);
3229               return;
3230             }
3231
3232           mem = force_const_mem (ptr_mode, imm);
3233           gcc_assert (mem);
3234
3235           /* If we aren't generating PC relative literals, then
3236              we need to expand the literal pool access carefully.
3237              This is something that needs to be done in a number
3238              of places, so could well live as a separate function.  */
3239           if (!aarch64_pcrelative_literal_loads)
3240             {
3241               gcc_assert (can_create_pseudo_p ());
3242               base = gen_reg_rtx (ptr_mode);
3243               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3244               if (ptr_mode != Pmode)
3245                 base = convert_memory_address (Pmode, base);
3246               mem = gen_rtx_MEM (ptr_mode, base);
3247             }
3248
3249           if (int_mode != ptr_mode)
3250             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3251
3252           emit_insn (gen_rtx_SET (dest, mem));
3253
3254           return;
3255
3256         case SYMBOL_SMALL_TLSGD:
3257         case SYMBOL_SMALL_TLSDESC:
3258         case SYMBOL_SMALL_TLSIE:
3259         case SYMBOL_SMALL_GOT_28K:
3260         case SYMBOL_SMALL_GOT_4G:
3261         case SYMBOL_TINY_GOT:
3262         case SYMBOL_TINY_TLSIE:
3263           if (const_offset != 0)
3264             {
3265               gcc_assert(can_create_pseudo_p ());
3266               base = aarch64_force_temporary (int_mode, dest, base);
3267               aarch64_add_offset (int_mode, dest, base, const_offset,
3268                                   NULL_RTX, NULL_RTX, false);
3269               return;
3270             }
3271           /* FALLTHRU */
3272
3273         case SYMBOL_SMALL_ABSOLUTE:
3274         case SYMBOL_TINY_ABSOLUTE:
3275         case SYMBOL_TLSLE12:
3276         case SYMBOL_TLSLE24:
3277         case SYMBOL_TLSLE32:
3278         case SYMBOL_TLSLE48:
3279           aarch64_load_symref_appropriately (dest, imm, sty);
3280           return;
3281
3282         default:
3283           gcc_unreachable ();
3284         }
3285     }
3286
3287   if (!CONST_INT_P (imm))
3288     {
3289       rtx base, step, value;
3290       if (GET_CODE (imm) == HIGH
3291           || aarch64_simd_valid_immediate (imm, NULL))
3292         emit_insn (gen_rtx_SET (dest, imm));
3293       else if (const_vec_series_p (imm, &base, &step))
3294         aarch64_expand_vec_series (dest, base, step);
3295       else if (const_vec_duplicate_p (imm, &value))
3296         {
3297           /* If the constant is out of range of an SVE vector move,
3298              load it from memory if we can, otherwise move it into
3299              a register and use a DUP.  */
3300           scalar_mode inner_mode = GET_MODE_INNER (mode);
3301           rtx op = force_const_mem (inner_mode, value);
3302           if (!op)
3303             op = force_reg (inner_mode, value);
3304           else if (!aarch64_sve_ld1r_operand_p (op))
3305             {
3306               rtx addr = force_reg (Pmode, XEXP (op, 0));
3307               op = replace_equiv_address (op, addr);
3308             }
3309           emit_insn (gen_vec_duplicate (dest, op));
3310         }
3311       else if (GET_CODE (imm) == CONST_VECTOR
3312                && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3313         aarch64_expand_sve_const_vector (dest, imm);
3314       else
3315         {
3316           rtx mem = force_const_mem (mode, imm);
3317           gcc_assert (mem);
3318           emit_move_insn (dest, mem);
3319         }
3320
3321       return;
3322     }
3323
3324   aarch64_internal_mov_immediate (dest, imm, true,
3325                                   as_a <scalar_int_mode> (mode));
3326 }
3327
3328 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3329    that is known to contain PTRUE.  */
3330
3331 void
3332 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3333 {
3334   emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3335                                                 gen_rtvec (2, pred, src),
3336                                                 UNSPEC_MERGE_PTRUE)));
3337 }
3338
3339 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3340    operand is in memory.  In this case we need to use the predicated LD1
3341    and ST1 instead of LDR and STR, both for correctness on big-endian
3342    targets and because LD1 and ST1 support a wider range of addressing modes.
3343    PRED_MODE is the mode of the predicate.
3344
3345    See the comment at the head of aarch64-sve.md for details about the
3346    big-endian handling.  */
3347
3348 void
3349 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3350 {
3351   machine_mode mode = GET_MODE (dest);
3352   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3353   if (!register_operand (src, mode)
3354       && !register_operand (dest, mode))
3355     {
3356       rtx tmp = gen_reg_rtx (mode);
3357       if (MEM_P (src))
3358         aarch64_emit_sve_pred_move (tmp, ptrue, src);
3359       else
3360         emit_move_insn (tmp, src);
3361       src = tmp;
3362     }
3363   aarch64_emit_sve_pred_move (dest, ptrue, src);
3364 }
3365
3366 /* Called only on big-endian targets.  See whether an SVE vector move
3367    from SRC to DEST is effectively a REV[BHW] instruction, because at
3368    least one operand is a subreg of an SVE vector that has wider or
3369    narrower elements.  Return true and emit the instruction if so.
3370
3371    For example:
3372
3373      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3374
3375    represents a VIEW_CONVERT between the following vectors, viewed
3376    in memory order:
3377
3378      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3379      R1: { [0],      [1],      [2],      [3],     ... }
3380
3381    The high part of lane X in R2 should therefore correspond to lane X*2
3382    of R1, but the register representations are:
3383
3384          msb                                      lsb
3385      R2: ...... [1].high  [1].low   [0].high  [0].low
3386      R1: ...... [3]       [2]       [1]       [0]
3387
3388    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3389    We therefore need a reverse operation to swap the high and low values
3390    around.
3391
3392    This is purely an optimization.  Without it we would spill the
3393    subreg operand to the stack in one mode and reload it in the
3394    other mode, which has the same effect as the REV.  */
3395
3396 bool
3397 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3398 {
3399   gcc_assert (BYTES_BIG_ENDIAN);
3400   if (GET_CODE (dest) == SUBREG)
3401     dest = SUBREG_REG (dest);
3402   if (GET_CODE (src) == SUBREG)
3403     src = SUBREG_REG (src);
3404
3405   /* The optimization handles two single SVE REGs with different element
3406      sizes.  */
3407   if (!REG_P (dest)
3408       || !REG_P (src)
3409       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3410       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3411       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3412           == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3413     return false;
3414
3415   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3416   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3417   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3418                                UNSPEC_REV_SUBREG);
3419   emit_insn (gen_rtx_SET (dest, unspec));
3420   return true;
3421 }
3422
3423 /* Return a copy of X with mode MODE, without changing its other
3424    attributes.  Unlike gen_lowpart, this doesn't care whether the
3425    mode change is valid.  */
3426
3427 static rtx
3428 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3429 {
3430   if (GET_MODE (x) == mode)
3431     return x;
3432
3433   x = shallow_copy_rtx (x);
3434   set_mode_and_regno (x, mode, REGNO (x));
3435   return x;
3436 }
3437
3438 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3439    operands.  */
3440
3441 void
3442 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3443 {
3444   /* Decide which REV operation we need.  The mode with narrower elements
3445      determines the mode of the operands and the mode with the wider
3446      elements determines the reverse width.  */
3447   machine_mode mode_with_wider_elts = GET_MODE (dest);
3448   machine_mode mode_with_narrower_elts = GET_MODE (src);
3449   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3450       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3451     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3452
3453   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3454   unsigned int unspec;
3455   if (wider_bytes == 8)
3456     unspec = UNSPEC_REV64;
3457   else if (wider_bytes == 4)
3458     unspec = UNSPEC_REV32;
3459   else if (wider_bytes == 2)
3460     unspec = UNSPEC_REV16;
3461   else
3462     gcc_unreachable ();
3463   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3464
3465   /* Emit:
3466
3467        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3468                          UNSPEC_MERGE_PTRUE))
3469
3470      with the appropriate modes.  */
3471   ptrue = gen_lowpart (pred_mode, ptrue);
3472   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3473   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3474   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3475   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3476                         UNSPEC_MERGE_PTRUE);
3477   emit_insn (gen_rtx_SET (dest, src));
3478 }
3479
3480 static bool
3481 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3482                                  tree exp ATTRIBUTE_UNUSED)
3483 {
3484   if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3485     return false;
3486
3487   return true;
3488 }
3489
3490 /* Implement TARGET_PASS_BY_REFERENCE.  */
3491
3492 static bool
3493 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3494                            machine_mode mode,
3495                            const_tree type,
3496                            bool named ATTRIBUTE_UNUSED)
3497 {
3498   HOST_WIDE_INT size;
3499   machine_mode dummymode;
3500   int nregs;
3501
3502   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3503   if (mode == BLKmode && type)
3504     size = int_size_in_bytes (type);
3505   else
3506     /* No frontends can create types with variable-sized modes, so we
3507        shouldn't be asked to pass or return them.  */
3508     size = GET_MODE_SIZE (mode).to_constant ();
3509
3510   /* Aggregates are passed by reference based on their size.  */
3511   if (type && AGGREGATE_TYPE_P (type))
3512     {
3513       size = int_size_in_bytes (type);
3514     }
3515
3516   /* Variable sized arguments are always returned by reference.  */
3517   if (size < 0)
3518     return true;
3519
3520   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3521   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3522                                                &dummymode, &nregs,
3523                                                NULL))
3524     return false;
3525
3526   /* Arguments which are variable sized or larger than 2 registers are
3527      passed by reference unless they are a homogenous floating point
3528      aggregate.  */
3529   return size > 2 * UNITS_PER_WORD;
3530 }
3531
3532 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3533 static bool
3534 aarch64_return_in_msb (const_tree valtype)
3535 {
3536   machine_mode dummy_mode;
3537   int dummy_int;
3538
3539   /* Never happens in little-endian mode.  */
3540   if (!BYTES_BIG_ENDIAN)
3541     return false;
3542
3543   /* Only composite types smaller than or equal to 16 bytes can
3544      be potentially returned in registers.  */
3545   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3546       || int_size_in_bytes (valtype) <= 0
3547       || int_size_in_bytes (valtype) > 16)
3548     return false;
3549
3550   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3551      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3552      is always passed/returned in the least significant bits of fp/simd
3553      register(s).  */
3554   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3555                                                &dummy_mode, &dummy_int, NULL))
3556     return false;
3557
3558   return true;
3559 }
3560
3561 /* Implement TARGET_FUNCTION_VALUE.
3562    Define how to find the value returned by a function.  */
3563
3564 static rtx
3565 aarch64_function_value (const_tree type, const_tree func,
3566                         bool outgoing ATTRIBUTE_UNUSED)
3567 {
3568   machine_mode mode;
3569   int unsignedp;
3570   int count;
3571   machine_mode ag_mode;
3572
3573   mode = TYPE_MODE (type);
3574   if (INTEGRAL_TYPE_P (type))
3575     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3576
3577   if (aarch64_return_in_msb (type))
3578     {
3579       HOST_WIDE_INT size = int_size_in_bytes (type);
3580
3581       if (size % UNITS_PER_WORD != 0)
3582         {
3583           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3584           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3585         }
3586     }
3587
3588   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3589                                                &ag_mode, &count, NULL))
3590     {
3591       if (!aarch64_composite_type_p (type, mode))
3592         {
3593           gcc_assert (count == 1 && mode == ag_mode);
3594           return gen_rtx_REG (mode, V0_REGNUM);
3595         }
3596       else
3597         {
3598           int i;
3599           rtx par;
3600
3601           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3602           for (i = 0; i < count; i++)
3603             {
3604               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3605               rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3606               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3607               XVECEXP (par, 0, i) = tmp;
3608             }
3609           return par;
3610         }
3611     }
3612   else
3613     return gen_rtx_REG (mode, R0_REGNUM);
3614 }
3615
3616 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3617    Return true if REGNO is the number of a hard register in which the values
3618    of called function may come back.  */
3619
3620 static bool
3621 aarch64_function_value_regno_p (const unsigned int regno)
3622 {
3623   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3624      of 16-byte return values are: 128-bit integers and 16-byte small
3625      structures (excluding homogeneous floating-point aggregates).  */
3626   if (regno == R0_REGNUM || regno == R1_REGNUM)
3627     return true;
3628
3629   /* Up to four fp/simd registers can return a function value, e.g. a
3630      homogeneous floating-point aggregate having four members.  */
3631   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3632     return TARGET_FLOAT;
3633
3634   return false;
3635 }
3636
3637 /* Implement TARGET_RETURN_IN_MEMORY.
3638
3639    If the type T of the result of a function is such that
3640      void func (T arg)
3641    would require that arg be passed as a value in a register (or set of
3642    registers) according to the parameter passing rules, then the result
3643    is returned in the same registers as would be used for such an
3644    argument.  */
3645
3646 static bool
3647 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3648 {
3649   HOST_WIDE_INT size;
3650   machine_mode ag_mode;
3651   int count;
3652
3653   if (!AGGREGATE_TYPE_P (type)
3654       && TREE_CODE (type) != COMPLEX_TYPE
3655       && TREE_CODE (type) != VECTOR_TYPE)
3656     /* Simple scalar types always returned in registers.  */
3657     return false;
3658
3659   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3660                                                type,
3661                                                &ag_mode,
3662                                                &count,
3663                                                NULL))
3664     return false;
3665
3666   /* Types larger than 2 registers returned in memory.  */
3667   size = int_size_in_bytes (type);
3668   return (size < 0 || size > 2 * UNITS_PER_WORD);
3669 }
3670
3671 static bool
3672 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3673                                const_tree type, int *nregs)
3674 {
3675   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3676   return aarch64_vfp_is_call_or_return_candidate (mode,
3677                                                   type,
3678                                                   &pcum->aapcs_vfp_rmode,
3679                                                   nregs,
3680                                                   NULL);
3681 }
3682
3683 /* Given MODE and TYPE of a function argument, return the alignment in
3684    bits.  The idea is to suppress any stronger alignment requested by
3685    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3686    This is a helper function for local use only.  */
3687
3688 static unsigned int
3689 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3690 {
3691   if (!type)
3692     return GET_MODE_ALIGNMENT (mode);
3693
3694   if (integer_zerop (TYPE_SIZE (type)))
3695     return 0;
3696
3697   gcc_assert (TYPE_MODE (type) == mode);
3698
3699   if (!AGGREGATE_TYPE_P (type))
3700     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3701
3702   if (TREE_CODE (type) == ARRAY_TYPE)
3703     return TYPE_ALIGN (TREE_TYPE (type));
3704
3705   unsigned int alignment = 0;
3706   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3707     if (TREE_CODE (field) == FIELD_DECL)
3708       alignment = std::max (alignment, DECL_ALIGN (field));
3709
3710   return alignment;
3711 }
3712
3713 /* Layout a function argument according to the AAPCS64 rules.  The rule
3714    numbers refer to the rule numbers in the AAPCS64.  */
3715
3716 static void
3717 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3718                     const_tree type,
3719                     bool named ATTRIBUTE_UNUSED)
3720 {
3721   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3722   int ncrn, nvrn, nregs;
3723   bool allocate_ncrn, allocate_nvrn;
3724   HOST_WIDE_INT size;
3725
3726   /* We need to do this once per argument.  */
3727   if (pcum->aapcs_arg_processed)
3728     return;
3729
3730   pcum->aapcs_arg_processed = true;
3731
3732   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3733   if (type)
3734     size = int_size_in_bytes (type);
3735   else
3736     /* No frontends can create types with variable-sized modes, so we
3737        shouldn't be asked to pass or return them.  */
3738     size = GET_MODE_SIZE (mode).to_constant ();
3739   size = ROUND_UP (size, UNITS_PER_WORD);
3740
3741   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3742   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3743                                                  mode,
3744                                                  type,
3745                                                  &nregs);
3746
3747   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3748      The following code thus handles passing by SIMD/FP registers first.  */
3749
3750   nvrn = pcum->aapcs_nvrn;
3751
3752   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3753      and homogenous short-vector aggregates (HVA).  */
3754   if (allocate_nvrn)
3755     {
3756       if (!TARGET_FLOAT)
3757         aarch64_err_no_fpadvsimd (mode);
3758
3759       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3760         {
3761           pcum->aapcs_nextnvrn = nvrn + nregs;
3762           if (!aarch64_composite_type_p (type, mode))
3763             {
3764               gcc_assert (nregs == 1);
3765               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3766             }
3767           else
3768             {
3769               rtx par;
3770               int i;
3771               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3772               for (i = 0; i < nregs; i++)
3773                 {
3774                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3775                                          V0_REGNUM + nvrn + i);
3776                   rtx offset = gen_int_mode
3777                     (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3778                   tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3779                   XVECEXP (par, 0, i) = tmp;
3780                 }
3781               pcum->aapcs_reg = par;
3782             }
3783           return;
3784         }
3785       else
3786         {
3787           /* C.3 NSRN is set to 8.  */
3788           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3789           goto on_stack;
3790         }
3791     }
3792
3793   ncrn = pcum->aapcs_ncrn;
3794   nregs = size / UNITS_PER_WORD;
3795
3796   /* C6 - C9.  though the sign and zero extension semantics are
3797      handled elsewhere.  This is the case where the argument fits
3798      entirely general registers.  */
3799   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3800     {
3801
3802       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3803
3804       /* C.8 if the argument has an alignment of 16 then the NGRN is
3805          rounded up to the next even number.  */
3806       if (nregs == 2
3807           && ncrn % 2
3808           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3809              comparison is there because for > 16 * BITS_PER_UNIT
3810              alignment nregs should be > 2 and therefore it should be
3811              passed by reference rather than value.  */
3812           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3813         {
3814           ++ncrn;
3815           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3816         }
3817
3818       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3819          A reg is still generated for it, but the caller should be smart
3820          enough not to use it.  */
3821       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3822         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3823       else
3824         {
3825           rtx par;
3826           int i;
3827
3828           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3829           for (i = 0; i < nregs; i++)
3830             {
3831               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3832               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3833                                        GEN_INT (i * UNITS_PER_WORD));
3834               XVECEXP (par, 0, i) = tmp;
3835             }
3836           pcum->aapcs_reg = par;
3837         }
3838
3839       pcum->aapcs_nextncrn = ncrn + nregs;
3840       return;
3841     }
3842
3843   /* C.11  */
3844   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3845
3846   /* The argument is passed on stack; record the needed number of words for
3847      this argument and align the total size if necessary.  */
3848 on_stack:
3849   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3850
3851   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3852     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3853                                        16 / UNITS_PER_WORD);
3854   return;
3855 }
3856
3857 /* Implement TARGET_FUNCTION_ARG.  */
3858
3859 static rtx
3860 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3861                       const_tree type, bool named)
3862 {
3863   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3864   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3865
3866   if (mode == VOIDmode)
3867     return NULL_RTX;
3868
3869   aarch64_layout_arg (pcum_v, mode, type, named);
3870   return pcum->aapcs_reg;
3871 }
3872
3873 void
3874 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3875                            const_tree fntype ATTRIBUTE_UNUSED,
3876                            rtx libname ATTRIBUTE_UNUSED,
3877                            const_tree fndecl ATTRIBUTE_UNUSED,
3878                            unsigned n_named ATTRIBUTE_UNUSED)
3879 {
3880   pcum->aapcs_ncrn = 0;
3881   pcum->aapcs_nvrn = 0;
3882   pcum->aapcs_nextncrn = 0;
3883   pcum->aapcs_nextnvrn = 0;
3884   pcum->pcs_variant = ARM_PCS_AAPCS64;
3885   pcum->aapcs_reg = NULL_RTX;
3886   pcum->aapcs_arg_processed = false;
3887   pcum->aapcs_stack_words = 0;
3888   pcum->aapcs_stack_size = 0;
3889
3890   if (!TARGET_FLOAT
3891       && fndecl && TREE_PUBLIC (fndecl)
3892       && fntype && fntype != error_mark_node)
3893     {
3894       const_tree type = TREE_TYPE (fntype);
3895       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
3896       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
3897       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3898                                                    &mode, &nregs, NULL))
3899         aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3900     }
3901   return;
3902 }
3903
3904 static void
3905 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3906                               machine_mode mode,
3907                               const_tree type,
3908                               bool named)
3909 {
3910   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3911   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3912     {
3913       aarch64_layout_arg (pcum_v, mode, type, named);
3914       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3915                   != (pcum->aapcs_stack_words != 0));
3916       pcum->aapcs_arg_processed = false;
3917       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3918       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3919       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3920       pcum->aapcs_stack_words = 0;
3921       pcum->aapcs_reg = NULL_RTX;
3922     }
3923 }
3924
3925 bool
3926 aarch64_function_arg_regno_p (unsigned regno)
3927 {
3928   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3929           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3930 }
3931
3932 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3933    PARM_BOUNDARY bits of alignment, but will be given anything up
3934    to STACK_BOUNDARY bits if the type requires it.  This makes sure
3935    that both before and after the layout of each argument, the Next
3936    Stacked Argument Address (NSAA) will have a minimum alignment of
3937    8 bytes.  */
3938
3939 static unsigned int
3940 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3941 {
3942   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3943   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3944 }
3945
3946 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
3947
3948 static fixed_size_mode
3949 aarch64_get_reg_raw_mode (int regno)
3950 {
3951   if (TARGET_SVE && FP_REGNUM_P (regno))
3952     /* Don't use the SVE part of the register for __builtin_apply and
3953        __builtin_return.  The SVE registers aren't used by the normal PCS,
3954        so using them there would be a waste of time.  The PCS extensions
3955        for SVE types are fundamentally incompatible with the
3956        __builtin_return/__builtin_apply interface.  */
3957     return as_a <fixed_size_mode> (V16QImode);
3958   return default_get_reg_raw_mode (regno);
3959 }
3960
3961 /* Implement TARGET_FUNCTION_ARG_PADDING.
3962
3963    Small aggregate types are placed in the lowest memory address.
3964
3965    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
3966
3967 static pad_direction
3968 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3969 {
3970   /* On little-endian targets, the least significant byte of every stack
3971      argument is passed at the lowest byte address of the stack slot.  */
3972   if (!BYTES_BIG_ENDIAN)
3973     return PAD_UPWARD;
3974
3975   /* Otherwise, integral, floating-point and pointer types are padded downward:
3976      the least significant byte of a stack argument is passed at the highest
3977      byte address of the stack slot.  */
3978   if (type
3979       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3980          || POINTER_TYPE_P (type))
3981       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3982     return PAD_DOWNWARD;
3983
3984   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
3985   return PAD_UPWARD;
3986 }
3987
3988 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3989
3990    It specifies padding for the last (may also be the only)
3991    element of a block move between registers and memory.  If
3992    assuming the block is in the memory, padding upward means that
3993    the last element is padded after its highest significant byte,
3994    while in downward padding, the last element is padded at the
3995    its least significant byte side.
3996
3997    Small aggregates and small complex types are always padded
3998    upwards.
3999
4000    We don't need to worry about homogeneous floating-point or
4001    short-vector aggregates; their move is not affected by the
4002    padding direction determined here.  Regardless of endianness,
4003    each element of such an aggregate is put in the least
4004    significant bits of a fp/simd register.
4005
4006    Return !BYTES_BIG_ENDIAN if the least significant byte of the
4007    register has useful data, and return the opposite if the most
4008    significant byte does.  */
4009
4010 bool
4011 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4012                      bool first ATTRIBUTE_UNUSED)
4013 {
4014
4015   /* Small composite types are always padded upward.  */
4016   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4017     {
4018       HOST_WIDE_INT size;
4019       if (type)
4020         size = int_size_in_bytes (type);
4021       else
4022         /* No frontends can create types with variable-sized modes, so we
4023            shouldn't be asked to pass or return them.  */
4024         size = GET_MODE_SIZE (mode).to_constant ();
4025       if (size < 2 * UNITS_PER_WORD)
4026         return true;
4027     }
4028
4029   /* Otherwise, use the default padding.  */
4030   return !BYTES_BIG_ENDIAN;
4031 }
4032
4033 static scalar_int_mode
4034 aarch64_libgcc_cmp_return_mode (void)
4035 {
4036   return SImode;
4037 }
4038
4039 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4040
4041 /* We use the 12-bit shifted immediate arithmetic instructions so values
4042    must be multiple of (1 << 12), i.e. 4096.  */
4043 #define ARITH_FACTOR 4096
4044
4045 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4046 #error Cannot use simple address calculation for stack probing
4047 #endif
4048
4049 /* The pair of scratch registers used for stack probing.  */
4050 #define PROBE_STACK_FIRST_REG  R9_REGNUM
4051 #define PROBE_STACK_SECOND_REG R10_REGNUM
4052
4053 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4054    inclusive.  These are offsets from the current stack pointer.  */
4055
4056 static void
4057 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4058 {
4059   HOST_WIDE_INT size;
4060   if (!poly_size.is_constant (&size))
4061     {
4062       sorry ("stack probes for SVE frames");
4063       return;
4064     }
4065
4066   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4067
4068   /* See the same assertion on PROBE_INTERVAL above.  */
4069   gcc_assert ((first % ARITH_FACTOR) == 0);
4070
4071   /* See if we have a constant small number of probes to generate.  If so,
4072      that's the easy case.  */
4073   if (size <= PROBE_INTERVAL)
4074     {
4075       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4076
4077       emit_set_insn (reg1,
4078                      plus_constant (Pmode,
4079                                     stack_pointer_rtx, -(first + base)));
4080       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4081     }
4082
4083   /* The run-time loop is made up of 8 insns in the generic case while the
4084      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
4085   else if (size <= 4 * PROBE_INTERVAL)
4086     {
4087       HOST_WIDE_INT i, rem;
4088
4089       emit_set_insn (reg1,
4090                      plus_constant (Pmode,
4091                                     stack_pointer_rtx,
4092                                     -(first + PROBE_INTERVAL)));
4093       emit_stack_probe (reg1);
4094
4095       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4096          it exceeds SIZE.  If only two probes are needed, this will not
4097          generate any code.  Then probe at FIRST + SIZE.  */
4098       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4099         {
4100           emit_set_insn (reg1,
4101                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4102           emit_stack_probe (reg1);
4103         }
4104
4105       rem = size - (i - PROBE_INTERVAL);
4106       if (rem > 256)
4107         {
4108           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4109
4110           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4111           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4112         }
4113       else
4114         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4115     }
4116
4117   /* Otherwise, do the same as above, but in a loop.  Note that we must be
4118      extra careful with variables wrapping around because we might be at
4119      the very top (or the very bottom) of the address space and we have
4120      to be able to handle this case properly; in particular, we use an
4121      equality test for the loop condition.  */
4122   else
4123     {
4124       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4125
4126       /* Step 1: round SIZE to the previous multiple of the interval.  */
4127
4128       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4129
4130
4131       /* Step 2: compute initial and final value of the loop counter.  */
4132
4133       /* TEST_ADDR = SP + FIRST.  */
4134       emit_set_insn (reg1,
4135                      plus_constant (Pmode, stack_pointer_rtx, -first));
4136
4137       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
4138       HOST_WIDE_INT adjustment = - (first + rounded_size);
4139       if (! aarch64_uimm12_shift (adjustment))
4140         {
4141           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4142                                           true, Pmode);
4143           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4144         }
4145       else
4146         emit_set_insn (reg2,
4147                        plus_constant (Pmode, stack_pointer_rtx, adjustment));
4148
4149       /* Step 3: the loop
4150
4151          do
4152            {
4153              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4154              probe at TEST_ADDR
4155            }
4156          while (TEST_ADDR != LAST_ADDR)
4157
4158          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4159          until it is equal to ROUNDED_SIZE.  */
4160
4161       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4162
4163
4164       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4165          that SIZE is equal to ROUNDED_SIZE.  */
4166
4167       if (size != rounded_size)
4168         {
4169           HOST_WIDE_INT rem = size - rounded_size;
4170
4171           if (rem > 256)
4172             {
4173               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4174
4175               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4176               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4177             }
4178           else
4179             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4180         }
4181     }
4182
4183   /* Make sure nothing is scheduled before we are done.  */
4184   emit_insn (gen_blockage ());
4185 }
4186
4187 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
4188    absolute addresses.  */
4189
4190 const char *
4191 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4192 {
4193   static int labelno = 0;
4194   char loop_lab[32];
4195   rtx xops[2];
4196
4197   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4198
4199   /* Loop.  */
4200   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4201
4202   HOST_WIDE_INT stack_clash_probe_interval
4203     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4204
4205   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
4206   xops[0] = reg1;
4207   HOST_WIDE_INT interval;
4208   if (flag_stack_clash_protection)
4209     interval = stack_clash_probe_interval;
4210   else
4211     interval = PROBE_INTERVAL;
4212
4213   gcc_assert (aarch64_uimm12_shift (interval));
4214   xops[1] = GEN_INT (interval);
4215
4216   output_asm_insn ("sub\t%0, %0, %1", xops);
4217
4218   /* If doing stack clash protection then we probe up by the ABI specified
4219      amount.  We do this because we're dropping full pages at a time in the
4220      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
4221   if (flag_stack_clash_protection)
4222     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4223   else
4224     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4225
4226   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
4227      by this amount for each iteration.  */
4228   output_asm_insn ("str\txzr, [%0, %1]", xops);
4229
4230   /* Test if TEST_ADDR == LAST_ADDR.  */
4231   xops[1] = reg2;
4232   output_asm_insn ("cmp\t%0, %1", xops);
4233
4234   /* Branch.  */
4235   fputs ("\tb.ne\t", asm_out_file);
4236   assemble_name_raw (asm_out_file, loop_lab);
4237   fputc ('\n', asm_out_file);
4238
4239   return "";
4240 }
4241
4242 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4243    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4244    of GUARD_SIZE.  When a probe is emitted it is done at most
4245    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4246    at most MIN_PROBE_THRESHOLD.  By the end of this function
4247    BASE = BASE - ADJUSTMENT.  */
4248
4249 const char *
4250 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4251                                       rtx min_probe_threshold, rtx guard_size)
4252 {
4253   /* This function is not allowed to use any instruction generation function
4254      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
4255      so instead emit the code you want using output_asm_insn.  */
4256   gcc_assert (flag_stack_clash_protection);
4257   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4258   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4259
4260   /* The minimum required allocation before the residual requires probing.  */
4261   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4262
4263   /* Clamp the value down to the nearest value that can be used with a cmp.  */
4264   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4265   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4266
4267   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4268   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4269
4270   static int labelno = 0;
4271   char loop_start_lab[32];
4272   char loop_end_lab[32];
4273   rtx xops[2];
4274
4275   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4276   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4277
4278   /* Emit loop start label.  */
4279   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4280
4281   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
4282   xops[0] = adjustment;
4283   xops[1] = probe_offset_value_rtx;
4284   output_asm_insn ("cmp\t%0, %1", xops);
4285
4286   /* Branch to end if not enough adjustment to probe.  */
4287   fputs ("\tb.lt\t", asm_out_file);
4288   assemble_name_raw (asm_out_file, loop_end_lab);
4289   fputc ('\n', asm_out_file);
4290
4291   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
4292   xops[0] = base;
4293   xops[1] = probe_offset_value_rtx;
4294   output_asm_insn ("sub\t%0, %0, %1", xops);
4295
4296   /* Probe at BASE.  */
4297   xops[1] = const0_rtx;
4298   output_asm_insn ("str\txzr, [%0, %1]", xops);
4299
4300   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
4301   xops[0] = adjustment;
4302   xops[1] = probe_offset_value_rtx;
4303   output_asm_insn ("sub\t%0, %0, %1", xops);
4304
4305   /* Branch to start if still more bytes to allocate.  */
4306   fputs ("\tb\t", asm_out_file);
4307   assemble_name_raw (asm_out_file, loop_start_lab);
4308   fputc ('\n', asm_out_file);
4309
4310   /* No probe leave.  */
4311   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4312
4313   /* BASE = BASE - ADJUSTMENT.  */
4314   xops[0] = base;
4315   xops[1] = adjustment;
4316   output_asm_insn ("sub\t%0, %0, %1", xops);
4317   return "";
4318 }
4319
4320 /* Determine whether a frame chain needs to be generated.  */
4321 static bool
4322 aarch64_needs_frame_chain (void)
4323 {
4324   /* Force a frame chain for EH returns so the return address is at FP+8.  */
4325   if (frame_pointer_needed || crtl->calls_eh_return)
4326     return true;
4327
4328   /* A leaf function cannot have calls or write LR.  */
4329   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4330
4331   /* Don't use a frame chain in leaf functions if leaf frame pointers
4332      are disabled.  */
4333   if (flag_omit_leaf_frame_pointer && is_leaf)
4334     return false;
4335
4336   return aarch64_use_frame_pointer;
4337 }
4338
4339 /* Mark the registers that need to be saved by the callee and calculate
4340    the size of the callee-saved registers area and frame record (both FP
4341    and LR may be omitted).  */
4342 static void
4343 aarch64_layout_frame (void)
4344 {
4345   HOST_WIDE_INT offset = 0;
4346   int regno, last_fp_reg = INVALID_REGNUM;
4347   bool simd_function = aarch64_simd_decl_p (cfun->decl);
4348
4349   cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4350
4351   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
4352      the mid-end is doing.  */
4353   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4354
4355 #define SLOT_NOT_REQUIRED (-2)
4356 #define SLOT_REQUIRED     (-1)
4357
4358   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4359   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4360
4361   /* If this is a non-leaf simd function with calls we assume that
4362      at least one of those calls is to a non-simd function and thus
4363      we must save V8 to V23 in the prologue.  */
4364
4365   if (simd_function && !crtl->is_leaf)
4366     {
4367       for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4368         if (FP_SIMD_SAVED_REGNUM_P (regno))
4369           df_set_regs_ever_live (regno, true);
4370     }
4371
4372   /* First mark all the registers that really need to be saved...  */
4373   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4374     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4375
4376   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4377     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4378
4379   /* ... that includes the eh data registers (if needed)...  */
4380   if (crtl->calls_eh_return)
4381     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4382       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4383         = SLOT_REQUIRED;
4384
4385   /* ... and any callee saved register that dataflow says is live.  */
4386   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4387     if (df_regs_ever_live_p (regno)
4388         && (regno == R30_REGNUM
4389             || !call_used_regs[regno]))
4390       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4391
4392   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4393     if (df_regs_ever_live_p (regno)
4394         && (!call_used_regs[regno]
4395             || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4396       {
4397         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4398         last_fp_reg = regno;
4399       }
4400
4401   if (cfun->machine->frame.emit_frame_chain)
4402     {
4403       /* FP and LR are placed in the linkage record.  */
4404       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4405       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4406       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4407       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4408       offset = 2 * UNITS_PER_WORD;
4409     }
4410
4411   /* With stack-clash, LR must be saved in non-leaf functions.  */
4412   gcc_assert (crtl->is_leaf
4413               || (cfun->machine->frame.reg_offset[R30_REGNUM]
4414                   != SLOT_NOT_REQUIRED));
4415
4416   /* Now assign stack slots for them.  */
4417   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4418     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4419       {
4420         cfun->machine->frame.reg_offset[regno] = offset;
4421         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4422           cfun->machine->frame.wb_candidate1 = regno;
4423         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4424           cfun->machine->frame.wb_candidate2 = regno;
4425         offset += UNITS_PER_WORD;
4426       }
4427
4428   HOST_WIDE_INT max_int_offset = offset;
4429   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4430   bool has_align_gap = offset != max_int_offset;
4431
4432   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4433     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4434       {
4435         /* If there is an alignment gap between integer and fp callee-saves,
4436            allocate the last fp register to it if possible.  */
4437         if (regno == last_fp_reg
4438             && has_align_gap
4439             && !simd_function
4440             && (offset & 8) == 0)
4441           {
4442             cfun->machine->frame.reg_offset[regno] = max_int_offset;
4443             break;
4444           }
4445
4446         cfun->machine->frame.reg_offset[regno] = offset;
4447         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4448           cfun->machine->frame.wb_candidate1 = regno;
4449         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4450                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4451           cfun->machine->frame.wb_candidate2 = regno;
4452         offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4453       }
4454
4455   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4456
4457   cfun->machine->frame.saved_regs_size = offset;
4458
4459   HOST_WIDE_INT varargs_and_saved_regs_size
4460     = offset + cfun->machine->frame.saved_varargs_size;
4461
4462   cfun->machine->frame.hard_fp_offset
4463     = aligned_upper_bound (varargs_and_saved_regs_size
4464                            + get_frame_size (),
4465                            STACK_BOUNDARY / BITS_PER_UNIT);
4466
4467   /* Both these values are already aligned.  */
4468   gcc_assert (multiple_p (crtl->outgoing_args_size,
4469                           STACK_BOUNDARY / BITS_PER_UNIT));
4470   cfun->machine->frame.frame_size
4471     = (cfun->machine->frame.hard_fp_offset
4472        + crtl->outgoing_args_size);
4473
4474   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4475
4476   cfun->machine->frame.initial_adjust = 0;
4477   cfun->machine->frame.final_adjust = 0;
4478   cfun->machine->frame.callee_adjust = 0;
4479   cfun->machine->frame.callee_offset = 0;
4480
4481   HOST_WIDE_INT max_push_offset = 0;
4482   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4483     max_push_offset = 512;
4484   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4485     max_push_offset = 256;
4486
4487   HOST_WIDE_INT const_size, const_fp_offset;
4488   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4489       && const_size < max_push_offset
4490       && known_eq (crtl->outgoing_args_size, 0))
4491     {
4492       /* Simple, small frame with no outgoing arguments:
4493          stp reg1, reg2, [sp, -frame_size]!
4494          stp reg3, reg4, [sp, 16]  */
4495       cfun->machine->frame.callee_adjust = const_size;
4496     }
4497   else if (known_lt (crtl->outgoing_args_size
4498                      + cfun->machine->frame.saved_regs_size, 512)
4499            && !(cfun->calls_alloca
4500                 && known_lt (cfun->machine->frame.hard_fp_offset,
4501                              max_push_offset)))
4502     {
4503       /* Frame with small outgoing arguments:
4504          sub sp, sp, frame_size
4505          stp reg1, reg2, [sp, outgoing_args_size]
4506          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4507       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4508       cfun->machine->frame.callee_offset
4509         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4510     }
4511   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4512            && const_fp_offset < max_push_offset)
4513     {
4514       /* Frame with large outgoing arguments but a small local area:
4515          stp reg1, reg2, [sp, -hard_fp_offset]!
4516          stp reg3, reg4, [sp, 16]
4517          sub sp, sp, outgoing_args_size  */
4518       cfun->machine->frame.callee_adjust = const_fp_offset;
4519       cfun->machine->frame.final_adjust
4520         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4521     }
4522   else
4523     {
4524       /* Frame with large local area and outgoing arguments using frame pointer:
4525          sub sp, sp, hard_fp_offset
4526          stp x29, x30, [sp, 0]
4527          add x29, sp, 0
4528          stp reg3, reg4, [sp, 16]
4529          sub sp, sp, outgoing_args_size  */
4530       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4531       cfun->machine->frame.final_adjust
4532         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4533     }
4534
4535   cfun->machine->frame.laid_out = true;
4536 }
4537
4538 /* Return true if the register REGNO is saved on entry to
4539    the current function.  */
4540
4541 static bool
4542 aarch64_register_saved_on_entry (int regno)
4543 {
4544   return cfun->machine->frame.reg_offset[regno] >= 0;
4545 }
4546
4547 /* Return the next register up from REGNO up to LIMIT for the callee
4548    to save.  */
4549
4550 static unsigned
4551 aarch64_next_callee_save (unsigned regno, unsigned limit)
4552 {
4553   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4554     regno ++;
4555   return regno;
4556 }
4557
4558 /* Push the register number REGNO of mode MODE to the stack with write-back
4559    adjusting the stack by ADJUSTMENT.  */
4560
4561 static void
4562 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4563                            HOST_WIDE_INT adjustment)
4564  {
4565   rtx base_rtx = stack_pointer_rtx;
4566   rtx insn, reg, mem;
4567
4568   reg = gen_rtx_REG (mode, regno);
4569   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4570                             plus_constant (Pmode, base_rtx, -adjustment));
4571   mem = gen_frame_mem (mode, mem);
4572
4573   insn = emit_move_insn (mem, reg);
4574   RTX_FRAME_RELATED_P (insn) = 1;
4575 }
4576
4577 /* Generate and return an instruction to store the pair of registers
4578    REG and REG2 of mode MODE to location BASE with write-back adjusting
4579    the stack location BASE by ADJUSTMENT.  */
4580
4581 static rtx
4582 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4583                           HOST_WIDE_INT adjustment)
4584 {
4585   switch (mode)
4586     {
4587     case E_DImode:
4588       return gen_storewb_pairdi_di (base, base, reg, reg2,
4589                                     GEN_INT (-adjustment),
4590                                     GEN_INT (UNITS_PER_WORD - adjustment));
4591     case E_DFmode:
4592       return gen_storewb_pairdf_di (base, base, reg, reg2,
4593                                     GEN_INT (-adjustment),
4594                                     GEN_INT (UNITS_PER_WORD - adjustment));
4595     case E_TFmode:
4596       return gen_storewb_pairtf_di (base, base, reg, reg2,
4597                                     GEN_INT (-adjustment),
4598                                     GEN_INT (UNITS_PER_VREG - adjustment));
4599     default:
4600       gcc_unreachable ();
4601     }
4602 }
4603
4604 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4605    stack pointer by ADJUSTMENT.  */
4606
4607 static void
4608 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4609 {
4610   rtx_insn *insn;
4611   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4612
4613   if (regno2 == INVALID_REGNUM)
4614     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4615
4616   rtx reg1 = gen_rtx_REG (mode, regno1);
4617   rtx reg2 = gen_rtx_REG (mode, regno2);
4618
4619   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4620                                               reg2, adjustment));
4621   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4622   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4623   RTX_FRAME_RELATED_P (insn) = 1;
4624 }
4625
4626 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4627    adjusting it by ADJUSTMENT afterwards.  */
4628
4629 static rtx
4630 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4631                          HOST_WIDE_INT adjustment)
4632 {
4633   switch (mode)
4634     {
4635     case E_DImode:
4636       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4637                                    GEN_INT (UNITS_PER_WORD));
4638     case E_DFmode:
4639       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4640                                    GEN_INT (UNITS_PER_WORD));
4641     case E_TFmode:
4642       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4643                                    GEN_INT (UNITS_PER_VREG));
4644     default:
4645       gcc_unreachable ();
4646     }
4647 }
4648
4649 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4650    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4651    into CFI_OPS.  */
4652
4653 static void
4654 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4655                   rtx *cfi_ops)
4656 {
4657   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4658   rtx reg1 = gen_rtx_REG (mode, regno1);
4659
4660   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4661
4662   if (regno2 == INVALID_REGNUM)
4663     {
4664       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4665       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4666       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4667     }
4668   else
4669     {
4670       rtx reg2 = gen_rtx_REG (mode, regno2);
4671       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4672       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4673                                           reg2, adjustment));
4674     }
4675 }
4676
4677 /* Generate and return a store pair instruction of mode MODE to store
4678    register REG1 to MEM1 and register REG2 to MEM2.  */
4679
4680 static rtx
4681 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4682                         rtx reg2)
4683 {
4684   switch (mode)
4685     {
4686     case E_DImode:
4687       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4688
4689     case E_DFmode:
4690       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4691
4692     case E_TFmode:
4693       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4694
4695     default:
4696       gcc_unreachable ();
4697     }
4698 }
4699
4700 /* Generate and regurn a load pair isntruction of mode MODE to load register
4701    REG1 from MEM1 and register REG2 from MEM2.  */
4702
4703 static rtx
4704 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4705                        rtx mem2)
4706 {
4707   switch (mode)
4708     {
4709     case E_DImode:
4710       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4711
4712     case E_DFmode:
4713       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4714
4715     case E_TFmode:
4716       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4717
4718     default:
4719       gcc_unreachable ();
4720     }
4721 }
4722
4723 /* Return TRUE if return address signing should be enabled for the current
4724    function, otherwise return FALSE.  */
4725
4726 bool
4727 aarch64_return_address_signing_enabled (void)
4728 {
4729   /* This function should only be called after frame laid out.   */
4730   gcc_assert (cfun->machine->frame.laid_out);
4731
4732   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4733      if it's LR is pushed onto stack.  */
4734   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4735           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4736               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4737 }
4738
4739 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
4740 bool
4741 aarch64_bti_enabled (void)
4742 {
4743   return (aarch64_enable_bti == 1);
4744 }
4745
4746 /* Emit code to save the callee-saved registers from register number START
4747    to LIMIT to the stack at the location starting at offset START_OFFSET,
4748    skipping any write-back candidates if SKIP_WB is true.  */
4749
4750 static void
4751 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4752                            unsigned start, unsigned limit, bool skip_wb)
4753 {
4754   rtx_insn *insn;
4755   unsigned regno;
4756   unsigned regno2;
4757
4758   for (regno = aarch64_next_callee_save (start, limit);
4759        regno <= limit;
4760        regno = aarch64_next_callee_save (regno + 1, limit))
4761     {
4762       rtx reg, mem;
4763       poly_int64 offset;
4764       int offset_diff;
4765
4766       if (skip_wb
4767           && (regno == cfun->machine->frame.wb_candidate1
4768               || regno == cfun->machine->frame.wb_candidate2))
4769         continue;
4770
4771       if (cfun->machine->reg_is_wrapped_separately[regno])
4772        continue;
4773
4774       reg = gen_rtx_REG (mode, regno);
4775       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4776       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4777                                                 offset));
4778
4779       regno2 = aarch64_next_callee_save (regno + 1, limit);
4780       offset_diff = cfun->machine->frame.reg_offset[regno2]
4781                     - cfun->machine->frame.reg_offset[regno];
4782
4783       if (regno2 <= limit
4784           && !cfun->machine->reg_is_wrapped_separately[regno2]
4785           && known_eq (GET_MODE_SIZE (mode), offset_diff))
4786         {
4787           rtx reg2 = gen_rtx_REG (mode, regno2);
4788           rtx mem2;
4789
4790           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4791           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4792                                                      offset));
4793           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4794                                                     reg2));
4795
4796           /* The first part of a frame-related parallel insn is
4797              always assumed to be relevant to the frame
4798              calculations; subsequent parts, are only
4799              frame-related if explicitly marked.  */
4800           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4801           regno = regno2;
4802         }
4803       else
4804         insn = emit_move_insn (mem, reg);
4805
4806       RTX_FRAME_RELATED_P (insn) = 1;
4807     }
4808 }
4809
4810 /* Emit code to restore the callee registers of mode MODE from register
4811    number START up to and including LIMIT.  Restore from the stack offset
4812    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4813    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4814
4815 static void
4816 aarch64_restore_callee_saves (machine_mode mode,
4817                               poly_int64 start_offset, unsigned start,
4818                               unsigned limit, bool skip_wb, rtx *cfi_ops)
4819 {
4820   rtx base_rtx = stack_pointer_rtx;
4821   unsigned regno;
4822   unsigned regno2;
4823   poly_int64 offset;
4824
4825   for (regno = aarch64_next_callee_save (start, limit);
4826        regno <= limit;
4827        regno = aarch64_next_callee_save (regno + 1, limit))
4828     {
4829       if (cfun->machine->reg_is_wrapped_separately[regno])
4830        continue;
4831
4832       rtx reg, mem;
4833       int offset_diff;
4834
4835       if (skip_wb
4836           && (regno == cfun->machine->frame.wb_candidate1
4837               || regno == cfun->machine->frame.wb_candidate2))
4838         continue;
4839
4840       reg = gen_rtx_REG (mode, regno);
4841       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4842       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4843
4844       regno2 = aarch64_next_callee_save (regno + 1, limit);
4845       offset_diff = cfun->machine->frame.reg_offset[regno2]
4846                     - cfun->machine->frame.reg_offset[regno];
4847
4848       if (regno2 <= limit
4849           && !cfun->machine->reg_is_wrapped_separately[regno2]
4850           && known_eq (GET_MODE_SIZE (mode), offset_diff))
4851         {
4852           rtx reg2 = gen_rtx_REG (mode, regno2);
4853           rtx mem2;
4854
4855           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4856           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4857           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4858
4859           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4860           regno = regno2;
4861         }
4862       else
4863         emit_move_insn (reg, mem);
4864       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4865     }
4866 }
4867
4868 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4869    of MODE.  */
4870
4871 static inline bool
4872 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4873 {
4874   HOST_WIDE_INT multiple;
4875   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4876           && IN_RANGE (multiple, -8, 7));
4877 }
4878
4879 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4880    of MODE.  */
4881
4882 static inline bool
4883 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4884 {
4885   HOST_WIDE_INT multiple;
4886   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4887           && IN_RANGE (multiple, 0, 63));
4888 }
4889
4890 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4891    of MODE.  */
4892
4893 bool
4894 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4895 {
4896   HOST_WIDE_INT multiple;
4897   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4898           && IN_RANGE (multiple, -64, 63));
4899 }
4900
4901 /* Return true if OFFSET is a signed 9-bit value.  */
4902
4903 bool
4904 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4905                                        poly_int64 offset)
4906 {
4907   HOST_WIDE_INT const_offset;
4908   return (offset.is_constant (&const_offset)
4909           && IN_RANGE (const_offset, -256, 255));
4910 }
4911
4912 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4913    of MODE.  */
4914
4915 static inline bool
4916 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4917 {
4918   HOST_WIDE_INT multiple;
4919   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4920           && IN_RANGE (multiple, -256, 255));
4921 }
4922
4923 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4924    of MODE.  */
4925
4926 static inline bool
4927 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4928 {
4929   HOST_WIDE_INT multiple;
4930   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4931           && IN_RANGE (multiple, 0, 4095));
4932 }
4933
4934 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
4935
4936 static sbitmap
4937 aarch64_get_separate_components (void)
4938 {
4939   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4940   bitmap_clear (components);
4941
4942   /* The registers we need saved to the frame.  */
4943   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4944     if (aarch64_register_saved_on_entry (regno))
4945       {
4946         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4947         if (!frame_pointer_needed)
4948           offset += cfun->machine->frame.frame_size
4949                     - cfun->machine->frame.hard_fp_offset;
4950         /* Check that we can access the stack slot of the register with one
4951            direct load with no adjustments needed.  */
4952         if (offset_12bit_unsigned_scaled_p (DImode, offset))
4953           bitmap_set_bit (components, regno);
4954       }
4955
4956   /* Don't mess with the hard frame pointer.  */
4957   if (frame_pointer_needed)
4958     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4959
4960   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4961   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4962   /* If registers have been chosen to be stored/restored with
4963      writeback don't interfere with them to avoid having to output explicit
4964      stack adjustment instructions.  */
4965   if (reg2 != INVALID_REGNUM)
4966     bitmap_clear_bit (components, reg2);
4967   if (reg1 != INVALID_REGNUM)
4968     bitmap_clear_bit (components, reg1);
4969
4970   bitmap_clear_bit (components, LR_REGNUM);
4971   bitmap_clear_bit (components, SP_REGNUM);
4972
4973   return components;
4974 }
4975
4976 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
4977
4978 static sbitmap
4979 aarch64_components_for_bb (basic_block bb)
4980 {
4981   bitmap in = DF_LIVE_IN (bb);
4982   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4983   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4984   bool simd_function = aarch64_simd_decl_p (cfun->decl);
4985
4986   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4987   bitmap_clear (components);
4988
4989   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
4990   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4991     if ((!call_used_regs[regno]
4992         || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
4993        && (bitmap_bit_p (in, regno)
4994            || bitmap_bit_p (gen, regno)
4995            || bitmap_bit_p (kill, regno)))
4996       {
4997         unsigned regno2, offset, offset2;
4998         bitmap_set_bit (components, regno);
4999
5000         /* If there is a callee-save at an adjacent offset, add it too
5001            to increase the use of LDP/STP.  */
5002         offset = cfun->machine->frame.reg_offset[regno];
5003         regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5004
5005         if (regno2 <= LAST_SAVED_REGNUM)
5006           {
5007             offset2 = cfun->machine->frame.reg_offset[regno2];
5008             if ((offset & ~8) == (offset2 & ~8))
5009               bitmap_set_bit (components, regno2);
5010           }
5011       }
5012
5013   return components;
5014 }
5015
5016 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5017    Nothing to do for aarch64.  */
5018
5019 static void
5020 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5021 {
5022 }
5023
5024 /* Return the next set bit in BMP from START onwards.  Return the total number
5025    of bits in BMP if no set bit is found at or after START.  */
5026
5027 static unsigned int
5028 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5029 {
5030   unsigned int nbits = SBITMAP_SIZE (bmp);
5031   if (start == nbits)
5032     return start;
5033
5034   gcc_assert (start < nbits);
5035   for (unsigned int i = start; i < nbits; i++)
5036     if (bitmap_bit_p (bmp, i))
5037       return i;
5038
5039   return nbits;
5040 }
5041
5042 /* Do the work for aarch64_emit_prologue_components and
5043    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
5044    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5045    for these components or the epilogue sequence.  That is, it determines
5046    whether we should emit stores or loads and what kind of CFA notes to attach
5047    to the insns.  Otherwise the logic for the two sequences is very
5048    similar.  */
5049
5050 static void
5051 aarch64_process_components (sbitmap components, bool prologue_p)
5052 {
5053   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5054                              ? HARD_FRAME_POINTER_REGNUM
5055                              : STACK_POINTER_REGNUM);
5056
5057   unsigned last_regno = SBITMAP_SIZE (components);
5058   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5059   rtx_insn *insn = NULL;
5060
5061   while (regno != last_regno)
5062     {
5063       /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5064          so DFmode for the vector registers is enough.  For simd functions
5065          we want to save the low 128 bits.  */
5066       machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5067
5068       rtx reg = gen_rtx_REG (mode, regno);
5069       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5070       if (!frame_pointer_needed)
5071         offset += cfun->machine->frame.frame_size
5072                   - cfun->machine->frame.hard_fp_offset;
5073       rtx addr = plus_constant (Pmode, ptr_reg, offset);
5074       rtx mem = gen_frame_mem (mode, addr);
5075
5076       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5077       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5078       /* No more registers to handle after REGNO.
5079          Emit a single save/restore and exit.  */
5080       if (regno2 == last_regno)
5081         {
5082           insn = emit_insn (set);
5083           RTX_FRAME_RELATED_P (insn) = 1;
5084           if (prologue_p)
5085             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5086           else
5087             add_reg_note (insn, REG_CFA_RESTORE, reg);
5088           break;
5089         }
5090
5091       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5092       /* The next register is not of the same class or its offset is not
5093          mergeable with the current one into a pair.  */
5094       if (!satisfies_constraint_Ump (mem)
5095           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5096           || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5097           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5098                        GET_MODE_SIZE (mode)))
5099         {
5100           insn = emit_insn (set);
5101           RTX_FRAME_RELATED_P (insn) = 1;
5102           if (prologue_p)
5103             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5104           else
5105             add_reg_note (insn, REG_CFA_RESTORE, reg);
5106
5107           regno = regno2;
5108           continue;
5109         }
5110
5111       /* REGNO2 can be saved/restored in a pair with REGNO.  */
5112       rtx reg2 = gen_rtx_REG (mode, regno2);
5113       if (!frame_pointer_needed)
5114         offset2 += cfun->machine->frame.frame_size
5115                   - cfun->machine->frame.hard_fp_offset;
5116       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5117       rtx mem2 = gen_frame_mem (mode, addr2);
5118       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5119                              : gen_rtx_SET (reg2, mem2);
5120
5121       if (prologue_p)
5122         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5123       else
5124         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5125
5126       RTX_FRAME_RELATED_P (insn) = 1;
5127       if (prologue_p)
5128         {
5129           add_reg_note (insn, REG_CFA_OFFSET, set);
5130           add_reg_note (insn, REG_CFA_OFFSET, set2);
5131         }
5132       else
5133         {
5134           add_reg_note (insn, REG_CFA_RESTORE, reg);
5135           add_reg_note (insn, REG_CFA_RESTORE, reg2);
5136         }
5137
5138       regno = aarch64_get_next_set_bit (components, regno2 + 1);
5139     }
5140 }
5141
5142 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
5143
5144 static void
5145 aarch64_emit_prologue_components (sbitmap components)
5146 {
5147   aarch64_process_components (components, true);
5148 }
5149
5150 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
5151
5152 static void
5153 aarch64_emit_epilogue_components (sbitmap components)
5154 {
5155   aarch64_process_components (components, false);
5156 }
5157
5158 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
5159
5160 static void
5161 aarch64_set_handled_components (sbitmap components)
5162 {
5163   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5164     if (bitmap_bit_p (components, regno))
5165       cfun->machine->reg_is_wrapped_separately[regno] = true;
5166 }
5167
5168 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
5169    determining the probe offset for alloca.  */
5170
5171 static HOST_WIDE_INT
5172 aarch64_stack_clash_protection_alloca_probe_range (void)
5173 {
5174   return STACK_CLASH_CALLER_GUARD;
5175 }
5176
5177
5178 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5179    registers.  If POLY_SIZE is not large enough to require a probe this function
5180    will only adjust the stack.  When allocating the stack space
5181    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5182    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5183    arguments.  If we are then we ensure that any allocation larger than the ABI
5184    defined buffer needs a probe so that the invariant of having a 1KB buffer is
5185    maintained.
5186
5187    We emit barriers after each stack adjustment to prevent optimizations from
5188    breaking the invariant that we never drop the stack more than a page.  This
5189    invariant is needed to make it easier to correctly handle asynchronous
5190    events, e.g. if we were to allow the stack to be dropped by more than a page
5191    and then have multiple probes up and we take a signal somewhere in between
5192    then the signal handler doesn't know the state of the stack and can make no
5193    assumptions about which pages have been probed.  */
5194
5195 static void
5196 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5197                                         poly_int64 poly_size,
5198                                         bool frame_related_p,
5199                                         bool final_adjustment_p)
5200 {
5201   HOST_WIDE_INT guard_size
5202     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5203   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5204   /* When doing the final adjustment for the outgoing argument size we can't
5205      assume that LR was saved at position 0.  So subtract it's offset from the
5206      ABI safe buffer so that we don't accidentally allow an adjustment that
5207      would result in an allocation larger than the ABI buffer without
5208      probing.  */
5209   HOST_WIDE_INT min_probe_threshold
5210     = final_adjustment_p
5211       ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5212       : guard_size - guard_used_by_caller;
5213
5214   poly_int64 frame_size = cfun->machine->frame.frame_size;
5215
5216   /* We should always have a positive probe threshold.  */
5217   gcc_assert (min_probe_threshold > 0);
5218
5219   if (flag_stack_clash_protection && !final_adjustment_p)
5220     {
5221       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5222       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5223
5224       if (known_eq (frame_size, 0))
5225         {
5226           dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5227         }
5228       else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5229                && known_lt (final_adjust, guard_used_by_caller))
5230         {
5231           dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5232         }
5233     }
5234
5235   /* If SIZE is not large enough to require probing, just adjust the stack and
5236      exit.  */
5237   if (known_lt (poly_size, min_probe_threshold)
5238       || !flag_stack_clash_protection)
5239     {
5240       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5241       return;
5242     }
5243
5244   HOST_WIDE_INT size;
5245   /* Handle the SVE non-constant case first.  */
5246   if (!poly_size.is_constant (&size))
5247     {
5248      if (dump_file)
5249       {
5250         fprintf (dump_file, "Stack clash SVE prologue: ");
5251         print_dec (poly_size, dump_file);
5252         fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5253       }
5254
5255       /* First calculate the amount of bytes we're actually spilling.  */
5256       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5257                           poly_size, temp1, temp2, false, true);
5258
5259       rtx_insn *insn = get_last_insn ();
5260
5261       if (frame_related_p)
5262         {
5263           /* This is done to provide unwinding information for the stack
5264              adjustments we're about to do, however to prevent the optimizers
5265              from removing the R15 move and leaving the CFA note (which would be
5266              very wrong) we tie the old and new stack pointer together.
5267              The tie will expand to nothing but the optimizers will not touch
5268              the instruction.  */
5269           rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
5270           emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5271           emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5272
5273           /* We want the CFA independent of the stack pointer for the
5274              duration of the loop.  */
5275           add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5276           RTX_FRAME_RELATED_P (insn) = 1;
5277         }
5278
5279       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5280       rtx guard_const = gen_int_mode (guard_size, Pmode);
5281
5282       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5283                                                    stack_pointer_rtx, temp1,
5284                                                    probe_const, guard_const));
5285
5286       /* Now reset the CFA register if needed.  */
5287       if (frame_related_p)
5288         {
5289           add_reg_note (insn, REG_CFA_DEF_CFA,
5290                         gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5291                                       gen_int_mode (poly_size, Pmode)));
5292           RTX_FRAME_RELATED_P (insn) = 1;
5293         }
5294
5295       return;
5296     }
5297
5298   if (dump_file)
5299     fprintf (dump_file,
5300              "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5301              " bytes, probing will be required.\n", size);
5302
5303   /* Round size to the nearest multiple of guard_size, and calculate the
5304      residual as the difference between the original size and the rounded
5305      size.  */
5306   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5307   HOST_WIDE_INT residual = size - rounded_size;
5308
5309   /* We can handle a small number of allocations/probes inline.  Otherwise
5310      punt to a loop.  */
5311   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5312     {
5313       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5314         {
5315           aarch64_sub_sp (NULL, temp2, guard_size, true);
5316           emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5317                                            guard_used_by_caller));
5318           emit_insn (gen_blockage ());
5319         }
5320       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5321     }
5322   else
5323     {
5324       /* Compute the ending address.  */
5325       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5326                           temp1, NULL, false, true);
5327       rtx_insn *insn = get_last_insn ();
5328
5329       /* For the initial allocation, we don't have a frame pointer
5330          set up, so we always need CFI notes.  If we're doing the
5331          final allocation, then we may have a frame pointer, in which
5332          case it is the CFA, otherwise we need CFI notes.
5333
5334          We can determine which allocation we are doing by looking at
5335          the value of FRAME_RELATED_P since the final allocations are not
5336          frame related.  */
5337       if (frame_related_p)
5338         {
5339           /* We want the CFA independent of the stack pointer for the
5340              duration of the loop.  */
5341           add_reg_note (insn, REG_CFA_DEF_CFA,
5342                         plus_constant (Pmode, temp1, rounded_size));
5343           RTX_FRAME_RELATED_P (insn) = 1;
5344         }
5345
5346       /* This allocates and probes the stack.  Note that this re-uses some of
5347          the existing Ada stack protection code.  However we are guaranteed not
5348          to enter the non loop or residual branches of that code.
5349
5350          The non-loop part won't be entered because if our allocation amount
5351          doesn't require a loop, the case above would handle it.
5352
5353          The residual amount won't be entered because TEMP1 is a mutliple of
5354          the allocation size.  The residual will always be 0.  As such, the only
5355          part we are actually using from that code is the loop setup.  The
5356          actual probing is done in aarch64_output_probe_stack_range.  */
5357       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5358                                                stack_pointer_rtx, temp1));
5359
5360       /* Now reset the CFA register if needed.  */
5361       if (frame_related_p)
5362         {
5363           add_reg_note (insn, REG_CFA_DEF_CFA,
5364                         plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5365           RTX_FRAME_RELATED_P (insn) = 1;
5366         }
5367
5368       emit_insn (gen_blockage ());
5369       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5370     }
5371
5372   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
5373      be probed.  This maintains the requirement that each page is probed at
5374      least once.  For initial probing we probe only if the allocation is
5375      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5376      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
5377      GUARD_SIZE.  This works that for any allocation that is large enough to
5378      trigger a probe here, we'll have at least one, and if they're not large
5379      enough for this code to emit anything for them, The page would have been
5380      probed by the saving of FP/LR either by this function or any callees.  If
5381      we don't have any callees then we won't have more stack adjustments and so
5382      are still safe.  */
5383   if (residual)
5384     {
5385       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5386       /* If we're doing final adjustments, and we've done any full page
5387          allocations then any residual needs to be probed.  */
5388       if (final_adjustment_p && rounded_size != 0)
5389         min_probe_threshold = 0;
5390       /* If doing a small final adjustment, we always probe at offset 0.
5391          This is done to avoid issues when LR is not at position 0 or when
5392          the final adjustment is smaller than the probing offset.  */
5393       else if (final_adjustment_p && rounded_size == 0)
5394         residual_probe_offset = 0;
5395
5396       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5397       if (residual >= min_probe_threshold)
5398         {
5399           if (dump_file)
5400             fprintf (dump_file,
5401                      "Stack clash AArch64 prologue residuals: "
5402                      HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5403                      "\n", residual);
5404
5405             emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5406                                              residual_probe_offset));
5407           emit_insn (gen_blockage ());
5408         }
5409     }
5410 }
5411
5412 /* Return 1 if the register is used by the epilogue.  We need to say the
5413    return register is used, but only after epilogue generation is complete.
5414    Note that in the case of sibcalls, the values "used by the epilogue" are
5415    considered live at the start of the called function.
5416
5417    For SIMD functions we need to return 1 for FP registers that are saved and
5418    restored by a function but are not zero in call_used_regs.  If we do not do
5419    this optimizations may remove the restore of the register.  */
5420
5421 int
5422 aarch64_epilogue_uses (int regno)
5423 {
5424   if (epilogue_completed)
5425     {
5426       if (regno == LR_REGNUM)
5427         return 1;
5428       if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5429         return 1;
5430     }
5431   return 0;
5432 }
5433
5434 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5435    is saved at BASE + OFFSET.  */
5436
5437 static void
5438 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5439                             rtx base, poly_int64 offset)
5440 {
5441   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5442   add_reg_note (insn, REG_CFA_EXPRESSION,
5443                 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5444 }
5445
5446 /* AArch64 stack frames generated by this compiler look like:
5447
5448         +-------------------------------+
5449         |                               |
5450         |  incoming stack arguments     |
5451         |                               |
5452         +-------------------------------+
5453         |                               | <-- incoming stack pointer (aligned)
5454         |  callee-allocated save area   |
5455         |  for register varargs         |
5456         |                               |
5457         +-------------------------------+
5458         |  local variables              | <-- frame_pointer_rtx
5459         |                               |
5460         +-------------------------------+
5461         |  padding                      | \
5462         +-------------------------------+  |
5463         |  callee-saved registers       |  | frame.saved_regs_size
5464         +-------------------------------+  |
5465         |  LR'                          |  |
5466         +-------------------------------+  |
5467         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
5468         +-------------------------------+
5469         |  dynamic allocation           |
5470         +-------------------------------+
5471         |  padding                      |
5472         +-------------------------------+
5473         |  outgoing stack arguments     | <-- arg_pointer
5474         |                               |
5475         +-------------------------------+
5476         |                               | <-- stack_pointer_rtx (aligned)
5477
5478    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5479    but leave frame_pointer_rtx and hard_frame_pointer_rtx
5480    unchanged.
5481
5482    By default for stack-clash we assume the guard is at least 64KB, but this
5483    value is configurable to either 4KB or 64KB.  We also force the guard size to
5484    be the same as the probing interval and both values are kept in sync.
5485
5486    With those assumptions the callee can allocate up to 63KB (or 3KB depending
5487    on the guard size) of stack space without probing.
5488
5489    When probing is needed, we emit a probe at the start of the prologue
5490    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5491
5492    We have to track how much space has been allocated and the only stores
5493    to the stack we track as implicit probes are the FP/LR stores.
5494
5495    For outgoing arguments we probe if the size is larger than 1KB, such that
5496    the ABI specified buffer is maintained for the next callee.  */
5497
5498 /* Generate the prologue instructions for entry into a function.
5499    Establish the stack frame by decreasing the stack pointer with a
5500    properly calculated size and, if necessary, create a frame record
5501    filled with the values of LR and previous frame pointer.  The
5502    current FP is also set up if it is in use.  */
5503
5504 void
5505 aarch64_expand_prologue (void)
5506 {
5507   poly_int64 frame_size = cfun->machine->frame.frame_size;
5508   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5509   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5510   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5511   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5512   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5513   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5514   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5515   rtx_insn *insn;
5516
5517   /* Sign return address for functions.  */
5518   if (aarch64_return_address_signing_enabled ())
5519     {
5520       insn = emit_insn (gen_pacisp ());
5521       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5522       RTX_FRAME_RELATED_P (insn) = 1;
5523     }
5524
5525   if (flag_stack_usage_info)
5526     current_function_static_stack_size = constant_lower_bound (frame_size);
5527
5528   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5529     {
5530       if (crtl->is_leaf && !cfun->calls_alloca)
5531         {
5532           if (maybe_gt (frame_size, PROBE_INTERVAL)
5533               && maybe_gt (frame_size, get_stack_check_protect ()))
5534             aarch64_emit_probe_stack_range (get_stack_check_protect (),
5535                                             (frame_size
5536                                              - get_stack_check_protect ()));
5537         }
5538       else if (maybe_gt (frame_size, 0))
5539         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5540     }
5541
5542   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5543   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5544
5545   /* In theory we should never have both an initial adjustment
5546      and a callee save adjustment.  Verify that is the case since the
5547      code below does not handle it for -fstack-clash-protection.  */
5548   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5549
5550   /* Will only probe if the initial adjustment is larger than the guard
5551      less the amount of the guard reserved for use by the caller's
5552      outgoing args.  */
5553   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5554                                           true, false);
5555
5556   if (callee_adjust != 0)
5557     aarch64_push_regs (reg1, reg2, callee_adjust);
5558
5559   if (emit_frame_chain)
5560     {
5561       poly_int64 reg_offset = callee_adjust;
5562       if (callee_adjust == 0)
5563         {
5564           reg1 = R29_REGNUM;
5565           reg2 = R30_REGNUM;
5566           reg_offset = callee_offset;
5567           aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5568         }
5569       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5570                           stack_pointer_rtx, callee_offset,
5571                           tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5572       if (frame_pointer_needed && !frame_size.is_constant ())
5573         {
5574           /* Variable-sized frames need to describe the save slot
5575              address using DW_CFA_expression rather than DW_CFA_offset.
5576              This means that, without taking further action, the
5577              locations of the registers that we've already saved would
5578              remain based on the stack pointer even after we redefine
5579              the CFA based on the frame pointer.  We therefore need new
5580              DW_CFA_expressions to re-express the save slots with addresses
5581              based on the frame pointer.  */
5582           rtx_insn *insn = get_last_insn ();
5583           gcc_assert (RTX_FRAME_RELATED_P (insn));
5584
5585           /* Add an explicit CFA definition if this was previously
5586              implicit.  */
5587           if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5588             {
5589               rtx src = plus_constant (Pmode, stack_pointer_rtx,
5590                                        callee_offset);
5591               add_reg_note (insn, REG_CFA_ADJUST_CFA,
5592                             gen_rtx_SET (hard_frame_pointer_rtx, src));
5593             }
5594
5595           /* Change the save slot expressions for the registers that
5596              we've already saved.  */
5597           reg_offset -= callee_offset;
5598           aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5599                                       reg_offset + UNITS_PER_WORD);
5600           aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5601                                       reg_offset);
5602         }
5603       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5604     }
5605
5606   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5607                              callee_adjust != 0 || emit_frame_chain);
5608   if (aarch64_simd_decl_p (cfun->decl))
5609     aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5610                                callee_adjust != 0 || emit_frame_chain);
5611   else
5612     aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5613                                callee_adjust != 0 || emit_frame_chain);
5614
5615   /* We may need to probe the final adjustment if it is larger than the guard
5616      that is assumed by the called.  */
5617   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5618                                           !frame_pointer_needed, true);
5619 }
5620
5621 /* Return TRUE if we can use a simple_return insn.
5622
5623    This function checks whether the callee saved stack is empty, which
5624    means no restore actions are need. The pro_and_epilogue will use
5625    this to check whether shrink-wrapping opt is feasible.  */
5626
5627 bool
5628 aarch64_use_return_insn_p (void)
5629 {
5630   if (!reload_completed)
5631     return false;
5632
5633   if (crtl->profile)
5634     return false;
5635
5636   return known_eq (cfun->machine->frame.frame_size, 0);
5637 }
5638
5639 /* Return false for non-leaf SIMD functions in order to avoid
5640    shrink-wrapping them.  Doing this will lose the necessary
5641    save/restore of FP registers.  */
5642
5643 bool
5644 aarch64_use_simple_return_insn_p (void)
5645 {
5646   if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5647     return false;
5648
5649   return true;
5650 }
5651
5652 /* Generate the epilogue instructions for returning from a function.
5653    This is almost exactly the reverse of the prolog sequence, except
5654    that we need to insert barriers to avoid scheduling loads that read
5655    from a deallocated stack, and we optimize the unwind records by
5656    emitting them all together if possible.  */
5657 void
5658 aarch64_expand_epilogue (bool for_sibcall)
5659 {
5660   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5661   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5662   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5663   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5664   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5665   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5666   rtx cfi_ops = NULL;
5667   rtx_insn *insn;
5668   /* A stack clash protection prologue may not have left EP0_REGNUM or
5669      EP1_REGNUM in a usable state.  The same is true for allocations
5670      with an SVE component, since we then need both temporary registers
5671      for each allocation.  For stack clash we are in a usable state if
5672      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
5673   HOST_WIDE_INT guard_size
5674     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5675   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5676
5677   /* We can re-use the registers when the allocation amount is smaller than
5678      guard_size - guard_used_by_caller because we won't be doing any probes
5679      then.  In such situations the register should remain live with the correct
5680      value.  */
5681   bool can_inherit_p = (initial_adjust.is_constant ()
5682                         && final_adjust.is_constant ())
5683                         && (!flag_stack_clash_protection
5684                             || known_lt (initial_adjust,
5685                                          guard_size - guard_used_by_caller));
5686
5687   /* We need to add memory barrier to prevent read from deallocated stack.  */
5688   bool need_barrier_p
5689     = maybe_ne (get_frame_size ()
5690                 + cfun->machine->frame.saved_varargs_size, 0);
5691
5692   /* Emit a barrier to prevent loads from a deallocated stack.  */
5693   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5694       || cfun->calls_alloca
5695       || crtl->calls_eh_return)
5696     {
5697       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5698       need_barrier_p = false;
5699     }
5700
5701   /* Restore the stack pointer from the frame pointer if it may not
5702      be the same as the stack pointer.  */
5703   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5704   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5705   if (frame_pointer_needed
5706       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5707     /* If writeback is used when restoring callee-saves, the CFA
5708        is restored on the instruction doing the writeback.  */
5709     aarch64_add_offset (Pmode, stack_pointer_rtx,
5710                         hard_frame_pointer_rtx, -callee_offset,
5711                         tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5712   else
5713      /* The case where we need to re-use the register here is very rare, so
5714         avoid the complicated condition and just always emit a move if the
5715         immediate doesn't fit.  */
5716      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5717
5718   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5719                                 callee_adjust != 0, &cfi_ops);
5720   if (aarch64_simd_decl_p (cfun->decl))
5721     aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5722                                   callee_adjust != 0, &cfi_ops);
5723   else
5724     aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5725                                   callee_adjust != 0, &cfi_ops);
5726
5727   if (need_barrier_p)
5728     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5729
5730   if (callee_adjust != 0)
5731     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5732
5733   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5734     {
5735       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
5736       insn = get_last_insn ();
5737       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5738       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5739       RTX_FRAME_RELATED_P (insn) = 1;
5740       cfi_ops = NULL;
5741     }
5742
5743   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5744      add restriction on emit_move optimization to leaf functions.  */
5745   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5746                   (!can_inherit_p || !crtl->is_leaf
5747                    || df_regs_ever_live_p (EP0_REGNUM)));
5748
5749   if (cfi_ops)
5750     {
5751       /* Emit delayed restores and reset the CFA to be SP.  */
5752       insn = get_last_insn ();
5753       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5754       REG_NOTES (insn) = cfi_ops;
5755       RTX_FRAME_RELATED_P (insn) = 1;
5756     }
5757
5758   /* We prefer to emit the combined return/authenticate instruction RETAA,
5759      however there are three cases in which we must instead emit an explicit
5760      authentication instruction.
5761
5762         1) Sibcalls don't return in a normal way, so if we're about to call one
5763            we must authenticate.
5764
5765         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5766            generating code for !TARGET_ARMV8_3 we can't use it and must
5767            explicitly authenticate.
5768
5769         3) On an eh_return path we make extra stack adjustments to update the
5770            canonical frame address to be the exception handler's CFA.  We want
5771            to authenticate using the CFA of the function which calls eh_return.
5772     */
5773   if (aarch64_return_address_signing_enabled ()
5774       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5775     {
5776       insn = emit_insn (gen_autisp ());
5777       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5778       RTX_FRAME_RELATED_P (insn) = 1;
5779     }
5780
5781   /* Stack adjustment for exception handler.  */
5782   if (crtl->calls_eh_return)
5783     {
5784       /* We need to unwind the stack by the offset computed by
5785          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5786          to be SP; letting the CFA move during this adjustment
5787          is just as correct as retaining the CFA from the body
5788          of the function.  Therefore, do nothing special.  */
5789       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5790     }
5791
5792   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5793   if (!for_sibcall)
5794     emit_jump_insn (ret_rtx);
5795 }
5796
5797 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5798    normally or return to a previous frame after unwinding.
5799
5800    An EH return uses a single shared return sequence.  The epilogue is
5801    exactly like a normal epilogue except that it has an extra input
5802    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5803    that must be applied after the frame has been destroyed.  An extra label
5804    is inserted before the epilogue which initializes this register to zero,
5805    and this is the entry point for a normal return.
5806
5807    An actual EH return updates the return address, initializes the stack
5808    adjustment and jumps directly into the epilogue (bypassing the zeroing
5809    of the adjustment).  Since the return address is typically saved on the
5810    stack when a function makes a call, the saved LR must be updated outside
5811    the epilogue.
5812
5813    This poses problems as the store is generated well before the epilogue,
5814    so the offset of LR is not known yet.  Also optimizations will remove the
5815    store as it appears dead, even after the epilogue is generated (as the
5816    base or offset for loading LR is different in many cases).
5817
5818    To avoid these problems this implementation forces the frame pointer
5819    in eh_return functions so that the location of LR is fixed and known early.
5820    It also marks the store volatile, so no optimization is permitted to
5821    remove the store.  */
5822 rtx
5823 aarch64_eh_return_handler_rtx (void)
5824 {
5825   rtx tmp = gen_frame_mem (Pmode,
5826     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5827
5828   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5829   MEM_VOLATILE_P (tmp) = true;
5830   return tmp;
5831 }
5832
5833 /* Output code to add DELTA to the first argument, and then jump
5834    to FUNCTION.  Used for C++ multiple inheritance.  */
5835 static void
5836 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5837                          HOST_WIDE_INT delta,
5838                          HOST_WIDE_INT vcall_offset,
5839                          tree function)
5840 {
5841   /* The this pointer is always in x0.  Note that this differs from
5842      Arm where the this pointer maybe bumped to r1 if r0 is required
5843      to return a pointer to an aggregate.  On AArch64 a result value
5844      pointer will be in x8.  */
5845   int this_regno = R0_REGNUM;
5846   rtx this_rtx, temp0, temp1, addr, funexp;
5847   rtx_insn *insn;
5848
5849   reload_completed = 1;
5850   emit_note (NOTE_INSN_PROLOGUE_END);
5851
5852   this_rtx = gen_rtx_REG (Pmode, this_regno);
5853   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5854   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5855
5856   if (vcall_offset == 0)
5857     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5858   else
5859     {
5860       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5861
5862       addr = this_rtx;
5863       if (delta != 0)
5864         {
5865           if (delta >= -256 && delta < 256)
5866             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5867                                        plus_constant (Pmode, this_rtx, delta));
5868           else
5869             aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5870                                 temp1, temp0, false);
5871         }
5872
5873       if (Pmode == ptr_mode)
5874         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5875       else
5876         aarch64_emit_move (temp0,
5877                            gen_rtx_ZERO_EXTEND (Pmode,
5878                                                 gen_rtx_MEM (ptr_mode, addr)));
5879
5880       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5881           addr = plus_constant (Pmode, temp0, vcall_offset);
5882       else
5883         {
5884           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5885                                           Pmode);
5886           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5887         }
5888
5889       if (Pmode == ptr_mode)
5890         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5891       else
5892         aarch64_emit_move (temp1,
5893                            gen_rtx_SIGN_EXTEND (Pmode,
5894                                                 gen_rtx_MEM (ptr_mode, addr)));
5895
5896       emit_insn (gen_add2_insn (this_rtx, temp1));
5897     }
5898
5899   /* Generate a tail call to the target function.  */
5900   if (!TREE_USED (function))
5901     {
5902       assemble_external (function);
5903       TREE_USED (function) = 1;
5904     }
5905   funexp = XEXP (DECL_RTL (function), 0);
5906   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5907   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5908   SIBLING_CALL_P (insn) = 1;
5909
5910   insn = get_insns ();
5911   shorten_branches (insn);
5912   final_start_function (insn, file, 1);
5913   final (insn, file, 1);
5914   final_end_function ();
5915
5916   /* Stop pretending to be a post-reload pass.  */
5917   reload_completed = 0;
5918 }
5919
5920 static bool
5921 aarch64_tls_referenced_p (rtx x)
5922 {
5923   if (!TARGET_HAVE_TLS)
5924     return false;
5925   subrtx_iterator::array_type array;
5926   FOR_EACH_SUBRTX (iter, array, x, ALL)
5927     {
5928       const_rtx x = *iter;
5929       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5930         return true;
5931       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5932          TLS offsets, not real symbol references.  */
5933       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5934         iter.skip_subrtxes ();
5935     }
5936   return false;
5937 }
5938
5939
5940 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5941    a left shift of 0 or 12 bits.  */
5942 bool
5943 aarch64_uimm12_shift (HOST_WIDE_INT val)
5944 {
5945   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5946           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5947           );
5948 }
5949
5950 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5951    that can be created with a left shift of 0 or 12.  */
5952 static HOST_WIDE_INT
5953 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
5954 {
5955   /* Check to see if the value fits in 24 bits, as that is the maximum we can
5956      handle correctly.  */
5957   gcc_assert ((val & 0xffffff) == val);
5958
5959   if (((val & 0xfff) << 0) == val)
5960     return val;
5961
5962   return val & (0xfff << 12);
5963 }
5964
5965 /* Return true if val is an immediate that can be loaded into a
5966    register by a MOVZ instruction.  */
5967 static bool
5968 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5969 {
5970   if (GET_MODE_SIZE (mode) > 4)
5971     {
5972       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5973           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5974         return 1;
5975     }
5976   else
5977     {
5978       /* Ignore sign extension.  */
5979       val &= (HOST_WIDE_INT) 0xffffffff;
5980     }
5981   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5982           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5983 }
5984
5985 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
5986    64-bit (DImode) integer.  */
5987
5988 static unsigned HOST_WIDE_INT
5989 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5990 {
5991   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5992   while (size < 64)
5993     {
5994       val &= (HOST_WIDE_INT_1U << size) - 1;
5995       val |= val << size;
5996       size *= 2;
5997     }
5998   return val;
5999 }
6000
6001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
6002
6003 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6004   {
6005     0x0000000100000001ull,
6006     0x0001000100010001ull,
6007     0x0101010101010101ull,
6008     0x1111111111111111ull,
6009     0x5555555555555555ull,
6010   };
6011
6012
6013 /* Return true if val is a valid bitmask immediate.  */
6014
6015 bool
6016 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6017 {
6018   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6019   int bits;
6020
6021   /* Check for a single sequence of one bits and return quickly if so.
6022      The special cases of all ones and all zeroes returns false.  */
6023   val = aarch64_replicate_bitmask_imm (val_in, mode);
6024   tmp = val + (val & -val);
6025
6026   if (tmp == (tmp & -tmp))
6027     return (val + 1) > 1;
6028
6029   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
6030   if (mode == SImode)
6031     val = (val << 32) | (val & 0xffffffff);
6032
6033   /* Invert if the immediate doesn't start with a zero bit - this means we
6034      only need to search for sequences of one bits.  */
6035   if (val & 1)
6036     val = ~val;
6037
6038   /* Find the first set bit and set tmp to val with the first sequence of one
6039      bits removed.  Return success if there is a single sequence of ones.  */
6040   first_one = val & -val;
6041   tmp = val & (val + first_one);
6042
6043   if (tmp == 0)
6044     return true;
6045
6046   /* Find the next set bit and compute the difference in bit position.  */
6047   next_one = tmp & -tmp;
6048   bits = clz_hwi (first_one) - clz_hwi (next_one);
6049   mask = val ^ tmp;
6050
6051   /* Check the bit position difference is a power of 2, and that the first
6052      sequence of one bits fits within 'bits' bits.  */
6053   if ((mask >> bits) != 0 || bits != (bits & -bits))
6054     return false;
6055
6056   /* Check the sequence of one bits is repeated 64/bits times.  */
6057   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6058 }
6059
6060 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6061    Assumed precondition: VAL_IN Is not zero.  */
6062
6063 unsigned HOST_WIDE_INT
6064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6065 {
6066   int lowest_bit_set = ctz_hwi (val_in);
6067   int highest_bit_set = floor_log2 (val_in);
6068   gcc_assert (val_in != 0);
6069
6070   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6071           (HOST_WIDE_INT_1U << lowest_bit_set));
6072 }
6073
6074 /* Create constant where bits outside of lowest bit set to highest bit set
6075    are set to 1.  */
6076
6077 unsigned HOST_WIDE_INT
6078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6079 {
6080   return val_in | ~aarch64_and_split_imm1 (val_in);
6081 }
6082
6083 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
6084
6085 bool
6086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6087 {
6088   scalar_int_mode int_mode;
6089   if (!is_a <scalar_int_mode> (mode, &int_mode))
6090     return false;
6091
6092   if (aarch64_bitmask_imm (val_in, int_mode))
6093     return false;
6094
6095   if (aarch64_move_imm (val_in, int_mode))
6096     return false;
6097
6098   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6099
6100   return aarch64_bitmask_imm (imm2, int_mode);
6101 }
6102
6103 /* Return true if val is an immediate that can be loaded into a
6104    register in a single instruction.  */
6105 bool
6106 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6107 {
6108   scalar_int_mode int_mode;
6109   if (!is_a <scalar_int_mode> (mode, &int_mode))
6110     return false;
6111
6112   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6113     return 1;
6114   return aarch64_bitmask_imm (val, int_mode);
6115 }
6116
6117 static bool
6118 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6119 {
6120   rtx base, offset;
6121
6122   if (GET_CODE (x) == HIGH)
6123     return true;
6124
6125   /* There's no way to calculate VL-based values using relocations.  */
6126   subrtx_iterator::array_type array;
6127   FOR_EACH_SUBRTX (iter, array, x, ALL)
6128     if (GET_CODE (*iter) == CONST_POLY_INT)
6129       return true;
6130
6131   split_const (x, &base, &offset);
6132   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6133     {
6134       if (aarch64_classify_symbol (base, INTVAL (offset))
6135           != SYMBOL_FORCE_TO_MEM)
6136         return true;
6137       else
6138         /* Avoid generating a 64-bit relocation in ILP32; leave
6139            to aarch64_expand_mov_immediate to handle it properly.  */
6140         return mode != ptr_mode;
6141     }
6142
6143   return aarch64_tls_referenced_p (x);
6144 }
6145
6146 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6147    The expansion for a table switch is quite expensive due to the number
6148    of instructions, the table lookup and hard to predict indirect jump.
6149    When optimizing for speed, and -O3 enabled, use the per-core tuning if
6150    set, otherwise use tables for > 16 cases as a tradeoff between size and
6151    performance.  When optimizing for size, use the default setting.  */
6152
6153 static unsigned int
6154 aarch64_case_values_threshold (void)
6155 {
6156   /* Use the specified limit for the number of cases before using jump
6157      tables at higher optimization levels.  */
6158   if (optimize > 2
6159       && selected_cpu->tune->max_case_values != 0)
6160     return selected_cpu->tune->max_case_values;
6161   else
6162     return optimize_size ? default_case_values_threshold () : 17;
6163 }
6164
6165 /* Return true if register REGNO is a valid index register.
6166    STRICT_P is true if REG_OK_STRICT is in effect.  */
6167
6168 bool
6169 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6170 {
6171   if (!HARD_REGISTER_NUM_P (regno))
6172     {
6173       if (!strict_p)
6174         return true;
6175
6176       if (!reg_renumber)
6177         return false;
6178
6179       regno = reg_renumber[regno];
6180     }
6181   return GP_REGNUM_P (regno);
6182 }
6183
6184 /* Return true if register REGNO is a valid base register for mode MODE.
6185    STRICT_P is true if REG_OK_STRICT is in effect.  */
6186
6187 bool
6188 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6189 {
6190   if (!HARD_REGISTER_NUM_P (regno))
6191     {
6192       if (!strict_p)
6193         return true;
6194
6195       if (!reg_renumber)
6196         return false;
6197
6198       regno = reg_renumber[regno];
6199     }
6200
6201   /* The fake registers will be eliminated to either the stack or
6202      hard frame pointer, both of which are usually valid base registers.
6203      Reload deals with the cases where the eliminated form isn't valid.  */
6204   return (GP_REGNUM_P (regno)
6205           || regno == SP_REGNUM
6206           || regno == FRAME_POINTER_REGNUM
6207           || regno == ARG_POINTER_REGNUM);
6208 }
6209
6210 /* Return true if X is a valid base register for mode MODE.
6211    STRICT_P is true if REG_OK_STRICT is in effect.  */
6212
6213 static bool
6214 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6215 {
6216   if (!strict_p
6217       && GET_CODE (x) == SUBREG
6218       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6219     x = SUBREG_REG (x);
6220
6221   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6222 }
6223
6224 /* Return true if address offset is a valid index.  If it is, fill in INFO
6225    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6226
6227 static bool
6228 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6229                         machine_mode mode, bool strict_p)
6230 {
6231   enum aarch64_address_type type;
6232   rtx index;
6233   int shift;
6234
6235   /* (reg:P) */
6236   if ((REG_P (x) || GET_CODE (x) == SUBREG)
6237       && GET_MODE (x) == Pmode)
6238     {
6239       type = ADDRESS_REG_REG;
6240       index = x;
6241       shift = 0;
6242     }
6243   /* (sign_extend:DI (reg:SI)) */
6244   else if ((GET_CODE (x) == SIGN_EXTEND
6245             || GET_CODE (x) == ZERO_EXTEND)
6246            && GET_MODE (x) == DImode
6247            && GET_MODE (XEXP (x, 0)) == SImode)
6248     {
6249       type = (GET_CODE (x) == SIGN_EXTEND)
6250         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6251       index = XEXP (x, 0);
6252       shift = 0;
6253     }
6254   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6255   else if (GET_CODE (x) == MULT
6256            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6257                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6258            && GET_MODE (XEXP (x, 0)) == DImode
6259            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6260            && CONST_INT_P (XEXP (x, 1)))
6261     {
6262       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6263         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6264       index = XEXP (XEXP (x, 0), 0);
6265       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6266     }
6267   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6268   else if (GET_CODE (x) == ASHIFT
6269            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6270                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6271            && GET_MODE (XEXP (x, 0)) == DImode
6272            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6273            && CONST_INT_P (XEXP (x, 1)))
6274     {
6275       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6276         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6277       index = XEXP (XEXP (x, 0), 0);
6278       shift = INTVAL (XEXP (x, 1));
6279     }
6280   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6281   else if ((GET_CODE (x) == SIGN_EXTRACT
6282             || GET_CODE (x) == ZERO_EXTRACT)
6283            && GET_MODE (x) == DImode
6284            && GET_CODE (XEXP (x, 0)) == MULT
6285            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6286            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6287     {
6288       type = (GET_CODE (x) == SIGN_EXTRACT)
6289         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6290       index = XEXP (XEXP (x, 0), 0);
6291       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6292       if (INTVAL (XEXP (x, 1)) != 32 + shift
6293           || INTVAL (XEXP (x, 2)) != 0)
6294         shift = -1;
6295     }
6296   /* (and:DI (mult:DI (reg:DI) (const_int scale))
6297      (const_int 0xffffffff<<shift)) */
6298   else if (GET_CODE (x) == AND
6299            && GET_MODE (x) == DImode
6300            && GET_CODE (XEXP (x, 0)) == MULT
6301            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6302            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6303            && CONST_INT_P (XEXP (x, 1)))
6304     {
6305       type = ADDRESS_REG_UXTW;
6306       index = XEXP (XEXP (x, 0), 0);
6307       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6308       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6309         shift = -1;
6310     }
6311   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6312   else if ((GET_CODE (x) == SIGN_EXTRACT
6313             || GET_CODE (x) == ZERO_EXTRACT)
6314            && GET_MODE (x) == DImode
6315            && GET_CODE (XEXP (x, 0)) == ASHIFT
6316            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6317            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6318     {
6319       type = (GET_CODE (x) == SIGN_EXTRACT)
6320         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6321       index = XEXP (XEXP (x, 0), 0);
6322       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6323       if (INTVAL (XEXP (x, 1)) != 32 + shift
6324           || INTVAL (XEXP (x, 2)) != 0)
6325         shift = -1;
6326     }
6327   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6328      (const_int 0xffffffff<<shift)) */
6329   else if (GET_CODE (x) == AND
6330            && GET_MODE (x) == DImode
6331            && GET_CODE (XEXP (x, 0)) == ASHIFT
6332            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6333            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6334            && CONST_INT_P (XEXP (x, 1)))
6335     {
6336       type = ADDRESS_REG_UXTW;
6337       index = XEXP (XEXP (x, 0), 0);
6338       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6339       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6340         shift = -1;
6341     }
6342   /* (mult:P (reg:P) (const_int scale)) */
6343   else if (GET_CODE (x) == MULT
6344            && GET_MODE (x) == Pmode
6345            && GET_MODE (XEXP (x, 0)) == Pmode
6346            && CONST_INT_P (XEXP (x, 1)))
6347     {
6348       type = ADDRESS_REG_REG;
6349       index = XEXP (x, 0);
6350       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6351     }
6352   /* (ashift:P (reg:P) (const_int shift)) */
6353   else if (GET_CODE (x) == ASHIFT
6354            && GET_MODE (x) == Pmode
6355            && GET_MODE (XEXP (x, 0)) == Pmode
6356            && CONST_INT_P (XEXP (x, 1)))
6357     {
6358       type = ADDRESS_REG_REG;
6359       index = XEXP (x, 0);
6360       shift = INTVAL (XEXP (x, 1));
6361     }
6362   else
6363     return false;
6364
6365   if (!strict_p
6366       && GET_CODE (index) == SUBREG
6367       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6368     index = SUBREG_REG (index);
6369
6370   if (aarch64_sve_data_mode_p (mode))
6371     {
6372       if (type != ADDRESS_REG_REG
6373           || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6374         return false;
6375     }
6376   else
6377     {
6378       if (shift != 0
6379           && !(IN_RANGE (shift, 1, 3)
6380                && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6381         return false;
6382     }
6383
6384   if (REG_P (index)
6385       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6386     {
6387       info->type = type;
6388       info->offset = index;
6389       info->shift = shift;
6390       return true;
6391     }
6392
6393   return false;
6394 }
6395
6396 /* Return true if MODE is one of the modes for which we
6397    support LDP/STP operations.  */
6398
6399 static bool
6400 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6401 {
6402   return mode == SImode || mode == DImode
6403          || mode == SFmode || mode == DFmode
6404          || (aarch64_vector_mode_supported_p (mode)
6405              && (known_eq (GET_MODE_SIZE (mode), 8)
6406                  || (known_eq (GET_MODE_SIZE (mode), 16)
6407                     && (aarch64_tune_params.extra_tuning_flags
6408                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6409 }
6410
6411 /* Return true if REGNO is a virtual pointer register, or an eliminable
6412    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
6413    include stack_pointer or hard_frame_pointer.  */
6414 static bool
6415 virt_or_elim_regno_p (unsigned regno)
6416 {
6417   return ((regno >= FIRST_VIRTUAL_REGISTER
6418            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6419           || regno == FRAME_POINTER_REGNUM
6420           || regno == ARG_POINTER_REGNUM);
6421 }
6422
6423 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6424    If it is, fill in INFO appropriately.  STRICT_P is true if
6425    REG_OK_STRICT is in effect.  */
6426
6427 bool
6428 aarch64_classify_address (struct aarch64_address_info *info,
6429                           rtx x, machine_mode mode, bool strict_p,
6430                           aarch64_addr_query_type type)
6431 {
6432   enum rtx_code code = GET_CODE (x);
6433   rtx op0, op1;
6434   poly_int64 offset;
6435
6436   HOST_WIDE_INT const_size;
6437
6438   /* On BE, we use load/store pair for all large int mode load/stores.
6439      TI/TFmode may also use a load/store pair.  */
6440   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6441   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6442   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6443                             || type == ADDR_QUERY_LDP_STP_N
6444                             || mode == TImode
6445                             || mode == TFmode
6446                             || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6447
6448   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6449      corresponds to the actual size of the memory being loaded/stored and the
6450      mode of the corresponding addressing mode is half of that.  */
6451   if (type == ADDR_QUERY_LDP_STP_N
6452       && known_eq (GET_MODE_SIZE (mode), 16))
6453     mode = DFmode;
6454
6455   bool allow_reg_index_p = (!load_store_pair_p
6456                             && (known_lt (GET_MODE_SIZE (mode), 16)
6457                                 || vec_flags == VEC_ADVSIMD
6458                                 || vec_flags == VEC_SVE_DATA));
6459
6460   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6461      [Rn, #offset, MUL VL].  */
6462   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6463       && (code != REG && code != PLUS))
6464     return false;
6465
6466   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6467      REG addressing.  */
6468   if (advsimd_struct_p
6469       && !BYTES_BIG_ENDIAN
6470       && (code != POST_INC && code != REG))
6471     return false;
6472
6473   gcc_checking_assert (GET_MODE (x) == VOIDmode
6474                        || SCALAR_INT_MODE_P (GET_MODE (x)));
6475
6476   switch (code)
6477     {
6478     case REG:
6479     case SUBREG:
6480       info->type = ADDRESS_REG_IMM;
6481       info->base = x;
6482       info->offset = const0_rtx;
6483       info->const_offset = 0;
6484       return aarch64_base_register_rtx_p (x, strict_p);
6485
6486     case PLUS:
6487       op0 = XEXP (x, 0);
6488       op1 = XEXP (x, 1);
6489
6490       if (! strict_p
6491           && REG_P (op0)
6492           && virt_or_elim_regno_p (REGNO (op0))
6493           && poly_int_rtx_p (op1, &offset))
6494         {
6495           info->type = ADDRESS_REG_IMM;
6496           info->base = op0;
6497           info->offset = op1;
6498           info->const_offset = offset;
6499
6500           return true;
6501         }
6502
6503       if (maybe_ne (GET_MODE_SIZE (mode), 0)
6504           && aarch64_base_register_rtx_p (op0, strict_p)
6505           && poly_int_rtx_p (op1, &offset))
6506         {
6507           info->type = ADDRESS_REG_IMM;
6508           info->base = op0;
6509           info->offset = op1;
6510           info->const_offset = offset;
6511
6512           /* TImode and TFmode values are allowed in both pairs of X
6513              registers and individual Q registers.  The available
6514              address modes are:
6515              X,X: 7-bit signed scaled offset
6516              Q:   9-bit signed offset
6517              We conservatively require an offset representable in either mode.
6518              When performing the check for pairs of X registers i.e.  LDP/STP
6519              pass down DImode since that is the natural size of the LDP/STP
6520              instruction memory accesses.  */
6521           if (mode == TImode || mode == TFmode)
6522             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6523                     && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6524                         || offset_12bit_unsigned_scaled_p (mode, offset)));
6525
6526           /* A 7bit offset check because OImode will emit a ldp/stp
6527              instruction (only big endian will get here).
6528              For ldp/stp instructions, the offset is scaled for the size of a
6529              single element of the pair.  */
6530           if (mode == OImode)
6531             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6532
6533           /* Three 9/12 bit offsets checks because CImode will emit three
6534              ldr/str instructions (only big endian will get here).  */
6535           if (mode == CImode)
6536             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6537                     && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6538                                                                offset + 32)
6539                         || offset_12bit_unsigned_scaled_p (V16QImode,
6540                                                            offset + 32)));
6541
6542           /* Two 7bit offsets checks because XImode will emit two ldp/stp
6543              instructions (only big endian will get here).  */
6544           if (mode == XImode)
6545             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6546                     && aarch64_offset_7bit_signed_scaled_p (TImode,
6547                                                             offset + 32));
6548
6549           /* Make "m" use the LD1 offset range for SVE data modes, so
6550              that pre-RTL optimizers like ivopts will work to that
6551              instead of the wider LDR/STR range.  */
6552           if (vec_flags == VEC_SVE_DATA)
6553             return (type == ADDR_QUERY_M
6554                     ? offset_4bit_signed_scaled_p (mode, offset)
6555                     : offset_9bit_signed_scaled_p (mode, offset));
6556
6557           if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6558             {
6559               poly_int64 end_offset = (offset
6560                                        + GET_MODE_SIZE (mode)
6561                                        - BYTES_PER_SVE_VECTOR);
6562               return (type == ADDR_QUERY_M
6563                       ? offset_4bit_signed_scaled_p (mode, offset)
6564                       : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6565                          && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6566                                                          end_offset)));
6567             }
6568
6569           if (vec_flags == VEC_SVE_PRED)
6570             return offset_9bit_signed_scaled_p (mode, offset);
6571
6572           if (load_store_pair_p)
6573             return ((known_eq (GET_MODE_SIZE (mode), 4)
6574                      || known_eq (GET_MODE_SIZE (mode), 8)
6575                      || known_eq (GET_MODE_SIZE (mode), 16))
6576                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6577           else
6578             return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6579                     || offset_12bit_unsigned_scaled_p (mode, offset));
6580         }
6581
6582       if (allow_reg_index_p)
6583         {
6584           /* Look for base + (scaled/extended) index register.  */
6585           if (aarch64_base_register_rtx_p (op0, strict_p)
6586               && aarch64_classify_index (info, op1, mode, strict_p))
6587             {
6588               info->base = op0;
6589               return true;
6590             }
6591           if (aarch64_base_register_rtx_p (op1, strict_p)
6592               && aarch64_classify_index (info, op0, mode, strict_p))
6593             {
6594               info->base = op1;
6595               return true;
6596             }
6597         }
6598
6599       return false;
6600
6601     case POST_INC:
6602     case POST_DEC:
6603     case PRE_INC:
6604     case PRE_DEC:
6605       info->type = ADDRESS_REG_WB;
6606       info->base = XEXP (x, 0);
6607       info->offset = NULL_RTX;
6608       return aarch64_base_register_rtx_p (info->base, strict_p);
6609
6610     case POST_MODIFY:
6611     case PRE_MODIFY:
6612       info->type = ADDRESS_REG_WB;
6613       info->base = XEXP (x, 0);
6614       if (GET_CODE (XEXP (x, 1)) == PLUS
6615           && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6616           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6617           && aarch64_base_register_rtx_p (info->base, strict_p))
6618         {
6619           info->offset = XEXP (XEXP (x, 1), 1);
6620           info->const_offset = offset;
6621
6622           /* TImode and TFmode values are allowed in both pairs of X
6623              registers and individual Q registers.  The available
6624              address modes are:
6625              X,X: 7-bit signed scaled offset
6626              Q:   9-bit signed offset
6627              We conservatively require an offset representable in either mode.
6628            */
6629           if (mode == TImode || mode == TFmode)
6630             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6631                     && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6632
6633           if (load_store_pair_p)
6634             return ((known_eq (GET_MODE_SIZE (mode), 4)
6635                      || known_eq (GET_MODE_SIZE (mode), 8)
6636                      || known_eq (GET_MODE_SIZE (mode), 16))
6637                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6638           else
6639             return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6640         }
6641       return false;
6642
6643     case CONST:
6644     case SYMBOL_REF:
6645     case LABEL_REF:
6646       /* load literal: pc-relative constant pool entry.  Only supported
6647          for SI mode or larger.  */
6648       info->type = ADDRESS_SYMBOLIC;
6649
6650       if (!load_store_pair_p
6651           && GET_MODE_SIZE (mode).is_constant (&const_size)
6652           && const_size >= 4)
6653         {
6654           rtx sym, addend;
6655
6656           split_const (x, &sym, &addend);
6657           return ((GET_CODE (sym) == LABEL_REF
6658                    || (GET_CODE (sym) == SYMBOL_REF
6659                        && CONSTANT_POOL_ADDRESS_P (sym)
6660                        && aarch64_pcrelative_literal_loads)));
6661         }
6662       return false;
6663
6664     case LO_SUM:
6665       info->type = ADDRESS_LO_SUM;
6666       info->base = XEXP (x, 0);
6667       info->offset = XEXP (x, 1);
6668       if (allow_reg_index_p
6669           && aarch64_base_register_rtx_p (info->base, strict_p))
6670         {
6671           rtx sym, offs;
6672           split_const (info->offset, &sym, &offs);
6673           if (GET_CODE (sym) == SYMBOL_REF
6674               && (aarch64_classify_symbol (sym, INTVAL (offs))
6675                   == SYMBOL_SMALL_ABSOLUTE))
6676             {
6677               /* The symbol and offset must be aligned to the access size.  */
6678               unsigned int align;
6679
6680               if (CONSTANT_POOL_ADDRESS_P (sym))
6681                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6682               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6683                 {
6684                   tree exp = SYMBOL_REF_DECL (sym);
6685                   align = TYPE_ALIGN (TREE_TYPE (exp));
6686                   align = aarch64_constant_alignment (exp, align);
6687                 }
6688               else if (SYMBOL_REF_DECL (sym))
6689                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6690               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6691                        && SYMBOL_REF_BLOCK (sym) != NULL)
6692                 align = SYMBOL_REF_BLOCK (sym)->alignment;
6693               else
6694                 align = BITS_PER_UNIT;
6695
6696               poly_int64 ref_size = GET_MODE_SIZE (mode);
6697               if (known_eq (ref_size, 0))
6698                 ref_size = GET_MODE_SIZE (DImode);
6699
6700               return (multiple_p (INTVAL (offs), ref_size)
6701                       && multiple_p (align / BITS_PER_UNIT, ref_size));
6702             }
6703         }
6704       return false;
6705
6706     default:
6707       return false;
6708     }
6709 }
6710
6711 /* Return true if the address X is valid for a PRFM instruction.
6712    STRICT_P is true if we should do strict checking with
6713    aarch64_classify_address.  */
6714
6715 bool
6716 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6717 {
6718   struct aarch64_address_info addr;
6719
6720   /* PRFM accepts the same addresses as DImode...  */
6721   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6722   if (!res)
6723     return false;
6724
6725   /* ... except writeback forms.  */
6726   return addr.type != ADDRESS_REG_WB;
6727 }
6728
6729 bool
6730 aarch64_symbolic_address_p (rtx x)
6731 {
6732   rtx offset;
6733
6734   split_const (x, &x, &offset);
6735   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6736 }
6737
6738 /* Classify the base of symbolic expression X.  */
6739
6740 enum aarch64_symbol_type
6741 aarch64_classify_symbolic_expression (rtx x)
6742 {
6743   rtx offset;
6744
6745   split_const (x, &x, &offset);
6746   return aarch64_classify_symbol (x, INTVAL (offset));
6747 }
6748
6749
6750 /* Return TRUE if X is a legitimate address for accessing memory in
6751    mode MODE.  */
6752 static bool
6753 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6754 {
6755   struct aarch64_address_info addr;
6756
6757   return aarch64_classify_address (&addr, x, mode, strict_p);
6758 }
6759
6760 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6761    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6762 bool
6763 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6764                               aarch64_addr_query_type type)
6765 {
6766   struct aarch64_address_info addr;
6767
6768   return aarch64_classify_address (&addr, x, mode, strict_p, type);
6769 }
6770
6771 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
6772
6773 static bool
6774 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6775                                          poly_int64 orig_offset,
6776                                          machine_mode mode)
6777 {
6778   HOST_WIDE_INT size;
6779   if (GET_MODE_SIZE (mode).is_constant (&size))
6780     {
6781       HOST_WIDE_INT const_offset, second_offset;
6782
6783       /* A general SVE offset is A * VQ + B.  Remove the A component from
6784          coefficient 0 in order to get the constant B.  */
6785       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6786
6787       /* Split an out-of-range address displacement into a base and
6788          offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6789          range otherwise to increase opportunities for sharing the base
6790          address of different sizes.  Unaligned accesses use the signed
6791          9-bit range, TImode/TFmode use the intersection of signed
6792          scaled 7-bit and signed 9-bit offset.  */
6793       if (mode == TImode || mode == TFmode)
6794         second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6795       else if ((const_offset & (size - 1)) != 0)
6796         second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6797       else
6798         second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6799
6800       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6801         return false;
6802
6803       /* Split the offset into second_offset and the rest.  */
6804       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6805       *offset2 = gen_int_mode (second_offset, Pmode);
6806       return true;
6807     }
6808   else
6809     {
6810       /* Get the mode we should use as the basis of the range.  For structure
6811          modes this is the mode of one vector.  */
6812       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6813       machine_mode step_mode
6814         = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6815
6816       /* Get the "mul vl" multiplier we'd like to use.  */
6817       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6818       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6819       if (vec_flags & VEC_SVE_DATA)
6820         /* LDR supports a 9-bit range, but the move patterns for
6821            structure modes require all vectors to be in range of the
6822            same base.  The simplest way of accomodating that while still
6823            promoting reuse of anchor points between different modes is
6824            to use an 8-bit range unconditionally.  */
6825         vnum = ((vnum + 128) & 255) - 128;
6826       else
6827         /* Predicates are only handled singly, so we might as well use
6828            the full range.  */
6829         vnum = ((vnum + 256) & 511) - 256;
6830       if (vnum == 0)
6831         return false;
6832
6833       /* Convert the "mul vl" multiplier into a byte offset.  */
6834       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6835       if (known_eq (second_offset, orig_offset))
6836         return false;
6837
6838       /* Split the offset into second_offset and the rest.  */
6839       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6840       *offset2 = gen_int_mode (second_offset, Pmode);
6841       return true;
6842     }
6843 }
6844
6845 /* Return the binary representation of floating point constant VALUE in INTVAL.
6846    If the value cannot be converted, return false without setting INTVAL.
6847    The conversion is done in the given MODE.  */
6848 bool
6849 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6850 {
6851
6852   /* We make a general exception for 0.  */
6853   if (aarch64_float_const_zero_rtx_p (value))
6854     {
6855       *intval = 0;
6856       return true;
6857     }
6858
6859   scalar_float_mode mode;
6860   if (GET_CODE (value) != CONST_DOUBLE
6861       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6862       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6863       /* Only support up to DF mode.  */
6864       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6865     return false;
6866
6867   unsigned HOST_WIDE_INT ival = 0;
6868
6869   long res[2];
6870   real_to_target (res,
6871                   CONST_DOUBLE_REAL_VALUE (value),
6872                   REAL_MODE_FORMAT (mode));
6873
6874   if (mode == DFmode)
6875     {
6876       int order = BYTES_BIG_ENDIAN ? 1 : 0;
6877       ival = zext_hwi (res[order], 32);
6878       ival |= (zext_hwi (res[1 - order], 32) << 32);
6879     }
6880   else
6881       ival = zext_hwi (res[0], 32);
6882
6883   *intval = ival;
6884   return true;
6885 }
6886
6887 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6888    single MOV(+MOVK) followed by an FMOV.  */
6889 bool
6890 aarch64_float_const_rtx_p (rtx x)
6891 {
6892   machine_mode mode = GET_MODE (x);
6893   if (mode == VOIDmode)
6894     return false;
6895
6896   /* Determine whether it's cheaper to write float constants as
6897      mov/movk pairs over ldr/adrp pairs.  */
6898   unsigned HOST_WIDE_INT ival;
6899
6900   if (GET_CODE (x) == CONST_DOUBLE
6901       && SCALAR_FLOAT_MODE_P (mode)
6902       && aarch64_reinterpret_float_as_int (x, &ival))
6903     {
6904       scalar_int_mode imode = (mode == HFmode
6905                                ? SImode
6906                                : int_mode_for_mode (mode).require ());
6907       int num_instr = aarch64_internal_mov_immediate
6908                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6909       return num_instr < 3;
6910     }
6911
6912   return false;
6913 }
6914
6915 /* Return TRUE if rtx X is immediate constant 0.0 */
6916 bool
6917 aarch64_float_const_zero_rtx_p (rtx x)
6918 {
6919   if (GET_MODE (x) == VOIDmode)
6920     return false;
6921
6922   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6923     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6924   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6925 }
6926
6927 /* Return TRUE if rtx X is immediate constant that fits in a single
6928    MOVI immediate operation.  */
6929 bool
6930 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6931 {
6932   if (!TARGET_SIMD)
6933      return false;
6934
6935   machine_mode vmode;
6936   scalar_int_mode imode;
6937   unsigned HOST_WIDE_INT ival;
6938
6939   if (GET_CODE (x) == CONST_DOUBLE
6940       && SCALAR_FLOAT_MODE_P (mode))
6941     {
6942       if (!aarch64_reinterpret_float_as_int (x, &ival))
6943         return false;
6944
6945       /* We make a general exception for 0.  */
6946       if (aarch64_float_const_zero_rtx_p (x))
6947         return true;
6948
6949       imode = int_mode_for_mode (mode).require ();
6950     }
6951   else if (GET_CODE (x) == CONST_INT
6952            && is_a <scalar_int_mode> (mode, &imode))
6953     ival = INTVAL (x);
6954   else
6955     return false;
6956
6957    /* use a 64 bit mode for everything except for DI/DF mode, where we use
6958      a 128 bit vector mode.  */
6959   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6960
6961   vmode = aarch64_simd_container_mode (imode, width);
6962   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6963
6964   return aarch64_simd_valid_immediate (v_op, NULL);
6965 }
6966
6967
6968 /* Return the fixed registers used for condition codes.  */
6969
6970 static bool
6971 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6972 {
6973   *p1 = CC_REGNUM;
6974   *p2 = INVALID_REGNUM;
6975   return true;
6976 }
6977
6978 /* This function is used by the call expanders of the machine description.
6979    RESULT is the register in which the result is returned.  It's NULL for
6980    "call" and "sibcall".
6981    MEM is the location of the function call.
6982    SIBCALL indicates whether this function call is normal call or sibling call.
6983    It will generate different pattern accordingly.  */
6984
6985 void
6986 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6987 {
6988   rtx call, callee, tmp;
6989   rtvec vec;
6990   machine_mode mode;
6991
6992   gcc_assert (MEM_P (mem));
6993   callee = XEXP (mem, 0);
6994   mode = GET_MODE (callee);
6995   gcc_assert (mode == Pmode);
6996
6997   /* Decide if we should generate indirect calls by loading the
6998      address of the callee into a register before performing
6999      the branch-and-link.  */
7000   if (SYMBOL_REF_P (callee)
7001       ? (aarch64_is_long_call_p (callee)
7002          || aarch64_is_noplt_call_p (callee))
7003       : !REG_P (callee))
7004     XEXP (mem, 0) = force_reg (mode, callee);
7005
7006   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7007
7008   if (result != NULL_RTX)
7009     call = gen_rtx_SET (result, call);
7010
7011   if (sibcall)
7012     tmp = ret_rtx;
7013   else
7014     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7015
7016   vec = gen_rtvec (2, call, tmp);
7017   call = gen_rtx_PARALLEL (VOIDmode, vec);
7018
7019   aarch64_emit_call_insn (call);
7020 }
7021
7022 /* Emit call insn with PAT and do aarch64-specific handling.  */
7023
7024 void
7025 aarch64_emit_call_insn (rtx pat)
7026 {
7027   rtx insn = emit_call_insn (pat);
7028
7029   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7030   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7031   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7032 }
7033
7034 machine_mode
7035 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7036 {
7037   /* All floating point compares return CCFP if it is an equality
7038      comparison, and CCFPE otherwise.  */
7039   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
7040     {
7041       switch (code)
7042         {
7043         case EQ:
7044         case NE:
7045         case UNORDERED:
7046         case ORDERED:
7047         case UNLT:
7048         case UNLE:
7049         case UNGT:
7050         case UNGE:
7051         case UNEQ:
7052           return CCFPmode;
7053
7054         case LT:
7055         case LE:
7056         case GT:
7057         case GE:
7058         case LTGT:
7059           return CCFPEmode;
7060
7061         default:
7062           gcc_unreachable ();
7063         }
7064     }
7065
7066   /* Equality comparisons of short modes against zero can be performed
7067      using the TST instruction with the appropriate bitmask.  */
7068   if (y == const0_rtx && REG_P (x)
7069       && (code == EQ || code == NE)
7070       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
7071     return CC_NZmode;
7072
7073   /* Similarly, comparisons of zero_extends from shorter modes can
7074      be performed using an ANDS with an immediate mask.  */
7075   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
7076       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7077       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7078       && (code == EQ || code == NE))
7079     return CC_NZmode;
7080
7081   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7082       && y == const0_rtx
7083       && (code == EQ || code == NE || code == LT || code == GE)
7084       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
7085           || GET_CODE (x) == NEG
7086           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7087               && CONST_INT_P (XEXP (x, 2)))))
7088     return CC_NZmode;
7089
7090   /* A compare with a shifted operand.  Because of canonicalization,
7091      the comparison will have to be swapped when we emit the assembly
7092      code.  */
7093   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7094       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7095       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
7096           || GET_CODE (x) == LSHIFTRT
7097           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
7098     return CC_SWPmode;
7099
7100   /* Similarly for a negated operand, but we can only do this for
7101      equalities.  */
7102   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7103       && (REG_P (y) || GET_CODE (y) == SUBREG)
7104       && (code == EQ || code == NE)
7105       && GET_CODE (x) == NEG)
7106     return CC_Zmode;
7107
7108   /* A test for unsigned overflow.  */
7109   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7110       && code == NE
7111       && GET_CODE (x) == PLUS
7112       && GET_CODE (y) == ZERO_EXTEND)
7113     return CC_Cmode;
7114
7115   /* A test for signed overflow.  */
7116   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7117       && code == NE
7118       && GET_CODE (x) == PLUS
7119       && GET_CODE (y) == SIGN_EXTEND)
7120     return CC_Vmode;
7121
7122   /* For everything else, return CCmode.  */
7123   return CCmode;
7124 }
7125
7126 static int
7127 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7128
7129 int
7130 aarch64_get_condition_code (rtx x)
7131 {
7132   machine_mode mode = GET_MODE (XEXP (x, 0));
7133   enum rtx_code comp_code = GET_CODE (x);
7134
7135   if (GET_MODE_CLASS (mode) != MODE_CC)
7136     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7137   return aarch64_get_condition_code_1 (mode, comp_code);
7138 }
7139
7140 static int
7141 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7142 {
7143   switch (mode)
7144     {
7145     case E_CCFPmode:
7146     case E_CCFPEmode:
7147       switch (comp_code)
7148         {
7149         case GE: return AARCH64_GE;
7150         case GT: return AARCH64_GT;
7151         case LE: return AARCH64_LS;
7152         case LT: return AARCH64_MI;
7153         case NE: return AARCH64_NE;
7154         case EQ: return AARCH64_EQ;
7155         case ORDERED: return AARCH64_VC;
7156         case UNORDERED: return AARCH64_VS;
7157         case UNLT: return AARCH64_LT;
7158         case UNLE: return AARCH64_LE;
7159         case UNGT: return AARCH64_HI;
7160         case UNGE: return AARCH64_PL;
7161         default: return -1;
7162         }
7163       break;
7164
7165     case E_CCmode:
7166       switch (comp_code)
7167         {
7168         case NE: return AARCH64_NE;
7169         case EQ: return AARCH64_EQ;
7170         case GE: return AARCH64_GE;
7171         case GT: return AARCH64_GT;
7172         case LE: return AARCH64_LE;
7173         case LT: return AARCH64_LT;
7174         case GEU: return AARCH64_CS;
7175         case GTU: return AARCH64_HI;
7176         case LEU: return AARCH64_LS;
7177         case LTU: return AARCH64_CC;
7178         default: return -1;
7179         }
7180       break;
7181
7182     case E_CC_SWPmode:
7183       switch (comp_code)
7184         {
7185         case NE: return AARCH64_NE;
7186         case EQ: return AARCH64_EQ;
7187         case GE: return AARCH64_LE;
7188         case GT: return AARCH64_LT;
7189         case LE: return AARCH64_GE;
7190         case LT: return AARCH64_GT;
7191         case GEU: return AARCH64_LS;
7192         case GTU: return AARCH64_CC;
7193         case LEU: return AARCH64_CS;
7194         case LTU: return AARCH64_HI;
7195         default: return -1;
7196         }
7197       break;
7198
7199     case E_CC_NZmode:
7200       switch (comp_code)
7201         {
7202         case NE: return AARCH64_NE;
7203         case EQ: return AARCH64_EQ;
7204         case GE: return AARCH64_PL;
7205         case LT: return AARCH64_MI;
7206         default: return -1;
7207         }
7208       break;
7209
7210     case E_CC_Zmode:
7211       switch (comp_code)
7212         {
7213         case NE: return AARCH64_NE;
7214         case EQ: return AARCH64_EQ;
7215         default: return -1;
7216         }
7217       break;
7218
7219     case E_CC_Cmode:
7220       switch (comp_code)
7221         {
7222         case NE: return AARCH64_CS;
7223         case EQ: return AARCH64_CC;
7224         default: return -1;
7225         }
7226       break;
7227
7228     case E_CC_Vmode:
7229       switch (comp_code)
7230         {
7231         case NE: return AARCH64_VS;
7232         case EQ: return AARCH64_VC;
7233         default: return -1;
7234         }
7235       break;
7236
7237     default:
7238       return -1;
7239     }
7240
7241   return -1;
7242 }
7243
7244 bool
7245 aarch64_const_vec_all_same_in_range_p (rtx x,
7246                                        HOST_WIDE_INT minval,
7247                                        HOST_WIDE_INT maxval)
7248 {
7249   rtx elt;
7250   return (const_vec_duplicate_p (x, &elt)
7251           && CONST_INT_P (elt)
7252           && IN_RANGE (INTVAL (elt), minval, maxval));
7253 }
7254
7255 bool
7256 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7257 {
7258   return aarch64_const_vec_all_same_in_range_p (x, val, val);
7259 }
7260
7261 /* Return true if VEC is a constant in which every element is in the range
7262    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
7263
7264 static bool
7265 aarch64_const_vec_all_in_range_p (rtx vec,
7266                                   HOST_WIDE_INT minval,
7267                                   HOST_WIDE_INT maxval)
7268 {
7269   if (GET_CODE (vec) != CONST_VECTOR
7270       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7271     return false;
7272
7273   int nunits;
7274   if (!CONST_VECTOR_STEPPED_P (vec))
7275     nunits = const_vector_encoded_nelts (vec);
7276   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7277     return false;
7278
7279   for (int i = 0; i < nunits; i++)
7280     {
7281       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7282       if (!CONST_INT_P (vec_elem)
7283           || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7284         return false;
7285     }
7286   return true;
7287 }
7288
7289 /* N Z C V.  */
7290 #define AARCH64_CC_V 1
7291 #define AARCH64_CC_C (1 << 1)
7292 #define AARCH64_CC_Z (1 << 2)
7293 #define AARCH64_CC_N (1 << 3)
7294
7295 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
7296 static const int aarch64_nzcv_codes[] =
7297 {
7298   0,            /* EQ, Z == 1.  */
7299   AARCH64_CC_Z, /* NE, Z == 0.  */
7300   0,            /* CS, C == 1.  */
7301   AARCH64_CC_C, /* CC, C == 0.  */
7302   0,            /* MI, N == 1.  */
7303   AARCH64_CC_N, /* PL, N == 0.  */
7304   0,            /* VS, V == 1.  */
7305   AARCH64_CC_V, /* VC, V == 0.  */
7306   0,            /* HI, C ==1 && Z == 0.  */
7307   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
7308   AARCH64_CC_V, /* GE, N == V.  */
7309   0,            /* LT, N != V.  */
7310   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
7311   0,            /* LE, !(Z == 0 && N == V).  */
7312   0,            /* AL, Any.  */
7313   0             /* NV, Any.  */
7314 };
7315
7316 /* Print floating-point vector immediate operand X to F, negating it
7317    first if NEGATE is true.  Return true on success, false if it isn't
7318    a constant we can handle.  */
7319
7320 static bool
7321 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7322 {
7323   rtx elt;
7324
7325   if (!const_vec_duplicate_p (x, &elt))
7326     return false;
7327
7328   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7329   if (negate)
7330     r = real_value_negate (&r);
7331
7332   /* We only handle the SVE single-bit immediates here.  */
7333   if (real_equal (&r, &dconst0))
7334     asm_fprintf (f, "0.0");
7335   else if (real_equal (&r, &dconst1))
7336     asm_fprintf (f, "1.0");
7337   else if (real_equal (&r, &dconsthalf))
7338     asm_fprintf (f, "0.5");
7339   else
7340     return false;
7341
7342   return true;
7343 }
7344
7345 /* Return the equivalent letter for size.  */
7346 static char
7347 sizetochar (int size)
7348 {
7349   switch (size)
7350     {
7351     case 64: return 'd';
7352     case 32: return 's';
7353     case 16: return 'h';
7354     case 8 : return 'b';
7355     default: gcc_unreachable ();
7356     }
7357 }
7358
7359 /* Print operand X to file F in a target specific manner according to CODE.
7360    The acceptable formatting commands given by CODE are:
7361      'c':               An integer or symbol address without a preceding #
7362                         sign.
7363      'C':               Take the duplicated element in a vector constant
7364                         and print it in hex.
7365      'D':               Take the duplicated element in a vector constant
7366                         and print it as an unsigned integer, in decimal.
7367      'e':               Print the sign/zero-extend size as a character 8->b,
7368                         16->h, 32->w.
7369      'p':               Prints N such that 2^N == X (X must be power of 2 and
7370                         const int).
7371      'P':               Print the number of non-zero bits in X (a const_int).
7372      'H':               Print the higher numbered register of a pair (TImode)
7373                         of regs.
7374      'm':               Print a condition (eq, ne, etc).
7375      'M':               Same as 'm', but invert condition.
7376      'N':               Take the duplicated element in a vector constant
7377                         and print the negative of it in decimal.
7378      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
7379      'S/T/U/V':         Print a FP/SIMD register name for a register list.
7380                         The register printed is the FP/SIMD register name
7381                         of X + 0/1/2/3 for S/T/U/V.
7382      'R':               Print a scalar FP/SIMD register name + 1.
7383      'X':               Print bottom 16 bits of integer constant in hex.
7384      'w/x':             Print a general register name or the zero register
7385                         (32-bit or 64-bit).
7386      '0':               Print a normal operand, if it's a general register,
7387                         then we assume DImode.
7388      'k':               Print NZCV for conditional compare instructions.
7389      'A':               Output address constant representing the first
7390                         argument of X, specifying a relocation offset
7391                         if appropriate.
7392      'L':               Output constant address specified by X
7393                         with a relocation offset if appropriate.
7394      'G':               Prints address of X, specifying a PC relative
7395                         relocation mode if appropriate.
7396      'y':               Output address of LDP or STP - this is used for
7397                         some LDP/STPs which don't use a PARALLEL in their
7398                         pattern (so the mode needs to be adjusted).
7399      'z':               Output address of a typical LDP or STP.  */
7400
7401 static void
7402 aarch64_print_operand (FILE *f, rtx x, int code)
7403 {
7404   rtx elt;
7405   switch (code)
7406     {
7407     case 'c':
7408       switch (GET_CODE (x))
7409         {
7410         case CONST_INT:
7411           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7412           break;
7413
7414         case SYMBOL_REF:
7415           output_addr_const (f, x);
7416           break;
7417
7418         case CONST:
7419           if (GET_CODE (XEXP (x, 0)) == PLUS
7420               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7421             {
7422               output_addr_const (f, x);
7423               break;
7424             }
7425           /* Fall through.  */
7426
7427         default:
7428           output_operand_lossage ("unsupported operand for code '%c'", code);
7429         }
7430       break;
7431
7432     case 'e':
7433       {
7434         int n;
7435
7436         if (!CONST_INT_P (x)
7437             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7438           {
7439             output_operand_lossage ("invalid operand for '%%%c'", code);
7440             return;
7441           }
7442
7443         switch (n)
7444           {
7445           case 3:
7446             fputc ('b', f);
7447             break;
7448           case 4:
7449             fputc ('h', f);
7450             break;
7451           case 5:
7452             fputc ('w', f);
7453             break;
7454           default:
7455             output_operand_lossage ("invalid operand for '%%%c'", code);
7456             return;
7457           }
7458       }
7459       break;
7460
7461     case 'p':
7462       {
7463         int n;
7464
7465         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7466           {
7467             output_operand_lossage ("invalid operand for '%%%c'", code);
7468             return;
7469           }
7470
7471         asm_fprintf (f, "%d", n);
7472       }
7473       break;
7474
7475     case 'P':
7476       if (!CONST_INT_P (x))
7477         {
7478           output_operand_lossage ("invalid operand for '%%%c'", code);
7479           return;
7480         }
7481
7482       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7483       break;
7484
7485     case 'H':
7486       if (x == const0_rtx)
7487         {
7488           asm_fprintf (f, "xzr");
7489           break;
7490         }
7491
7492       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7493         {
7494           output_operand_lossage ("invalid operand for '%%%c'", code);
7495           return;
7496         }
7497
7498       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7499       break;
7500
7501     case 'M':
7502     case 'm':
7503       {
7504         int cond_code;
7505         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
7506         if (x == const_true_rtx)
7507           {
7508             if (code == 'M')
7509               fputs ("nv", f);
7510             return;
7511           }
7512
7513         if (!COMPARISON_P (x))
7514           {
7515             output_operand_lossage ("invalid operand for '%%%c'", code);
7516             return;
7517           }
7518
7519         cond_code = aarch64_get_condition_code (x);
7520         gcc_assert (cond_code >= 0);
7521         if (code == 'M')
7522           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7523         fputs (aarch64_condition_codes[cond_code], f);
7524       }
7525       break;
7526
7527     case 'N':
7528       if (!const_vec_duplicate_p (x, &elt))
7529         {
7530           output_operand_lossage ("invalid vector constant");
7531           return;
7532         }
7533
7534       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7535         asm_fprintf (f, "%wd", -INTVAL (elt));
7536       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7537                && aarch64_print_vector_float_operand (f, x, true))
7538         ;
7539       else
7540         {
7541           output_operand_lossage ("invalid vector constant");
7542           return;
7543         }
7544       break;
7545
7546     case 'b':
7547     case 'h':
7548     case 's':
7549     case 'd':
7550     case 'q':
7551       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7552         {
7553           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7554           return;
7555         }
7556       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7557       break;
7558
7559     case 'S':
7560     case 'T':
7561     case 'U':
7562     case 'V':
7563       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7564         {
7565           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7566           return;
7567         }
7568       asm_fprintf (f, "%c%d",
7569                    aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7570                    REGNO (x) - V0_REGNUM + (code - 'S'));
7571       break;
7572
7573     case 'R':
7574       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7575         {
7576           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7577           return;
7578         }
7579       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7580       break;
7581
7582     case 'X':
7583       if (!CONST_INT_P (x))
7584         {
7585           output_operand_lossage ("invalid operand for '%%%c'", code);
7586           return;
7587         }
7588       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7589       break;
7590
7591     case 'C':
7592       {
7593         /* Print a replicated constant in hex.  */
7594         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7595           {
7596             output_operand_lossage ("invalid operand for '%%%c'", code);
7597             return;
7598           }
7599         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7600         asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7601       }
7602       break;
7603
7604     case 'D':
7605       {
7606         /* Print a replicated constant in decimal, treating it as
7607            unsigned.  */
7608         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7609           {
7610             output_operand_lossage ("invalid operand for '%%%c'", code);
7611             return;
7612           }
7613         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7614         asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7615       }
7616       break;
7617
7618     case 'w':
7619     case 'x':
7620       if (x == const0_rtx
7621           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7622         {
7623           asm_fprintf (f, "%czr", code);
7624           break;
7625         }
7626
7627       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7628         {
7629           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7630           break;
7631         }
7632
7633       if (REG_P (x) && REGNO (x) == SP_REGNUM)
7634         {
7635           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7636           break;
7637         }
7638
7639       /* Fall through */
7640
7641     case 0:
7642       if (x == NULL)
7643         {
7644           output_operand_lossage ("missing operand");
7645           return;
7646         }
7647
7648       switch (GET_CODE (x))
7649         {
7650         case REG:
7651           if (aarch64_sve_data_mode_p (GET_MODE (x)))
7652             {
7653               if (REG_NREGS (x) == 1)
7654                 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7655               else
7656                 {
7657                   char suffix
7658                     = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7659                   asm_fprintf (f, "{z%d.%c - z%d.%c}",
7660                                REGNO (x) - V0_REGNUM, suffix,
7661                                END_REGNO (x) - V0_REGNUM - 1, suffix);
7662                 }
7663             }
7664           else
7665             asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7666           break;
7667
7668         case MEM:
7669           output_address (GET_MODE (x), XEXP (x, 0));
7670           break;
7671
7672         case LABEL_REF:
7673         case SYMBOL_REF:
7674           output_addr_const (asm_out_file, x);
7675           break;
7676
7677         case CONST_INT:
7678           asm_fprintf (f, "%wd", INTVAL (x));
7679           break;
7680
7681         case CONST:
7682           if (!VECTOR_MODE_P (GET_MODE (x)))
7683             {
7684               output_addr_const (asm_out_file, x);
7685               break;
7686             }
7687           /* fall through */
7688
7689         case CONST_VECTOR:
7690           if (!const_vec_duplicate_p (x, &elt))
7691             {
7692               output_operand_lossage ("invalid vector constant");
7693               return;
7694             }
7695
7696           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7697             asm_fprintf (f, "%wd", INTVAL (elt));
7698           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7699                    && aarch64_print_vector_float_operand (f, x, false))
7700             ;
7701           else
7702             {
7703               output_operand_lossage ("invalid vector constant");
7704               return;
7705             }
7706           break;
7707
7708         case CONST_DOUBLE:
7709           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7710              be getting CONST_DOUBLEs holding integers.  */
7711           gcc_assert (GET_MODE (x) != VOIDmode);
7712           if (aarch64_float_const_zero_rtx_p (x))
7713             {
7714               fputc ('0', f);
7715               break;
7716             }
7717           else if (aarch64_float_const_representable_p (x))
7718             {
7719 #define buf_size 20
7720               char float_buf[buf_size] = {'\0'};
7721               real_to_decimal_for_mode (float_buf,
7722                                         CONST_DOUBLE_REAL_VALUE (x),
7723                                         buf_size, buf_size,
7724                                         1, GET_MODE (x));
7725               asm_fprintf (asm_out_file, "%s", float_buf);
7726               break;
7727 #undef buf_size
7728             }
7729           output_operand_lossage ("invalid constant");
7730           return;
7731         default:
7732           output_operand_lossage ("invalid operand");
7733           return;
7734         }
7735       break;
7736
7737     case 'A':
7738       if (GET_CODE (x) == HIGH)
7739         x = XEXP (x, 0);
7740
7741       switch (aarch64_classify_symbolic_expression (x))
7742         {
7743         case SYMBOL_SMALL_GOT_4G:
7744           asm_fprintf (asm_out_file, ":got:");
7745           break;
7746
7747         case SYMBOL_SMALL_TLSGD:
7748           asm_fprintf (asm_out_file, ":tlsgd:");
7749           break;
7750
7751         case SYMBOL_SMALL_TLSDESC:
7752           asm_fprintf (asm_out_file, ":tlsdesc:");
7753           break;
7754
7755         case SYMBOL_SMALL_TLSIE:
7756           asm_fprintf (asm_out_file, ":gottprel:");
7757           break;
7758
7759         case SYMBOL_TLSLE24:
7760           asm_fprintf (asm_out_file, ":tprel:");
7761           break;
7762
7763         case SYMBOL_TINY_GOT:
7764           gcc_unreachable ();
7765           break;
7766
7767         default:
7768           break;
7769         }
7770       output_addr_const (asm_out_file, x);
7771       break;
7772
7773     case 'L':
7774       switch (aarch64_classify_symbolic_expression (x))
7775         {
7776         case SYMBOL_SMALL_GOT_4G:
7777           asm_fprintf (asm_out_file, ":lo12:");
7778           break;
7779
7780         case SYMBOL_SMALL_TLSGD:
7781           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7782           break;
7783
7784         case SYMBOL_SMALL_TLSDESC:
7785           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7786           break;
7787
7788         case SYMBOL_SMALL_TLSIE:
7789           asm_fprintf (asm_out_file, ":gottprel_lo12:");
7790           break;
7791
7792         case SYMBOL_TLSLE12:
7793           asm_fprintf (asm_out_file, ":tprel_lo12:");
7794           break;
7795
7796         case SYMBOL_TLSLE24:
7797           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7798           break;
7799
7800         case SYMBOL_TINY_GOT:
7801           asm_fprintf (asm_out_file, ":got:");
7802           break;
7803
7804         case SYMBOL_TINY_TLSIE:
7805           asm_fprintf (asm_out_file, ":gottprel:");
7806           break;
7807
7808         default:
7809           break;
7810         }
7811       output_addr_const (asm_out_file, x);
7812       break;
7813
7814     case 'G':
7815       switch (aarch64_classify_symbolic_expression (x))
7816         {
7817         case SYMBOL_TLSLE24:
7818           asm_fprintf (asm_out_file, ":tprel_hi12:");
7819           break;
7820         default:
7821           break;
7822         }
7823       output_addr_const (asm_out_file, x);
7824       break;
7825
7826     case 'k':
7827       {
7828         HOST_WIDE_INT cond_code;
7829
7830         if (!CONST_INT_P (x))
7831           {
7832             output_operand_lossage ("invalid operand for '%%%c'", code);
7833             return;
7834           }
7835
7836         cond_code = INTVAL (x);
7837         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7838         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7839       }
7840       break;
7841
7842     case 'y':
7843     case 'z':
7844       {
7845         machine_mode mode = GET_MODE (x);
7846
7847         if (GET_CODE (x) != MEM
7848             || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7849           {
7850             output_operand_lossage ("invalid operand for '%%%c'", code);
7851             return;
7852           }
7853
7854         if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7855                                             code == 'y'
7856                                             ? ADDR_QUERY_LDP_STP_N
7857                                             : ADDR_QUERY_LDP_STP))
7858           output_operand_lossage ("invalid operand prefix '%%%c'", code);
7859       }
7860       break;
7861
7862     default:
7863       output_operand_lossage ("invalid operand prefix '%%%c'", code);
7864       return;
7865     }
7866 }
7867
7868 /* Print address 'x' of a memory access with mode 'mode'.
7869    'op' is the context required by aarch64_classify_address.  It can either be
7870    MEM for a normal memory access or PARALLEL for LDP/STP.  */
7871 static bool
7872 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7873                                 aarch64_addr_query_type type)
7874 {
7875   struct aarch64_address_info addr;
7876   unsigned int size;
7877
7878   /* Check all addresses are Pmode - including ILP32.  */
7879   if (GET_MODE (x) != Pmode
7880       && (!CONST_INT_P (x)
7881           || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7882     {
7883       output_operand_lossage ("invalid address mode");
7884       return false;
7885     }
7886
7887   if (aarch64_classify_address (&addr, x, mode, true, type))
7888     switch (addr.type)
7889       {
7890       case ADDRESS_REG_IMM:
7891         if (known_eq (addr.const_offset, 0))
7892           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7893         else if (aarch64_sve_data_mode_p (mode))
7894           {
7895             HOST_WIDE_INT vnum
7896               = exact_div (addr.const_offset,
7897                            BYTES_PER_SVE_VECTOR).to_constant ();
7898             asm_fprintf (f, "[%s, #%wd, mul vl]",
7899                          reg_names[REGNO (addr.base)], vnum);
7900           }
7901         else if (aarch64_sve_pred_mode_p (mode))
7902           {
7903             HOST_WIDE_INT vnum
7904               = exact_div (addr.const_offset,
7905                            BYTES_PER_SVE_PRED).to_constant ();
7906             asm_fprintf (f, "[%s, #%wd, mul vl]",
7907                          reg_names[REGNO (addr.base)], vnum);
7908           }
7909         else
7910           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7911                        INTVAL (addr.offset));
7912         return true;
7913
7914       case ADDRESS_REG_REG:
7915         if (addr.shift == 0)
7916           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7917                        reg_names [REGNO (addr.offset)]);
7918         else
7919           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7920                        reg_names [REGNO (addr.offset)], addr.shift);
7921         return true;
7922
7923       case ADDRESS_REG_UXTW:
7924         if (addr.shift == 0)
7925           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7926                        REGNO (addr.offset) - R0_REGNUM);
7927         else
7928           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7929                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7930         return true;
7931
7932       case ADDRESS_REG_SXTW:
7933         if (addr.shift == 0)
7934           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7935                        REGNO (addr.offset) - R0_REGNUM);
7936         else
7937           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7938                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7939         return true;
7940
7941       case ADDRESS_REG_WB:
7942         /* Writeback is only supported for fixed-width modes.  */
7943         size = GET_MODE_SIZE (mode).to_constant ();
7944         switch (GET_CODE (x))
7945           {
7946           case PRE_INC:
7947             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7948             return true;
7949           case POST_INC:
7950             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7951             return true;
7952           case PRE_DEC:
7953             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7954             return true;
7955           case POST_DEC:
7956             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7957             return true;
7958           case PRE_MODIFY:
7959             asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7960                          INTVAL (addr.offset));
7961             return true;
7962           case POST_MODIFY:
7963             asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7964                          INTVAL (addr.offset));
7965             return true;
7966           default:
7967             break;
7968           }
7969         break;
7970
7971       case ADDRESS_LO_SUM:
7972         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7973         output_addr_const (f, addr.offset);
7974         asm_fprintf (f, "]");
7975         return true;
7976
7977       case ADDRESS_SYMBOLIC:
7978         output_addr_const (f, x);
7979         return true;
7980       }
7981
7982   return false;
7983 }
7984
7985 /* Print address 'x' of a memory access with mode 'mode'.  */
7986 static void
7987 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7988 {
7989   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7990     output_addr_const (f, x);
7991 }
7992
7993 bool
7994 aarch64_label_mentioned_p (rtx x)
7995 {
7996   const char *fmt;
7997   int i;
7998
7999   if (GET_CODE (x) == LABEL_REF)
8000     return true;
8001
8002   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8003      referencing instruction, but they are constant offsets, not
8004      symbols.  */
8005   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8006     return false;
8007
8008   fmt = GET_RTX_FORMAT (GET_CODE (x));
8009   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8010     {
8011       if (fmt[i] == 'E')
8012         {
8013           int j;
8014
8015           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8016             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8017               return 1;
8018         }
8019       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8020         return 1;
8021     }
8022
8023   return 0;
8024 }
8025
8026 /* Implement REGNO_REG_CLASS.  */
8027
8028 enum reg_class
8029 aarch64_regno_regclass (unsigned regno)
8030 {
8031   if (GP_REGNUM_P (regno))
8032     return GENERAL_REGS;
8033
8034   if (regno == SP_REGNUM)
8035     return STACK_REG;
8036
8037   if (regno == FRAME_POINTER_REGNUM
8038       || regno == ARG_POINTER_REGNUM)
8039     return POINTER_REGS;
8040
8041   if (FP_REGNUM_P (regno))
8042     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
8043
8044   if (PR_REGNUM_P (regno))
8045     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8046
8047   return NO_REGS;
8048 }
8049
8050 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8051    If OFFSET is out of range, return an offset of an anchor point
8052    that is in range.  Return 0 otherwise.  */
8053
8054 static HOST_WIDE_INT
8055 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8056                        machine_mode mode)
8057 {
8058   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
8059   if (size > 16)
8060     return (offset + 0x400) & ~0x7f0;
8061
8062   /* For offsets that aren't a multiple of the access size, the limit is
8063      -256...255.  */
8064   if (offset & (size - 1))
8065     {
8066       /* BLKmode typically uses LDP of X-registers.  */
8067       if (mode == BLKmode)
8068         return (offset + 512) & ~0x3ff;
8069       return (offset + 0x100) & ~0x1ff;
8070     }
8071
8072   /* Small negative offsets are supported.  */
8073   if (IN_RANGE (offset, -256, 0))
8074     return 0;
8075
8076   if (mode == TImode || mode == TFmode)
8077     return (offset + 0x100) & ~0x1ff;
8078
8079   /* Use 12-bit offset by access size.  */
8080   return offset & (~0xfff * size);
8081 }
8082
8083 static rtx
8084 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
8085 {
8086   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8087      where mask is selected by alignment and size of the offset.
8088      We try to pick as large a range for the offset as possible to
8089      maximize the chance of a CSE.  However, for aligned addresses
8090      we limit the range to 4k so that structures with different sized
8091      elements are likely to use the same base.  We need to be careful
8092      not to split a CONST for some forms of address expression, otherwise
8093      it will generate sub-optimal code.  */
8094
8095   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8096     {
8097       rtx base = XEXP (x, 0);
8098       rtx offset_rtx = XEXP (x, 1);
8099       HOST_WIDE_INT offset = INTVAL (offset_rtx);
8100
8101       if (GET_CODE (base) == PLUS)
8102         {
8103           rtx op0 = XEXP (base, 0);
8104           rtx op1 = XEXP (base, 1);
8105
8106           /* Force any scaling into a temp for CSE.  */
8107           op0 = force_reg (Pmode, op0);
8108           op1 = force_reg (Pmode, op1);
8109
8110           /* Let the pointer register be in op0.  */
8111           if (REG_POINTER (op1))
8112             std::swap (op0, op1);
8113
8114           /* If the pointer is virtual or frame related, then we know that
8115              virtual register instantiation or register elimination is going
8116              to apply a second constant.  We want the two constants folded
8117              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
8118           if (virt_or_elim_regno_p (REGNO (op0)))
8119             {
8120               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8121                                    NULL_RTX, true, OPTAB_DIRECT);
8122               return gen_rtx_PLUS (Pmode, base, op1);
8123             }
8124
8125           /* Otherwise, in order to encourage CSE (and thence loop strength
8126              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
8127           base = expand_binop (Pmode, add_optab, op0, op1,
8128                                NULL_RTX, true, OPTAB_DIRECT);
8129           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8130         }
8131
8132       HOST_WIDE_INT size;
8133       if (GET_MODE_SIZE (mode).is_constant (&size))
8134         {
8135           HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8136                                                              mode);
8137           if (base_offset != 0)
8138             {
8139               base = plus_constant (Pmode, base, base_offset);
8140               base = force_operand (base, NULL_RTX);
8141               return plus_constant (Pmode, base, offset - base_offset);
8142             }
8143         }
8144     }
8145
8146   return x;
8147 }
8148
8149 static reg_class_t
8150 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8151                           reg_class_t rclass,
8152                           machine_mode mode,
8153                           secondary_reload_info *sri)
8154 {
8155   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8156      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
8157      comment at the head of aarch64-sve.md for more details about the
8158      big-endian handling.  */
8159   if (BYTES_BIG_ENDIAN
8160       && reg_class_subset_p (rclass, FP_REGS)
8161       && !((REG_P (x) && HARD_REGISTER_P (x))
8162            || aarch64_simd_valid_immediate (x, NULL))
8163       && aarch64_sve_data_mode_p (mode))
8164     {
8165       sri->icode = CODE_FOR_aarch64_sve_reload_be;
8166       return NO_REGS;
8167     }
8168
8169   /* If we have to disable direct literal pool loads and stores because the
8170      function is too big, then we need a scratch register.  */
8171   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8172       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8173           || targetm.vector_mode_supported_p (GET_MODE (x)))
8174       && !aarch64_pcrelative_literal_loads)
8175     {
8176       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8177       return NO_REGS;
8178     }
8179
8180   /* Without the TARGET_SIMD instructions we cannot move a Q register
8181      to a Q register directly.  We need a scratch.  */
8182   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8183       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8184       && reg_class_subset_p (rclass, FP_REGS))
8185     {
8186       sri->icode = code_for_aarch64_reload_mov (mode);
8187       return NO_REGS;
8188     }
8189
8190   /* A TFmode or TImode memory access should be handled via an FP_REGS
8191      because AArch64 has richer addressing modes for LDR/STR instructions
8192      than LDP/STP instructions.  */
8193   if (TARGET_FLOAT && rclass == GENERAL_REGS
8194       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8195     return FP_REGS;
8196
8197   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8198       return GENERAL_REGS;
8199
8200   return NO_REGS;
8201 }
8202
8203 static bool
8204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8205 {
8206   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8207
8208   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8209      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
8210   if (frame_pointer_needed)
8211     return to == HARD_FRAME_POINTER_REGNUM;
8212   return true;
8213 }
8214
8215 poly_int64
8216 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8217 {
8218   if (to == HARD_FRAME_POINTER_REGNUM)
8219     {
8220       if (from == ARG_POINTER_REGNUM)
8221         return cfun->machine->frame.hard_fp_offset;
8222
8223       if (from == FRAME_POINTER_REGNUM)
8224         return cfun->machine->frame.hard_fp_offset
8225                - cfun->machine->frame.locals_offset;
8226     }
8227
8228   if (to == STACK_POINTER_REGNUM)
8229     {
8230       if (from == FRAME_POINTER_REGNUM)
8231           return cfun->machine->frame.frame_size
8232                  - cfun->machine->frame.locals_offset;
8233     }
8234
8235   return cfun->machine->frame.frame_size;
8236 }
8237
8238 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
8239    previous frame.  */
8240
8241 rtx
8242 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8243 {
8244   if (count != 0)
8245     return const0_rtx;
8246   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8247 }
8248
8249
8250 static void
8251 aarch64_asm_trampoline_template (FILE *f)
8252 {
8253   int offset1 = 16;
8254   int offset2 = 20;
8255
8256   if (aarch64_bti_enabled ())
8257     {
8258       asm_fprintf (f, "\thint\t34 // bti c\n");
8259       offset1 -= 4;
8260       offset2 -= 4;
8261     }
8262
8263   if (TARGET_ILP32)
8264     {
8265       asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8266       asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8267                    offset1);
8268     }
8269   else
8270     {
8271       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8272       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8273                    offset2);
8274     }
8275   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8276
8277   /* The trampoline needs an extra padding instruction.  In case if BTI is
8278      enabled the padding instruction is replaced by the BTI instruction at
8279      the beginning.  */
8280   if (!aarch64_bti_enabled ())
8281     assemble_aligned_integer (4, const0_rtx);
8282
8283   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8284   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8285 }
8286
8287 static void
8288 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8289 {
8290   rtx fnaddr, mem, a_tramp;
8291   const int tramp_code_sz = 16;
8292
8293   /* Don't need to copy the trailing D-words, we fill those in below.  */
8294   emit_block_move (m_tramp, assemble_trampoline_template (),
8295                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8296   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8297   fnaddr = XEXP (DECL_RTL (fndecl), 0);
8298   if (GET_MODE (fnaddr) != ptr_mode)
8299     fnaddr = convert_memory_address (ptr_mode, fnaddr);
8300   emit_move_insn (mem, fnaddr);
8301
8302   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8303   emit_move_insn (mem, chain_value);
8304
8305   /* XXX We should really define a "clear_cache" pattern and use
8306      gen_clear_cache().  */
8307   a_tramp = XEXP (m_tramp, 0);
8308   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8309                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8310                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8311                      ptr_mode);
8312 }
8313
8314 static unsigned char
8315 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8316 {
8317   /* ??? Logically we should only need to provide a value when
8318      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8319      can hold MODE, but at the moment we need to handle all modes.
8320      Just ignore any runtime parts for registers that can't store them.  */
8321   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8322   unsigned int nregs;
8323   switch (regclass)
8324     {
8325     case TAILCALL_ADDR_REGS:
8326     case POINTER_REGS:
8327     case GENERAL_REGS:
8328     case ALL_REGS:
8329     case POINTER_AND_FP_REGS:
8330     case FP_REGS:
8331     case FP_LO_REGS:
8332       if (aarch64_sve_data_mode_p (mode)
8333           && constant_multiple_p (GET_MODE_SIZE (mode),
8334                                   BYTES_PER_SVE_VECTOR, &nregs))
8335         return nregs;
8336       return (aarch64_vector_data_mode_p (mode)
8337               ? CEIL (lowest_size, UNITS_PER_VREG)
8338               : CEIL (lowest_size, UNITS_PER_WORD));
8339     case STACK_REG:
8340     case PR_REGS:
8341     case PR_LO_REGS:
8342     case PR_HI_REGS:
8343       return 1;
8344
8345     case NO_REGS:
8346       return 0;
8347
8348     default:
8349       break;
8350     }
8351   gcc_unreachable ();
8352 }
8353
8354 static reg_class_t
8355 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8356 {
8357   if (regclass == POINTER_REGS)
8358     return GENERAL_REGS;
8359
8360   if (regclass == STACK_REG)
8361     {
8362       if (REG_P(x)
8363           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8364           return regclass;
8365
8366       return NO_REGS;
8367     }
8368
8369   /* Register eliminiation can result in a request for
8370      SP+constant->FP_REGS.  We cannot support such operations which
8371      use SP as source and an FP_REG as destination, so reject out
8372      right now.  */
8373   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8374     {
8375       rtx lhs = XEXP (x, 0);
8376
8377       /* Look through a possible SUBREG introduced by ILP32.  */
8378       if (GET_CODE (lhs) == SUBREG)
8379         lhs = SUBREG_REG (lhs);
8380
8381       gcc_assert (REG_P (lhs));
8382       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8383                                       POINTER_REGS));
8384       return NO_REGS;
8385     }
8386
8387   return regclass;
8388 }
8389
8390 void
8391 aarch64_asm_output_labelref (FILE* f, const char *name)
8392 {
8393   asm_fprintf (f, "%U%s", name);
8394 }
8395
8396 static void
8397 aarch64_elf_asm_constructor (rtx symbol, int priority)
8398 {
8399   if (priority == DEFAULT_INIT_PRIORITY)
8400     default_ctor_section_asm_out_constructor (symbol, priority);
8401   else
8402     {
8403       section *s;
8404       /* While priority is known to be in range [0, 65535], so 18 bytes
8405          would be enough, the compiler might not know that.  To avoid
8406          -Wformat-truncation false positive, use a larger size.  */
8407       char buf[23];
8408       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8409       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8410       switch_to_section (s);
8411       assemble_align (POINTER_SIZE);
8412       assemble_aligned_integer (POINTER_BYTES, symbol);
8413     }
8414 }
8415
8416 static void
8417 aarch64_elf_asm_destructor (rtx symbol, int priority)
8418 {
8419   if (priority == DEFAULT_INIT_PRIORITY)
8420     default_dtor_section_asm_out_destructor (symbol, priority);
8421   else
8422     {
8423       section *s;
8424       /* While priority is known to be in range [0, 65535], so 18 bytes
8425          would be enough, the compiler might not know that.  To avoid
8426          -Wformat-truncation false positive, use a larger size.  */
8427       char buf[23];
8428       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8429       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8430       switch_to_section (s);
8431       assemble_align (POINTER_SIZE);
8432       assemble_aligned_integer (POINTER_BYTES, symbol);
8433     }
8434 }
8435
8436 const char*
8437 aarch64_output_casesi (rtx *operands)
8438 {
8439   char buf[100];
8440   char label[100];
8441   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8442   int index;
8443   static const char *const patterns[4][2] =
8444   {
8445     {
8446       "ldrb\t%w3, [%0,%w1,uxtw]",
8447       "add\t%3, %4, %w3, sxtb #2"
8448     },
8449     {
8450       "ldrh\t%w3, [%0,%w1,uxtw #1]",
8451       "add\t%3, %4, %w3, sxth #2"
8452     },
8453     {
8454       "ldr\t%w3, [%0,%w1,uxtw #2]",
8455       "add\t%3, %4, %w3, sxtw #2"
8456     },
8457     /* We assume that DImode is only generated when not optimizing and
8458        that we don't really need 64-bit address offsets.  That would
8459        imply an object file with 8GB of code in a single function!  */
8460     {
8461       "ldr\t%w3, [%0,%w1,uxtw #2]",
8462       "add\t%3, %4, %w3, sxtw #2"
8463     }
8464   };
8465
8466   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8467
8468   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8469   index = exact_log2 (GET_MODE_SIZE (mode));
8470
8471   gcc_assert (index >= 0 && index <= 3);
8472
8473   /* Need to implement table size reduction, by chaning the code below.  */
8474   output_asm_insn (patterns[index][0], operands);
8475   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8476   snprintf (buf, sizeof (buf),
8477             "adr\t%%4, %s", targetm.strip_name_encoding (label));
8478   output_asm_insn (buf, operands);
8479   output_asm_insn (patterns[index][1], operands);
8480   output_asm_insn ("br\t%3", operands);
8481   assemble_label (asm_out_file, label);
8482   return "";
8483 }
8484
8485
8486 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8487    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8488    operator.  */
8489
8490 int
8491 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8492 {
8493   if (shift >= 0 && shift <= 3)
8494     {
8495       int size;
8496       for (size = 8; size <= 32; size *= 2)
8497         {
8498           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8499           if (mask == bits << shift)
8500             return size;
8501         }
8502     }
8503   return 0;
8504 }
8505
8506 /* Constant pools are per function only when PC relative
8507    literal loads are true or we are in the large memory
8508    model.  */
8509
8510 static inline bool
8511 aarch64_can_use_per_function_literal_pools_p (void)
8512 {
8513   return (aarch64_pcrelative_literal_loads
8514           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8515 }
8516
8517 static bool
8518 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8519 {
8520   /* We can't use blocks for constants when we're using a per-function
8521      constant pool.  */
8522   return !aarch64_can_use_per_function_literal_pools_p ();
8523 }
8524
8525 /* Select appropriate section for constants depending
8526    on where we place literal pools.  */
8527
8528 static section *
8529 aarch64_select_rtx_section (machine_mode mode,
8530                             rtx x,
8531                             unsigned HOST_WIDE_INT align)
8532 {
8533   if (aarch64_can_use_per_function_literal_pools_p ())
8534     return function_section (current_function_decl);
8535
8536   return default_elf_select_rtx_section (mode, x, align);
8537 }
8538
8539 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
8540 void
8541 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8542                                   HOST_WIDE_INT offset)
8543 {
8544   /* When using per-function literal pools, we must ensure that any code
8545      section is aligned to the minimal instruction length, lest we get
8546      errors from the assembler re "unaligned instructions".  */
8547   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8548     ASM_OUTPUT_ALIGN (f, 2);
8549 }
8550
8551 /* Costs.  */
8552
8553 /* Helper function for rtx cost calculation.  Strip a shift expression
8554    from X.  Returns the inner operand if successful, or the original
8555    expression on failure.  */
8556 static rtx
8557 aarch64_strip_shift (rtx x)
8558 {
8559   rtx op = x;
8560
8561   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8562      we can convert both to ROR during final output.  */
8563   if ((GET_CODE (op) == ASHIFT
8564        || GET_CODE (op) == ASHIFTRT
8565        || GET_CODE (op) == LSHIFTRT
8566        || GET_CODE (op) == ROTATERT
8567        || GET_CODE (op) == ROTATE)
8568       && CONST_INT_P (XEXP (op, 1)))
8569     return XEXP (op, 0);
8570
8571   if (GET_CODE (op) == MULT
8572       && CONST_INT_P (XEXP (op, 1))
8573       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8574     return XEXP (op, 0);
8575
8576   return x;
8577 }
8578
8579 /* Helper function for rtx cost calculation.  Strip an extend
8580    expression from X.  Returns the inner operand if successful, or the
8581    original expression on failure.  We deal with a number of possible
8582    canonicalization variations here. If STRIP_SHIFT is true, then
8583    we can strip off a shift also.  */
8584 static rtx
8585 aarch64_strip_extend (rtx x, bool strip_shift)
8586 {
8587   scalar_int_mode mode;
8588   rtx op = x;
8589
8590   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8591     return op;
8592
8593   /* Zero and sign extraction of a widened value.  */
8594   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8595       && XEXP (op, 2) == const0_rtx
8596       && GET_CODE (XEXP (op, 0)) == MULT
8597       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8598                                          XEXP (op, 1)))
8599     return XEXP (XEXP (op, 0), 0);
8600
8601   /* It can also be represented (for zero-extend) as an AND with an
8602      immediate.  */
8603   if (GET_CODE (op) == AND
8604       && GET_CODE (XEXP (op, 0)) == MULT
8605       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8606       && CONST_INT_P (XEXP (op, 1))
8607       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8608                            INTVAL (XEXP (op, 1))) != 0)
8609     return XEXP (XEXP (op, 0), 0);
8610
8611   /* Now handle extended register, as this may also have an optional
8612      left shift by 1..4.  */
8613   if (strip_shift
8614       && GET_CODE (op) == ASHIFT
8615       && CONST_INT_P (XEXP (op, 1))
8616       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8617     op = XEXP (op, 0);
8618
8619   if (GET_CODE (op) == ZERO_EXTEND
8620       || GET_CODE (op) == SIGN_EXTEND)
8621     op = XEXP (op, 0);
8622
8623   if (op != x)
8624     return op;
8625
8626   return x;
8627 }
8628
8629 /* Return true iff CODE is a shift supported in combination
8630    with arithmetic instructions.  */
8631
8632 static bool
8633 aarch64_shift_p (enum rtx_code code)
8634 {
8635   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8636 }
8637
8638
8639 /* Return true iff X is a cheap shift without a sign extend. */
8640
8641 static bool
8642 aarch64_cheap_mult_shift_p (rtx x)
8643 {
8644   rtx op0, op1;
8645
8646   op0 = XEXP (x, 0);
8647   op1 = XEXP (x, 1);
8648
8649   if (!(aarch64_tune_params.extra_tuning_flags
8650                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8651     return false;
8652
8653   if (GET_CODE (op0) == SIGN_EXTEND)
8654     return false;
8655
8656   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8657       && UINTVAL (op1) <= 4)
8658     return true;
8659
8660   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8661     return false;
8662
8663   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8664
8665   if (l2 > 0 && l2 <= 4)
8666     return true;
8667
8668   return false;
8669 }
8670
8671 /* Helper function for rtx cost calculation.  Calculate the cost of
8672    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8673    Return the calculated cost of the expression, recursing manually in to
8674    operands where needed.  */
8675
8676 static int
8677 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8678 {
8679   rtx op0, op1;
8680   const struct cpu_cost_table *extra_cost
8681     = aarch64_tune_params.insn_extra_cost;
8682   int cost = 0;
8683   bool compound_p = (outer == PLUS || outer == MINUS);
8684   machine_mode mode = GET_MODE (x);
8685
8686   gcc_checking_assert (code == MULT);
8687
8688   op0 = XEXP (x, 0);
8689   op1 = XEXP (x, 1);
8690
8691   if (VECTOR_MODE_P (mode))
8692     mode = GET_MODE_INNER (mode);
8693
8694   /* Integer multiply/fma.  */
8695   if (GET_MODE_CLASS (mode) == MODE_INT)
8696     {
8697       /* The multiply will be canonicalized as a shift, cost it as such.  */
8698       if (aarch64_shift_p (GET_CODE (x))
8699           || (CONST_INT_P (op1)
8700               && exact_log2 (INTVAL (op1)) > 0))
8701         {
8702           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8703                            || GET_CODE (op0) == SIGN_EXTEND;
8704           if (speed)
8705             {
8706               if (compound_p)
8707                 {
8708                   /* If the shift is considered cheap,
8709                      then don't add any cost. */
8710                   if (aarch64_cheap_mult_shift_p (x))
8711                     ;
8712                   else if (REG_P (op1))
8713                     /* ARITH + shift-by-register.  */
8714                     cost += extra_cost->alu.arith_shift_reg;
8715                   else if (is_extend)
8716                     /* ARITH + extended register.  We don't have a cost field
8717                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
8718                     cost += extra_cost->alu.extend_arith;
8719                   else
8720                     /* ARITH + shift-by-immediate.  */
8721                     cost += extra_cost->alu.arith_shift;
8722                 }
8723               else
8724                 /* LSL (immediate).  */
8725                 cost += extra_cost->alu.shift;
8726
8727             }
8728           /* Strip extends as we will have costed them in the case above.  */
8729           if (is_extend)
8730             op0 = aarch64_strip_extend (op0, true);
8731
8732           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8733
8734           return cost;
8735         }
8736
8737       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
8738          compound and let the below cases handle it.  After all, MNEG is a
8739          special-case alias of MSUB.  */
8740       if (GET_CODE (op0) == NEG)
8741         {
8742           op0 = XEXP (op0, 0);
8743           compound_p = true;
8744         }
8745
8746       /* Integer multiplies or FMAs have zero/sign extending variants.  */
8747       if ((GET_CODE (op0) == ZERO_EXTEND
8748            && GET_CODE (op1) == ZERO_EXTEND)
8749           || (GET_CODE (op0) == SIGN_EXTEND
8750               && GET_CODE (op1) == SIGN_EXTEND))
8751         {
8752           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8753           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8754
8755           if (speed)
8756             {
8757               if (compound_p)
8758                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
8759                 cost += extra_cost->mult[0].extend_add;
8760               else
8761                 /* MUL/SMULL/UMULL.  */
8762                 cost += extra_cost->mult[0].extend;
8763             }
8764
8765           return cost;
8766         }
8767
8768       /* This is either an integer multiply or a MADD.  In both cases
8769          we want to recurse and cost the operands.  */
8770       cost += rtx_cost (op0, mode, MULT, 0, speed);
8771       cost += rtx_cost (op1, mode, MULT, 1, speed);
8772
8773       if (speed)
8774         {
8775           if (compound_p)
8776             /* MADD/MSUB.  */
8777             cost += extra_cost->mult[mode == DImode].add;
8778           else
8779             /* MUL.  */
8780             cost += extra_cost->mult[mode == DImode].simple;
8781         }
8782
8783       return cost;
8784     }
8785   else
8786     {
8787       if (speed)
8788         {
8789           /* Floating-point FMA/FMUL can also support negations of the
8790              operands, unless the rounding mode is upward or downward in
8791              which case FNMUL is different than FMUL with operand negation.  */
8792           bool neg0 = GET_CODE (op0) == NEG;
8793           bool neg1 = GET_CODE (op1) == NEG;
8794           if (compound_p || !flag_rounding_math || (neg0 && neg1))
8795             {
8796               if (neg0)
8797                 op0 = XEXP (op0, 0);
8798               if (neg1)
8799                 op1 = XEXP (op1, 0);
8800             }
8801
8802           if (compound_p)
8803             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8804             cost += extra_cost->fp[mode == DFmode].fma;
8805           else
8806             /* FMUL/FNMUL.  */
8807             cost += extra_cost->fp[mode == DFmode].mult;
8808         }
8809
8810       cost += rtx_cost (op0, mode, MULT, 0, speed);
8811       cost += rtx_cost (op1, mode, MULT, 1, speed);
8812       return cost;
8813     }
8814 }
8815
8816 static int
8817 aarch64_address_cost (rtx x,
8818                       machine_mode mode,
8819                       addr_space_t as ATTRIBUTE_UNUSED,
8820                       bool speed)
8821 {
8822   enum rtx_code c = GET_CODE (x);
8823   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8824   struct aarch64_address_info info;
8825   int cost = 0;
8826   info.shift = 0;
8827
8828   if (!aarch64_classify_address (&info, x, mode, false))
8829     {
8830       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8831         {
8832           /* This is a CONST or SYMBOL ref which will be split
8833              in a different way depending on the code model in use.
8834              Cost it through the generic infrastructure.  */
8835           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8836           /* Divide through by the cost of one instruction to
8837              bring it to the same units as the address costs.  */
8838           cost_symbol_ref /= COSTS_N_INSNS (1);
8839           /* The cost is then the cost of preparing the address,
8840              followed by an immediate (possibly 0) offset.  */
8841           return cost_symbol_ref + addr_cost->imm_offset;
8842         }
8843       else
8844         {
8845           /* This is most likely a jump table from a case
8846              statement.  */
8847           return addr_cost->register_offset;
8848         }
8849     }
8850
8851   switch (info.type)
8852     {
8853       case ADDRESS_LO_SUM:
8854       case ADDRESS_SYMBOLIC:
8855       case ADDRESS_REG_IMM:
8856         cost += addr_cost->imm_offset;
8857         break;
8858
8859       case ADDRESS_REG_WB:
8860         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8861           cost += addr_cost->pre_modify;
8862         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8863           cost += addr_cost->post_modify;
8864         else
8865           gcc_unreachable ();
8866
8867         break;
8868
8869       case ADDRESS_REG_REG:
8870         cost += addr_cost->register_offset;
8871         break;
8872
8873       case ADDRESS_REG_SXTW:
8874         cost += addr_cost->register_sextend;
8875         break;
8876
8877       case ADDRESS_REG_UXTW:
8878         cost += addr_cost->register_zextend;
8879         break;
8880
8881       default:
8882         gcc_unreachable ();
8883     }
8884
8885
8886   if (info.shift > 0)
8887     {
8888       /* For the sake of calculating the cost of the shifted register
8889          component, we can treat same sized modes in the same way.  */
8890       if (known_eq (GET_MODE_BITSIZE (mode), 16))
8891         cost += addr_cost->addr_scale_costs.hi;
8892       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8893         cost += addr_cost->addr_scale_costs.si;
8894       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8895         cost += addr_cost->addr_scale_costs.di;
8896       else
8897         /* We can't tell, or this is a 128-bit vector.  */
8898         cost += addr_cost->addr_scale_costs.ti;
8899     }
8900
8901   return cost;
8902 }
8903
8904 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
8905    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
8906    to be taken.  */
8907
8908 int
8909 aarch64_branch_cost (bool speed_p, bool predictable_p)
8910 {
8911   /* When optimizing for speed, use the cost of unpredictable branches.  */
8912   const struct cpu_branch_cost *branch_costs =
8913     aarch64_tune_params.branch_costs;
8914
8915   if (!speed_p || predictable_p)
8916     return branch_costs->predictable;
8917   else
8918     return branch_costs->unpredictable;
8919 }
8920
8921 /* Return true if the RTX X in mode MODE is a zero or sign extract
8922    usable in an ADD or SUB (extended register) instruction.  */
8923 static bool
8924 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8925 {
8926   /* Catch add with a sign extract.
8927      This is add_<optab><mode>_multp2.  */
8928   if (GET_CODE (x) == SIGN_EXTRACT
8929       || GET_CODE (x) == ZERO_EXTRACT)
8930     {
8931       rtx op0 = XEXP (x, 0);
8932       rtx op1 = XEXP (x, 1);
8933       rtx op2 = XEXP (x, 2);
8934
8935       if (GET_CODE (op0) == MULT
8936           && CONST_INT_P (op1)
8937           && op2 == const0_rtx
8938           && CONST_INT_P (XEXP (op0, 1))
8939           && aarch64_is_extend_from_extract (mode,
8940                                              XEXP (op0, 1),
8941                                              op1))
8942         {
8943           return true;
8944         }
8945     }
8946   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8947      No shift.  */
8948   else if (GET_CODE (x) == SIGN_EXTEND
8949            || GET_CODE (x) == ZERO_EXTEND)
8950     return REG_P (XEXP (x, 0));
8951
8952   return false;
8953 }
8954
8955 static bool
8956 aarch64_frint_unspec_p (unsigned int u)
8957 {
8958   switch (u)
8959     {
8960       case UNSPEC_FRINTZ:
8961       case UNSPEC_FRINTP:
8962       case UNSPEC_FRINTM:
8963       case UNSPEC_FRINTA:
8964       case UNSPEC_FRINTN:
8965       case UNSPEC_FRINTX:
8966       case UNSPEC_FRINTI:
8967         return true;
8968
8969       default:
8970         return false;
8971     }
8972 }
8973
8974 /* Return true iff X is an rtx that will match an extr instruction
8975    i.e. as described in the *extr<mode>5_insn family of patterns.
8976    OP0 and OP1 will be set to the operands of the shifts involved
8977    on success and will be NULL_RTX otherwise.  */
8978
8979 static bool
8980 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8981 {
8982   rtx op0, op1;
8983   scalar_int_mode mode;
8984   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8985     return false;
8986
8987   *res_op0 = NULL_RTX;
8988   *res_op1 = NULL_RTX;
8989
8990   if (GET_CODE (x) != IOR)
8991     return false;
8992
8993   op0 = XEXP (x, 0);
8994   op1 = XEXP (x, 1);
8995
8996   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8997       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8998     {
8999      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
9000       if (GET_CODE (op1) == ASHIFT)
9001         std::swap (op0, op1);
9002
9003       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9004         return false;
9005
9006       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9007       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9008
9009       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9010           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9011         {
9012           *res_op0 = XEXP (op0, 0);
9013           *res_op1 = XEXP (op1, 0);
9014           return true;
9015         }
9016     }
9017
9018   return false;
9019 }
9020
9021 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9022    storing it in *COST.  Result is true if the total cost of the operation
9023    has now been calculated.  */
9024 static bool
9025 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9026 {
9027   rtx inner;
9028   rtx comparator;
9029   enum rtx_code cmpcode;
9030
9031   if (COMPARISON_P (op0))
9032     {
9033       inner = XEXP (op0, 0);
9034       comparator = XEXP (op0, 1);
9035       cmpcode = GET_CODE (op0);
9036     }
9037   else
9038     {
9039       inner = op0;
9040       comparator = const0_rtx;
9041       cmpcode = NE;
9042     }
9043
9044   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9045     {
9046       /* Conditional branch.  */
9047       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9048         return true;
9049       else
9050         {
9051           if (cmpcode == NE || cmpcode == EQ)
9052             {
9053               if (comparator == const0_rtx)
9054                 {
9055                   /* TBZ/TBNZ/CBZ/CBNZ.  */
9056                   if (GET_CODE (inner) == ZERO_EXTRACT)
9057                     /* TBZ/TBNZ.  */
9058                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9059                                        ZERO_EXTRACT, 0, speed);
9060                   else
9061                     /* CBZ/CBNZ.  */
9062                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9063
9064                 return true;
9065               }
9066             }
9067           else if (cmpcode == LT || cmpcode == GE)
9068             {
9069               /* TBZ/TBNZ.  */
9070               if (comparator == const0_rtx)
9071                 return true;
9072             }
9073         }
9074     }
9075   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9076     {
9077       /* CCMP.  */
9078       if (GET_CODE (op1) == COMPARE)
9079         {
9080           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
9081           if (XEXP (op1, 1) == const0_rtx)
9082             *cost += 1;
9083           if (speed)
9084             {
9085               machine_mode mode = GET_MODE (XEXP (op1, 0));
9086               const struct cpu_cost_table *extra_cost
9087                 = aarch64_tune_params.insn_extra_cost;
9088
9089               if (GET_MODE_CLASS (mode) == MODE_INT)
9090                 *cost += extra_cost->alu.arith;
9091               else
9092                 *cost += extra_cost->fp[mode == DFmode].compare;
9093             }
9094           return true;
9095         }
9096
9097       /* It's a conditional operation based on the status flags,
9098          so it must be some flavor of CSEL.  */
9099
9100       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
9101       if (GET_CODE (op1) == NEG
9102           || GET_CODE (op1) == NOT
9103           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9104         op1 = XEXP (op1, 0);
9105       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9106         {
9107           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
9108           op1 = XEXP (op1, 0);
9109           op2 = XEXP (op2, 0);
9110         }
9111
9112       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9113       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9114       return true;
9115     }
9116
9117   /* We don't know what this is, cost all operands.  */
9118   return false;
9119 }
9120
9121 /* Check whether X is a bitfield operation of the form shift + extend that
9122    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
9123    operand to which the bitfield operation is applied.  Otherwise return
9124    NULL_RTX.  */
9125
9126 static rtx
9127 aarch64_extend_bitfield_pattern_p (rtx x)
9128 {
9129   rtx_code outer_code = GET_CODE (x);
9130   machine_mode outer_mode = GET_MODE (x);
9131
9132   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9133       && outer_mode != SImode && outer_mode != DImode)
9134     return NULL_RTX;
9135
9136   rtx inner = XEXP (x, 0);
9137   rtx_code inner_code = GET_CODE (inner);
9138   machine_mode inner_mode = GET_MODE (inner);
9139   rtx op = NULL_RTX;
9140
9141   switch (inner_code)
9142     {
9143       case ASHIFT:
9144         if (CONST_INT_P (XEXP (inner, 1))
9145             && (inner_mode == QImode || inner_mode == HImode))
9146           op = XEXP (inner, 0);
9147         break;
9148       case LSHIFTRT:
9149         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9150             && (inner_mode == QImode || inner_mode == HImode))
9151           op = XEXP (inner, 0);
9152         break;
9153       case ASHIFTRT:
9154         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9155             && (inner_mode == QImode || inner_mode == HImode))
9156           op = XEXP (inner, 0);
9157         break;
9158       default:
9159         break;
9160     }
9161
9162   return op;
9163 }
9164
9165 /* Return true if the mask and a shift amount from an RTX of the form
9166    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9167    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
9168
9169 bool
9170 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9171                                     rtx shft_amnt)
9172 {
9173   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9174          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9175          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9176          && (INTVAL (mask)
9177              & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9178 }
9179
9180 /* Calculate the cost of calculating X, storing it in *COST.  Result
9181    is true if the total cost of the operation has now been calculated.  */
9182 static bool
9183 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9184                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9185 {
9186   rtx op0, op1, op2;
9187   const struct cpu_cost_table *extra_cost
9188     = aarch64_tune_params.insn_extra_cost;
9189   int code = GET_CODE (x);
9190   scalar_int_mode int_mode;
9191
9192   /* By default, assume that everything has equivalent cost to the
9193      cheapest instruction.  Any additional costs are applied as a delta
9194      above this default.  */
9195   *cost = COSTS_N_INSNS (1);
9196
9197   switch (code)
9198     {
9199     case SET:
9200       /* The cost depends entirely on the operands to SET.  */
9201       *cost = 0;
9202       op0 = SET_DEST (x);
9203       op1 = SET_SRC (x);
9204
9205       switch (GET_CODE (op0))
9206         {
9207         case MEM:
9208           if (speed)
9209             {
9210               rtx address = XEXP (op0, 0);
9211               if (VECTOR_MODE_P (mode))
9212                 *cost += extra_cost->ldst.storev;
9213               else if (GET_MODE_CLASS (mode) == MODE_INT)
9214                 *cost += extra_cost->ldst.store;
9215               else if (mode == SFmode)
9216                 *cost += extra_cost->ldst.storef;
9217               else if (mode == DFmode)
9218                 *cost += extra_cost->ldst.stored;
9219
9220               *cost +=
9221                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9222                                                      0, speed));
9223             }
9224
9225           *cost += rtx_cost (op1, mode, SET, 1, speed);
9226           return true;
9227
9228         case SUBREG:
9229           if (! REG_P (SUBREG_REG (op0)))
9230             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9231
9232           /* Fall through.  */
9233         case REG:
9234           /* The cost is one per vector-register copied.  */
9235           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9236             {
9237               int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9238               *cost = COSTS_N_INSNS (nregs);
9239             }
9240           /* const0_rtx is in general free, but we will use an
9241              instruction to set a register to 0.  */
9242           else if (REG_P (op1) || op1 == const0_rtx)
9243             {
9244               /* The cost is 1 per register copied.  */
9245               int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9246               *cost = COSTS_N_INSNS (nregs);
9247             }
9248           else
9249             /* Cost is just the cost of the RHS of the set.  */
9250             *cost += rtx_cost (op1, mode, SET, 1, speed);
9251           return true;
9252
9253         case ZERO_EXTRACT:
9254         case SIGN_EXTRACT:
9255           /* Bit-field insertion.  Strip any redundant widening of
9256              the RHS to meet the width of the target.  */
9257           if (GET_CODE (op1) == SUBREG)
9258             op1 = SUBREG_REG (op1);
9259           if ((GET_CODE (op1) == ZERO_EXTEND
9260                || GET_CODE (op1) == SIGN_EXTEND)
9261               && CONST_INT_P (XEXP (op0, 1))
9262               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9263               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9264             op1 = XEXP (op1, 0);
9265
9266           if (CONST_INT_P (op1))
9267             {
9268               /* MOV immediate is assumed to always be cheap.  */
9269               *cost = COSTS_N_INSNS (1);
9270             }
9271           else
9272             {
9273               /* BFM.  */
9274               if (speed)
9275                 *cost += extra_cost->alu.bfi;
9276               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9277             }
9278
9279           return true;
9280
9281         default:
9282           /* We can't make sense of this, assume default cost.  */
9283           *cost = COSTS_N_INSNS (1);
9284           return false;
9285         }
9286       return false;
9287
9288     case CONST_INT:
9289       /* If an instruction can incorporate a constant within the
9290          instruction, the instruction's expression avoids calling
9291          rtx_cost() on the constant.  If rtx_cost() is called on a
9292          constant, then it is usually because the constant must be
9293          moved into a register by one or more instructions.
9294
9295          The exception is constant 0, which can be expressed
9296          as XZR/WZR and is therefore free.  The exception to this is
9297          if we have (set (reg) (const0_rtx)) in which case we must cost
9298          the move.  However, we can catch that when we cost the SET, so
9299          we don't need to consider that here.  */
9300       if (x == const0_rtx)
9301         *cost = 0;
9302       else
9303         {
9304           /* To an approximation, building any other constant is
9305              proportionally expensive to the number of instructions
9306              required to build that constant.  This is true whether we
9307              are compiling for SPEED or otherwise.  */
9308           if (!is_a <scalar_int_mode> (mode, &int_mode))
9309             int_mode = word_mode;
9310           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9311                                  (NULL_RTX, x, false, int_mode));
9312         }
9313       return true;
9314
9315     case CONST_DOUBLE:
9316
9317       /* First determine number of instructions to do the move
9318           as an integer constant.  */
9319       if (!aarch64_float_const_representable_p (x)
9320            && !aarch64_can_const_movi_rtx_p (x, mode)
9321            && aarch64_float_const_rtx_p (x))
9322         {
9323           unsigned HOST_WIDE_INT ival;
9324           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9325           gcc_assert (succeed);
9326
9327           scalar_int_mode imode = (mode == HFmode
9328                                    ? SImode
9329                                    : int_mode_for_mode (mode).require ());
9330           int ncost = aarch64_internal_mov_immediate
9331                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9332           *cost += COSTS_N_INSNS (ncost);
9333           return true;
9334         }
9335
9336       if (speed)
9337         {
9338           /* mov[df,sf]_aarch64.  */
9339           if (aarch64_float_const_representable_p (x))
9340             /* FMOV (scalar immediate).  */
9341             *cost += extra_cost->fp[mode == DFmode].fpconst;
9342           else if (!aarch64_float_const_zero_rtx_p (x))
9343             {
9344               /* This will be a load from memory.  */
9345               if (mode == DFmode)
9346                 *cost += extra_cost->ldst.loadd;
9347               else
9348                 *cost += extra_cost->ldst.loadf;
9349             }
9350           else
9351             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
9352                or MOV v0.s[0], wzr - neither of which are modeled by the
9353                cost tables.  Just use the default cost.  */
9354             {
9355             }
9356         }
9357
9358       return true;
9359
9360     case MEM:
9361       if (speed)
9362         {
9363           /* For loads we want the base cost of a load, plus an
9364              approximation for the additional cost of the addressing
9365              mode.  */
9366           rtx address = XEXP (x, 0);
9367           if (VECTOR_MODE_P (mode))
9368             *cost += extra_cost->ldst.loadv;
9369           else if (GET_MODE_CLASS (mode) == MODE_INT)
9370             *cost += extra_cost->ldst.load;
9371           else if (mode == SFmode)
9372             *cost += extra_cost->ldst.loadf;
9373           else if (mode == DFmode)
9374             *cost += extra_cost->ldst.loadd;
9375
9376           *cost +=
9377                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9378                                                      0, speed));
9379         }
9380
9381       return true;
9382
9383     case NEG:
9384       op0 = XEXP (x, 0);
9385
9386       if (VECTOR_MODE_P (mode))
9387         {
9388           if (speed)
9389             {
9390               /* FNEG.  */
9391               *cost += extra_cost->vect.alu;
9392             }
9393           return false;
9394         }
9395
9396       if (GET_MODE_CLASS (mode) == MODE_INT)
9397         {
9398           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9399               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9400             {
9401               /* CSETM.  */
9402               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9403               return true;
9404             }
9405
9406           /* Cost this as SUB wzr, X.  */
9407           op0 = CONST0_RTX (mode);
9408           op1 = XEXP (x, 0);
9409           goto cost_minus;
9410         }
9411
9412       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9413         {
9414           /* Support (neg(fma...)) as a single instruction only if
9415              sign of zeros is unimportant.  This matches the decision
9416              making in aarch64.md.  */
9417           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9418             {
9419               /* FNMADD.  */
9420               *cost = rtx_cost (op0, mode, NEG, 0, speed);
9421               return true;
9422             }
9423           if (GET_CODE (op0) == MULT)
9424             {
9425               /* FNMUL.  */
9426               *cost = rtx_cost (op0, mode, NEG, 0, speed);
9427               return true;
9428             }
9429           if (speed)
9430             /* FNEG.  */
9431             *cost += extra_cost->fp[mode == DFmode].neg;
9432           return false;
9433         }
9434
9435       return false;
9436
9437     case CLRSB:
9438     case CLZ:
9439       if (speed)
9440         {
9441           if (VECTOR_MODE_P (mode))
9442             *cost += extra_cost->vect.alu;
9443           else
9444             *cost += extra_cost->alu.clz;
9445         }
9446
9447       return false;
9448
9449     case COMPARE:
9450       op0 = XEXP (x, 0);
9451       op1 = XEXP (x, 1);
9452
9453       if (op1 == const0_rtx
9454           && GET_CODE (op0) == AND)
9455         {
9456           x = op0;
9457           mode = GET_MODE (op0);
9458           goto cost_logic;
9459         }
9460
9461       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9462         {
9463           /* TODO: A write to the CC flags possibly costs extra, this
9464              needs encoding in the cost tables.  */
9465
9466           mode = GET_MODE (op0);
9467           /* ANDS.  */
9468           if (GET_CODE (op0) == AND)
9469             {
9470               x = op0;
9471               goto cost_logic;
9472             }
9473
9474           if (GET_CODE (op0) == PLUS)
9475             {
9476               /* ADDS (and CMN alias).  */
9477               x = op0;
9478               goto cost_plus;
9479             }
9480
9481           if (GET_CODE (op0) == MINUS)
9482             {
9483               /* SUBS.  */
9484               x = op0;
9485               goto cost_minus;
9486             }
9487
9488           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9489               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9490               && CONST_INT_P (XEXP (op0, 2)))
9491             {
9492               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9493                  Handle it here directly rather than going to cost_logic
9494                  since we know the immediate generated for the TST is valid
9495                  so we can avoid creating an intermediate rtx for it only
9496                  for costing purposes.  */
9497               if (speed)
9498                 *cost += extra_cost->alu.logical;
9499
9500               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9501                                  ZERO_EXTRACT, 0, speed);
9502               return true;
9503             }
9504
9505           if (GET_CODE (op1) == NEG)
9506             {
9507               /* CMN.  */
9508               if (speed)
9509                 *cost += extra_cost->alu.arith;
9510
9511               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9512               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9513               return true;
9514             }
9515
9516           /* CMP.
9517
9518              Compare can freely swap the order of operands, and
9519              canonicalization puts the more complex operation first.
9520              But the integer MINUS logic expects the shift/extend
9521              operation in op1.  */
9522           if (! (REG_P (op0)
9523                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9524           {
9525             op0 = XEXP (x, 1);
9526             op1 = XEXP (x, 0);
9527           }
9528           goto cost_minus;
9529         }
9530
9531       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9532         {
9533           /* FCMP.  */
9534           if (speed)
9535             *cost += extra_cost->fp[mode == DFmode].compare;
9536
9537           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9538             {
9539               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9540               /* FCMP supports constant 0.0 for no extra cost. */
9541               return true;
9542             }
9543           return false;
9544         }
9545
9546       if (VECTOR_MODE_P (mode))
9547         {
9548           /* Vector compare.  */
9549           if (speed)
9550             *cost += extra_cost->vect.alu;
9551
9552           if (aarch64_float_const_zero_rtx_p (op1))
9553             {
9554               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9555                  cost.  */
9556               return true;
9557             }
9558           return false;
9559         }
9560       return false;
9561
9562     case MINUS:
9563       {
9564         op0 = XEXP (x, 0);
9565         op1 = XEXP (x, 1);
9566
9567 cost_minus:
9568         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9569
9570         /* Detect valid immediates.  */
9571         if ((GET_MODE_CLASS (mode) == MODE_INT
9572              || (GET_MODE_CLASS (mode) == MODE_CC
9573                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9574             && CONST_INT_P (op1)
9575             && aarch64_uimm12_shift (INTVAL (op1)))
9576           {
9577             if (speed)
9578               /* SUB(S) (immediate).  */
9579               *cost += extra_cost->alu.arith;
9580             return true;
9581           }
9582
9583         /* Look for SUB (extended register).  */
9584         if (is_a <scalar_int_mode> (mode, &int_mode)
9585             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9586           {
9587             if (speed)
9588               *cost += extra_cost->alu.extend_arith;
9589
9590             op1 = aarch64_strip_extend (op1, true);
9591             *cost += rtx_cost (op1, VOIDmode,
9592                                (enum rtx_code) GET_CODE (op1), 0, speed);
9593             return true;
9594           }
9595
9596         rtx new_op1 = aarch64_strip_extend (op1, false);
9597
9598         /* Cost this as an FMA-alike operation.  */
9599         if ((GET_CODE (new_op1) == MULT
9600              || aarch64_shift_p (GET_CODE (new_op1)))
9601             && code != COMPARE)
9602           {
9603             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9604                                             (enum rtx_code) code,
9605                                             speed);
9606             return true;
9607           }
9608
9609         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9610
9611         if (speed)
9612           {
9613             if (VECTOR_MODE_P (mode))
9614               {
9615                 /* Vector SUB.  */
9616                 *cost += extra_cost->vect.alu;
9617               }
9618             else if (GET_MODE_CLASS (mode) == MODE_INT)
9619               {
9620                 /* SUB(S).  */
9621                 *cost += extra_cost->alu.arith;
9622               }
9623             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9624               {
9625                 /* FSUB.  */
9626                 *cost += extra_cost->fp[mode == DFmode].addsub;
9627               }
9628           }
9629         return true;
9630       }
9631
9632     case PLUS:
9633       {
9634         rtx new_op0;
9635
9636         op0 = XEXP (x, 0);
9637         op1 = XEXP (x, 1);
9638
9639 cost_plus:
9640         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9641             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9642           {
9643             /* CSINC.  */
9644             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9645             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9646             return true;
9647           }
9648
9649         if (GET_MODE_CLASS (mode) == MODE_INT
9650             && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9651                 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9652           {
9653             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9654
9655             if (speed)
9656               /* ADD (immediate).  */
9657               *cost += extra_cost->alu.arith;
9658             return true;
9659           }
9660
9661         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9662
9663         /* Look for ADD (extended register).  */
9664         if (is_a <scalar_int_mode> (mode, &int_mode)
9665             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9666           {
9667             if (speed)
9668               *cost += extra_cost->alu.extend_arith;
9669
9670             op0 = aarch64_strip_extend (op0, true);
9671             *cost += rtx_cost (op0, VOIDmode,
9672                                (enum rtx_code) GET_CODE (op0), 0, speed);
9673             return true;
9674           }
9675
9676         /* Strip any extend, leave shifts behind as we will
9677            cost them through mult_cost.  */
9678         new_op0 = aarch64_strip_extend (op0, false);
9679
9680         if (GET_CODE (new_op0) == MULT
9681             || aarch64_shift_p (GET_CODE (new_op0)))
9682           {
9683             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9684                                             speed);
9685             return true;
9686           }
9687
9688         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9689
9690         if (speed)
9691           {
9692             if (VECTOR_MODE_P (mode))
9693               {
9694                 /* Vector ADD.  */
9695                 *cost += extra_cost->vect.alu;
9696               }
9697             else if (GET_MODE_CLASS (mode) == MODE_INT)
9698               {
9699                 /* ADD.  */
9700                 *cost += extra_cost->alu.arith;
9701               }
9702             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9703               {
9704                 /* FADD.  */
9705                 *cost += extra_cost->fp[mode == DFmode].addsub;
9706               }
9707           }
9708         return true;
9709       }
9710
9711     case BSWAP:
9712       *cost = COSTS_N_INSNS (1);
9713
9714       if (speed)
9715         {
9716           if (VECTOR_MODE_P (mode))
9717             *cost += extra_cost->vect.alu;
9718           else
9719             *cost += extra_cost->alu.rev;
9720         }
9721       return false;
9722
9723     case IOR:
9724       if (aarch_rev16_p (x))
9725         {
9726           *cost = COSTS_N_INSNS (1);
9727
9728           if (speed)
9729             {
9730               if (VECTOR_MODE_P (mode))
9731                 *cost += extra_cost->vect.alu;
9732               else
9733                 *cost += extra_cost->alu.rev;
9734             }
9735           return true;
9736         }
9737
9738       if (aarch64_extr_rtx_p (x, &op0, &op1))
9739         {
9740           *cost += rtx_cost (op0, mode, IOR, 0, speed);
9741           *cost += rtx_cost (op1, mode, IOR, 1, speed);
9742           if (speed)
9743             *cost += extra_cost->alu.shift;
9744
9745           return true;
9746         }
9747     /* Fall through.  */
9748     case XOR:
9749     case AND:
9750     cost_logic:
9751       op0 = XEXP (x, 0);
9752       op1 = XEXP (x, 1);
9753
9754       if (VECTOR_MODE_P (mode))
9755         {
9756           if (speed)
9757             *cost += extra_cost->vect.alu;
9758           return true;
9759         }
9760
9761       if (code == AND
9762           && GET_CODE (op0) == MULT
9763           && CONST_INT_P (XEXP (op0, 1))
9764           && CONST_INT_P (op1)
9765           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9766                                INTVAL (op1)) != 0)
9767         {
9768           /* This is a UBFM/SBFM.  */
9769           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9770           if (speed)
9771             *cost += extra_cost->alu.bfx;
9772           return true;
9773         }
9774
9775       if (is_int_mode (mode, &int_mode))
9776         {
9777           if (CONST_INT_P (op1))
9778             {
9779               /* We have a mask + shift version of a UBFIZ
9780                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
9781               if (GET_CODE (op0) == ASHIFT
9782                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9783                                                          XEXP (op0, 1)))
9784                 {
9785                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
9786                                      (enum rtx_code) code, 0, speed);
9787                   if (speed)
9788                     *cost += extra_cost->alu.bfx;
9789
9790                   return true;
9791                 }
9792               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9793                 {
9794                 /* We possibly get the immediate for free, this is not
9795                    modelled.  */
9796                   *cost += rtx_cost (op0, int_mode,
9797                                      (enum rtx_code) code, 0, speed);
9798                   if (speed)
9799                     *cost += extra_cost->alu.logical;
9800
9801                   return true;
9802                 }
9803             }
9804           else
9805             {
9806               rtx new_op0 = op0;
9807
9808               /* Handle ORN, EON, or BIC.  */
9809               if (GET_CODE (op0) == NOT)
9810                 op0 = XEXP (op0, 0);
9811
9812               new_op0 = aarch64_strip_shift (op0);
9813
9814               /* If we had a shift on op0 then this is a logical-shift-
9815                  by-register/immediate operation.  Otherwise, this is just
9816                  a logical operation.  */
9817               if (speed)
9818                 {
9819                   if (new_op0 != op0)
9820                     {
9821                       /* Shift by immediate.  */
9822                       if (CONST_INT_P (XEXP (op0, 1)))
9823                         *cost += extra_cost->alu.log_shift;
9824                       else
9825                         *cost += extra_cost->alu.log_shift_reg;
9826                     }
9827                   else
9828                     *cost += extra_cost->alu.logical;
9829                 }
9830
9831               /* In both cases we want to cost both operands.  */
9832               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9833                                  0, speed);
9834               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9835                                  1, speed);
9836
9837               return true;
9838             }
9839         }
9840       return false;
9841
9842     case NOT:
9843       x = XEXP (x, 0);
9844       op0 = aarch64_strip_shift (x);
9845
9846       if (VECTOR_MODE_P (mode))
9847         {
9848           /* Vector NOT.  */
9849           *cost += extra_cost->vect.alu;
9850           return false;
9851         }
9852
9853       /* MVN-shifted-reg.  */
9854       if (op0 != x)
9855         {
9856           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9857
9858           if (speed)
9859             *cost += extra_cost->alu.log_shift;
9860
9861           return true;
9862         }
9863       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9864          Handle the second form here taking care that 'a' in the above can
9865          be a shift.  */
9866       else if (GET_CODE (op0) == XOR)
9867         {
9868           rtx newop0 = XEXP (op0, 0);
9869           rtx newop1 = XEXP (op0, 1);
9870           rtx op0_stripped = aarch64_strip_shift (newop0);
9871
9872           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9873           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9874
9875           if (speed)
9876             {
9877               if (op0_stripped != newop0)
9878                 *cost += extra_cost->alu.log_shift;
9879               else
9880                 *cost += extra_cost->alu.logical;
9881             }
9882
9883           return true;
9884         }
9885       /* MVN.  */
9886       if (speed)
9887         *cost += extra_cost->alu.logical;
9888
9889       return false;
9890
9891     case ZERO_EXTEND:
9892
9893       op0 = XEXP (x, 0);
9894       /* If a value is written in SI mode, then zero extended to DI
9895          mode, the operation will in general be free as a write to
9896          a 'w' register implicitly zeroes the upper bits of an 'x'
9897          register.  However, if this is
9898
9899            (set (reg) (zero_extend (reg)))
9900
9901          we must cost the explicit register move.  */
9902       if (mode == DImode
9903           && GET_MODE (op0) == SImode
9904           && outer == SET)
9905         {
9906           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9907
9908         /* If OP_COST is non-zero, then the cost of the zero extend
9909            is effectively the cost of the inner operation.  Otherwise
9910            we have a MOV instruction and we take the cost from the MOV
9911            itself.  This is true independently of whether we are
9912            optimizing for space or time.  */
9913           if (op_cost)
9914             *cost = op_cost;
9915
9916           return true;
9917         }
9918       else if (MEM_P (op0))
9919         {
9920           /* All loads can zero extend to any size for free.  */
9921           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9922           return true;
9923         }
9924
9925       op0 = aarch64_extend_bitfield_pattern_p (x);
9926       if (op0)
9927         {
9928           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9929           if (speed)
9930             *cost += extra_cost->alu.bfx;
9931           return true;
9932         }
9933
9934       if (speed)
9935         {
9936           if (VECTOR_MODE_P (mode))
9937             {
9938               /* UMOV.  */
9939               *cost += extra_cost->vect.alu;
9940             }
9941           else
9942             {
9943               /* We generate an AND instead of UXTB/UXTH.  */
9944               *cost += extra_cost->alu.logical;
9945             }
9946         }
9947       return false;
9948
9949     case SIGN_EXTEND:
9950       if (MEM_P (XEXP (x, 0)))
9951         {
9952           /* LDRSH.  */
9953           if (speed)
9954             {
9955               rtx address = XEXP (XEXP (x, 0), 0);
9956               *cost += extra_cost->ldst.load_sign_extend;
9957
9958               *cost +=
9959                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9960                                                      0, speed));
9961             }
9962           return true;
9963         }
9964
9965       op0 = aarch64_extend_bitfield_pattern_p (x);
9966       if (op0)
9967         {
9968           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9969           if (speed)
9970             *cost += extra_cost->alu.bfx;
9971           return true;
9972         }
9973
9974       if (speed)
9975         {
9976           if (VECTOR_MODE_P (mode))
9977             *cost += extra_cost->vect.alu;
9978           else
9979             *cost += extra_cost->alu.extend;
9980         }
9981       return false;
9982
9983     case ASHIFT:
9984       op0 = XEXP (x, 0);
9985       op1 = XEXP (x, 1);
9986
9987       if (CONST_INT_P (op1))
9988         {
9989           if (speed)
9990             {
9991               if (VECTOR_MODE_P (mode))
9992                 {
9993                   /* Vector shift (immediate).  */
9994                   *cost += extra_cost->vect.alu;
9995                 }
9996               else
9997                 {
9998                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
9999                      aliases.  */
10000                   *cost += extra_cost->alu.shift;
10001                 }
10002             }
10003
10004           /* We can incorporate zero/sign extend for free.  */
10005           if (GET_CODE (op0) == ZERO_EXTEND
10006               || GET_CODE (op0) == SIGN_EXTEND)
10007             op0 = XEXP (op0, 0);
10008
10009           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10010           return true;
10011         }
10012       else
10013         {
10014           if (VECTOR_MODE_P (mode))
10015             {
10016               if (speed)
10017                 /* Vector shift (register).  */
10018                 *cost += extra_cost->vect.alu;
10019             }
10020           else
10021             {
10022               if (speed)
10023                 /* LSLV.  */
10024                 *cost += extra_cost->alu.shift_reg;
10025
10026               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10027                   && CONST_INT_P (XEXP (op1, 1))
10028                   && known_eq (INTVAL (XEXP (op1, 1)),
10029                                GET_MODE_BITSIZE (mode) - 1))
10030                 {
10031                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10032                   /* We already demanded XEXP (op1, 0) to be REG_P, so
10033                      don't recurse into it.  */
10034                   return true;
10035                 }
10036             }
10037           return false;  /* All arguments need to be in registers.  */
10038         }
10039
10040     case ROTATE:
10041     case ROTATERT:
10042     case LSHIFTRT:
10043     case ASHIFTRT:
10044       op0 = XEXP (x, 0);
10045       op1 = XEXP (x, 1);
10046
10047       if (CONST_INT_P (op1))
10048         {
10049           /* ASR (immediate) and friends.  */
10050           if (speed)
10051             {
10052               if (VECTOR_MODE_P (mode))
10053                 *cost += extra_cost->vect.alu;
10054               else
10055                 *cost += extra_cost->alu.shift;
10056             }
10057
10058           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10059           return true;
10060         }
10061       else
10062         {
10063           if (VECTOR_MODE_P (mode))
10064             {
10065               if (speed)
10066                 /* Vector shift (register).  */
10067                 *cost += extra_cost->vect.alu;
10068             }
10069           else
10070             {
10071               if (speed)
10072                 /* ASR (register) and friends.  */
10073                 *cost += extra_cost->alu.shift_reg;
10074
10075               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10076                   && CONST_INT_P (XEXP (op1, 1))
10077                   && known_eq (INTVAL (XEXP (op1, 1)),
10078                                GET_MODE_BITSIZE (mode) - 1))
10079                 {
10080                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10081                   /* We already demanded XEXP (op1, 0) to be REG_P, so
10082                      don't recurse into it.  */
10083                   return true;
10084                 }
10085             }
10086           return false;  /* All arguments need to be in registers.  */
10087         }
10088
10089     case SYMBOL_REF:
10090
10091       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10092           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10093         {
10094           /* LDR.  */
10095           if (speed)
10096             *cost += extra_cost->ldst.load;
10097         }
10098       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10099                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10100         {
10101           /* ADRP, followed by ADD.  */
10102           *cost += COSTS_N_INSNS (1);
10103           if (speed)
10104             *cost += 2 * extra_cost->alu.arith;
10105         }
10106       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10107                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10108         {
10109           /* ADR.  */
10110           if (speed)
10111             *cost += extra_cost->alu.arith;
10112         }
10113
10114       if (flag_pic)
10115         {
10116           /* One extra load instruction, after accessing the GOT.  */
10117           *cost += COSTS_N_INSNS (1);
10118           if (speed)
10119             *cost += extra_cost->ldst.load;
10120         }
10121       return true;
10122
10123     case HIGH:
10124     case LO_SUM:
10125       /* ADRP/ADD (immediate).  */
10126       if (speed)
10127         *cost += extra_cost->alu.arith;
10128       return true;
10129
10130     case ZERO_EXTRACT:
10131     case SIGN_EXTRACT:
10132       /* UBFX/SBFX.  */
10133       if (speed)
10134         {
10135           if (VECTOR_MODE_P (mode))
10136             *cost += extra_cost->vect.alu;
10137           else
10138             *cost += extra_cost->alu.bfx;
10139         }
10140
10141       /* We can trust that the immediates used will be correct (there
10142          are no by-register forms), so we need only cost op0.  */
10143       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10144       return true;
10145
10146     case MULT:
10147       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10148       /* aarch64_rtx_mult_cost always handles recursion to its
10149          operands.  */
10150       return true;
10151
10152     case MOD:
10153     /* We can expand signed mod by power of 2 using a NEGS, two parallel
10154        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
10155        an unconditional negate.  This case should only ever be reached through
10156        the set_smod_pow2_cheap check in expmed.c.  */
10157       if (CONST_INT_P (XEXP (x, 1))
10158           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10159           && (mode == SImode || mode == DImode))
10160         {
10161           /* We expand to 4 instructions.  Reset the baseline.  */
10162           *cost = COSTS_N_INSNS (4);
10163
10164           if (speed)
10165             *cost += 2 * extra_cost->alu.logical
10166                      + 2 * extra_cost->alu.arith;
10167
10168           return true;
10169         }
10170
10171     /* Fall-through.  */
10172     case UMOD:
10173       if (speed)
10174         {
10175           /* Slighly prefer UMOD over SMOD.  */
10176           if (VECTOR_MODE_P (mode))
10177             *cost += extra_cost->vect.alu;
10178           else if (GET_MODE_CLASS (mode) == MODE_INT)
10179             *cost += (extra_cost->mult[mode == DImode].add
10180                       + extra_cost->mult[mode == DImode].idiv
10181                       + (code == MOD ? 1 : 0));
10182         }
10183       return false;  /* All arguments need to be in registers.  */
10184
10185     case DIV:
10186     case UDIV:
10187     case SQRT:
10188       if (speed)
10189         {
10190           if (VECTOR_MODE_P (mode))
10191             *cost += extra_cost->vect.alu;
10192           else if (GET_MODE_CLASS (mode) == MODE_INT)
10193             /* There is no integer SQRT, so only DIV and UDIV can get
10194                here.  */
10195             *cost += (extra_cost->mult[mode == DImode].idiv
10196                      /* Slighly prefer UDIV over SDIV.  */
10197                      + (code == DIV ? 1 : 0));
10198           else
10199             *cost += extra_cost->fp[mode == DFmode].div;
10200         }
10201       return false;  /* All arguments need to be in registers.  */
10202
10203     case IF_THEN_ELSE:
10204       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10205                                          XEXP (x, 2), cost, speed);
10206
10207     case EQ:
10208     case NE:
10209     case GT:
10210     case GTU:
10211     case LT:
10212     case LTU:
10213     case GE:
10214     case GEU:
10215     case LE:
10216     case LEU:
10217
10218       return false; /* All arguments must be in registers.  */
10219
10220     case FMA:
10221       op0 = XEXP (x, 0);
10222       op1 = XEXP (x, 1);
10223       op2 = XEXP (x, 2);
10224
10225       if (speed)
10226         {
10227           if (VECTOR_MODE_P (mode))
10228             *cost += extra_cost->vect.alu;
10229           else
10230             *cost += extra_cost->fp[mode == DFmode].fma;
10231         }
10232
10233       /* FMSUB, FNMADD, and FNMSUB are free.  */
10234       if (GET_CODE (op0) == NEG)
10235         op0 = XEXP (op0, 0);
10236
10237       if (GET_CODE (op2) == NEG)
10238         op2 = XEXP (op2, 0);
10239
10240       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10241          and the by-element operand as operand 0.  */
10242       if (GET_CODE (op1) == NEG)
10243         op1 = XEXP (op1, 0);
10244
10245       /* Catch vector-by-element operations.  The by-element operand can
10246          either be (vec_duplicate (vec_select (x))) or just
10247          (vec_select (x)), depending on whether we are multiplying by
10248          a vector or a scalar.
10249
10250          Canonicalization is not very good in these cases, FMA4 will put the
10251          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
10252       if (GET_CODE (op0) == VEC_DUPLICATE)
10253         op0 = XEXP (op0, 0);
10254       else if (GET_CODE (op1) == VEC_DUPLICATE)
10255         op1 = XEXP (op1, 0);
10256
10257       if (GET_CODE (op0) == VEC_SELECT)
10258         op0 = XEXP (op0, 0);
10259       else if (GET_CODE (op1) == VEC_SELECT)
10260         op1 = XEXP (op1, 0);
10261
10262       /* If the remaining parameters are not registers,
10263          get the cost to put them into registers.  */
10264       *cost += rtx_cost (op0, mode, FMA, 0, speed);
10265       *cost += rtx_cost (op1, mode, FMA, 1, speed);
10266       *cost += rtx_cost (op2, mode, FMA, 2, speed);
10267       return true;
10268
10269     case FLOAT:
10270     case UNSIGNED_FLOAT:
10271       if (speed)
10272         *cost += extra_cost->fp[mode == DFmode].fromint;
10273       return false;
10274
10275     case FLOAT_EXTEND:
10276       if (speed)
10277         {
10278           if (VECTOR_MODE_P (mode))
10279             {
10280               /*Vector truncate.  */
10281               *cost += extra_cost->vect.alu;
10282             }
10283           else
10284             *cost += extra_cost->fp[mode == DFmode].widen;
10285         }
10286       return false;
10287
10288     case FLOAT_TRUNCATE:
10289       if (speed)
10290         {
10291           if (VECTOR_MODE_P (mode))
10292             {
10293               /*Vector conversion.  */
10294               *cost += extra_cost->vect.alu;
10295             }
10296           else
10297             *cost += extra_cost->fp[mode == DFmode].narrow;
10298         }
10299       return false;
10300
10301     case FIX:
10302     case UNSIGNED_FIX:
10303       x = XEXP (x, 0);
10304       /* Strip the rounding part.  They will all be implemented
10305          by the fcvt* family of instructions anyway.  */
10306       if (GET_CODE (x) == UNSPEC)
10307         {
10308           unsigned int uns_code = XINT (x, 1);
10309
10310           if (uns_code == UNSPEC_FRINTA
10311               || uns_code == UNSPEC_FRINTM
10312               || uns_code == UNSPEC_FRINTN
10313               || uns_code == UNSPEC_FRINTP
10314               || uns_code == UNSPEC_FRINTZ)
10315             x = XVECEXP (x, 0, 0);
10316         }
10317
10318       if (speed)
10319         {
10320           if (VECTOR_MODE_P (mode))
10321             *cost += extra_cost->vect.alu;
10322           else
10323             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10324         }
10325
10326       /* We can combine fmul by a power of 2 followed by a fcvt into a single
10327          fixed-point fcvt.  */
10328       if (GET_CODE (x) == MULT
10329           && ((VECTOR_MODE_P (mode)
10330                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10331               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10332         {
10333           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10334                              0, speed);
10335           return true;
10336         }
10337
10338       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10339       return true;
10340
10341     case ABS:
10342       if (VECTOR_MODE_P (mode))
10343         {
10344           /* ABS (vector).  */
10345           if (speed)
10346             *cost += extra_cost->vect.alu;
10347         }
10348       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10349         {
10350           op0 = XEXP (x, 0);
10351
10352           /* FABD, which is analogous to FADD.  */
10353           if (GET_CODE (op0) == MINUS)
10354             {
10355               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10356               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10357               if (speed)
10358                 *cost += extra_cost->fp[mode == DFmode].addsub;
10359
10360               return true;
10361             }
10362           /* Simple FABS is analogous to FNEG.  */
10363           if (speed)
10364             *cost += extra_cost->fp[mode == DFmode].neg;
10365         }
10366       else
10367         {
10368           /* Integer ABS will either be split to
10369              two arithmetic instructions, or will be an ABS
10370              (scalar), which we don't model.  */
10371           *cost = COSTS_N_INSNS (2);
10372           if (speed)
10373             *cost += 2 * extra_cost->alu.arith;
10374         }
10375       return false;
10376
10377     case SMAX:
10378     case SMIN:
10379       if (speed)
10380         {
10381           if (VECTOR_MODE_P (mode))
10382             *cost += extra_cost->vect.alu;
10383           else
10384             {
10385               /* FMAXNM/FMINNM/FMAX/FMIN.
10386                  TODO: This may not be accurate for all implementations, but
10387                  we do not model this in the cost tables.  */
10388               *cost += extra_cost->fp[mode == DFmode].addsub;
10389             }
10390         }
10391       return false;
10392
10393     case UNSPEC:
10394       /* The floating point round to integer frint* instructions.  */
10395       if (aarch64_frint_unspec_p (XINT (x, 1)))
10396         {
10397           if (speed)
10398             *cost += extra_cost->fp[mode == DFmode].roundint;
10399
10400           return false;
10401         }
10402
10403       if (XINT (x, 1) == UNSPEC_RBIT)
10404         {
10405           if (speed)
10406             *cost += extra_cost->alu.rev;
10407
10408           return false;
10409         }
10410       break;
10411
10412     case TRUNCATE:
10413
10414       /* Decompose <su>muldi3_highpart.  */
10415       if (/* (truncate:DI  */
10416           mode == DImode
10417           /*   (lshiftrt:TI  */
10418           && GET_MODE (XEXP (x, 0)) == TImode
10419           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10420           /*      (mult:TI  */
10421           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10422           /*        (ANY_EXTEND:TI (reg:DI))
10423                     (ANY_EXTEND:TI (reg:DI)))  */
10424           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10425                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10426               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10427                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10428           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10429           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10430           /*     (const_int 64)  */
10431           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10432           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10433         {
10434           /* UMULH/SMULH.  */
10435           if (speed)
10436             *cost += extra_cost->mult[mode == DImode].extend;
10437           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10438                              mode, MULT, 0, speed);
10439           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10440                              mode, MULT, 1, speed);
10441           return true;
10442         }
10443
10444       /* Fall through.  */
10445     default:
10446       break;
10447     }
10448
10449   if (dump_file
10450       && flag_aarch64_verbose_cost)
10451     fprintf (dump_file,
10452       "\nFailed to cost RTX.  Assuming default cost.\n");
10453
10454   return true;
10455 }
10456
10457 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10458    calculated for X.  This cost is stored in *COST.  Returns true
10459    if the total cost of X was calculated.  */
10460 static bool
10461 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10462                    int param, int *cost, bool speed)
10463 {
10464   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10465
10466   if (dump_file
10467       && flag_aarch64_verbose_cost)
10468     {
10469       print_rtl_single (dump_file, x);
10470       fprintf (dump_file, "\n%s cost: %d (%s)\n",
10471                speed ? "Hot" : "Cold",
10472                *cost, result ? "final" : "partial");
10473     }
10474
10475   return result;
10476 }
10477
10478 static int
10479 aarch64_register_move_cost (machine_mode mode,
10480                             reg_class_t from_i, reg_class_t to_i)
10481 {
10482   enum reg_class from = (enum reg_class) from_i;
10483   enum reg_class to = (enum reg_class) to_i;
10484   const struct cpu_regmove_cost *regmove_cost
10485     = aarch64_tune_params.regmove_cost;
10486
10487   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
10488   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10489     to = GENERAL_REGS;
10490
10491   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10492     from = GENERAL_REGS;
10493
10494   /* Moving between GPR and stack cost is the same as GP2GP.  */
10495   if ((from == GENERAL_REGS && to == STACK_REG)
10496       || (to == GENERAL_REGS && from == STACK_REG))
10497     return regmove_cost->GP2GP;
10498
10499   /* To/From the stack register, we move via the gprs.  */
10500   if (to == STACK_REG || from == STACK_REG)
10501     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10502             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10503
10504   if (known_eq (GET_MODE_SIZE (mode), 16))
10505     {
10506       /* 128-bit operations on general registers require 2 instructions.  */
10507       if (from == GENERAL_REGS && to == GENERAL_REGS)
10508         return regmove_cost->GP2GP * 2;
10509       else if (from == GENERAL_REGS)
10510         return regmove_cost->GP2FP * 2;
10511       else if (to == GENERAL_REGS)
10512         return regmove_cost->FP2GP * 2;
10513
10514       /* When AdvSIMD instructions are disabled it is not possible to move
10515          a 128-bit value directly between Q registers.  This is handled in
10516          secondary reload.  A general register is used as a scratch to move
10517          the upper DI value and the lower DI value is moved directly,
10518          hence the cost is the sum of three moves. */
10519       if (! TARGET_SIMD)
10520         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10521
10522       return regmove_cost->FP2FP;
10523     }
10524
10525   if (from == GENERAL_REGS && to == GENERAL_REGS)
10526     return regmove_cost->GP2GP;
10527   else if (from == GENERAL_REGS)
10528     return regmove_cost->GP2FP;
10529   else if (to == GENERAL_REGS)
10530     return regmove_cost->FP2GP;
10531
10532   return regmove_cost->FP2FP;
10533 }
10534
10535 static int
10536 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10537                           reg_class_t rclass ATTRIBUTE_UNUSED,
10538                           bool in ATTRIBUTE_UNUSED)
10539 {
10540   return aarch64_tune_params.memmov_cost;
10541 }
10542
10543 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10544    to optimize 1.0/sqrt.  */
10545
10546 static bool
10547 use_rsqrt_p (machine_mode mode)
10548 {
10549   return (!flag_trapping_math
10550           && flag_unsafe_math_optimizations
10551           && ((aarch64_tune_params.approx_modes->recip_sqrt
10552                & AARCH64_APPROX_MODE (mode))
10553               || flag_mrecip_low_precision_sqrt));
10554 }
10555
10556 /* Function to decide when to use the approximate reciprocal square root
10557    builtin.  */
10558
10559 static tree
10560 aarch64_builtin_reciprocal (tree fndecl)
10561 {
10562   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10563
10564   if (!use_rsqrt_p (mode))
10565     return NULL_TREE;
10566   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10567 }
10568
10569 /* Emit instruction sequence to compute either the approximate square root
10570    or its approximate reciprocal, depending on the flag RECP, and return
10571    whether the sequence was emitted or not.  */
10572
10573 bool
10574 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10575 {
10576   machine_mode mode = GET_MODE (dst);
10577
10578   if (GET_MODE_INNER (mode) == HFmode)
10579     {
10580       gcc_assert (!recp);
10581       return false;
10582     }
10583
10584   if (!recp)
10585     {
10586       if (!(flag_mlow_precision_sqrt
10587             || (aarch64_tune_params.approx_modes->sqrt
10588                 & AARCH64_APPROX_MODE (mode))))
10589         return false;
10590
10591       if (flag_finite_math_only
10592           || flag_trapping_math
10593           || !flag_unsafe_math_optimizations
10594           || optimize_function_for_size_p (cfun))
10595         return false;
10596     }
10597   else
10598     /* Caller assumes we cannot fail.  */
10599     gcc_assert (use_rsqrt_p (mode));
10600
10601   machine_mode mmsk = mode_for_int_vector (mode).require ();
10602   rtx xmsk = gen_reg_rtx (mmsk);
10603   if (!recp)
10604     /* When calculating the approximate square root, compare the
10605        argument with 0.0 and create a mask.  */
10606     emit_insn (gen_rtx_SET (xmsk,
10607                             gen_rtx_NEG (mmsk,
10608                                          gen_rtx_EQ (mmsk, src,
10609                                                      CONST0_RTX (mode)))));
10610
10611   /* Estimate the approximate reciprocal square root.  */
10612   rtx xdst = gen_reg_rtx (mode);
10613   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10614
10615   /* Iterate over the series twice for SF and thrice for DF.  */
10616   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10617
10618   /* Optionally iterate over the series once less for faster performance
10619      while sacrificing the accuracy.  */
10620   if ((recp && flag_mrecip_low_precision_sqrt)
10621       || (!recp && flag_mlow_precision_sqrt))
10622     iterations--;
10623
10624   /* Iterate over the series to calculate the approximate reciprocal square
10625      root.  */
10626   rtx x1 = gen_reg_rtx (mode);
10627   while (iterations--)
10628     {
10629       rtx x2 = gen_reg_rtx (mode);
10630       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10631
10632       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10633
10634       if (iterations > 0)
10635         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10636     }
10637
10638   if (!recp)
10639     {
10640       /* Qualify the approximate reciprocal square root when the argument is
10641          0.0 by squashing the intermediary result to 0.0.  */
10642       rtx xtmp = gen_reg_rtx (mmsk);
10643       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10644                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
10645       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10646
10647       /* Calculate the approximate square root.  */
10648       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10649     }
10650
10651   /* Finalize the approximation.  */
10652   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10653
10654   return true;
10655 }
10656
10657 /* Emit the instruction sequence to compute the approximation for the division
10658    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
10659
10660 bool
10661 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10662 {
10663   machine_mode mode = GET_MODE (quo);
10664
10665   if (GET_MODE_INNER (mode) == HFmode)
10666     return false;
10667
10668   bool use_approx_division_p = (flag_mlow_precision_div
10669                                 || (aarch64_tune_params.approx_modes->division
10670                                     & AARCH64_APPROX_MODE (mode)));
10671
10672   if (!flag_finite_math_only
10673       || flag_trapping_math
10674       || !flag_unsafe_math_optimizations
10675       || optimize_function_for_size_p (cfun)
10676       || !use_approx_division_p)
10677     return false;
10678
10679   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10680     return false;
10681
10682   /* Estimate the approximate reciprocal.  */
10683   rtx xrcp = gen_reg_rtx (mode);
10684   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10685
10686   /* Iterate over the series twice for SF and thrice for DF.  */
10687   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10688
10689   /* Optionally iterate over the series once less for faster performance,
10690      while sacrificing the accuracy.  */
10691   if (flag_mlow_precision_div)
10692     iterations--;
10693
10694   /* Iterate over the series to calculate the approximate reciprocal.  */
10695   rtx xtmp = gen_reg_rtx (mode);
10696   while (iterations--)
10697     {
10698       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10699
10700       if (iterations > 0)
10701         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10702     }
10703
10704   if (num != CONST1_RTX (mode))
10705     {
10706       /* As the approximate reciprocal of DEN is already calculated, only
10707          calculate the approximate division when NUM is not 1.0.  */
10708       rtx xnum = force_reg (mode, num);
10709       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10710     }
10711
10712   /* Finalize the approximation.  */
10713   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10714   return true;
10715 }
10716
10717 /* Return the number of instructions that can be issued per cycle.  */
10718 static int
10719 aarch64_sched_issue_rate (void)
10720 {
10721   return aarch64_tune_params.issue_rate;
10722 }
10723
10724 static int
10725 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10726 {
10727   int issue_rate = aarch64_sched_issue_rate ();
10728
10729   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10730 }
10731
10732
10733 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10734    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10735    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10736
10737 static int
10738 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10739                                                     int ready_index)
10740 {
10741   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10742 }
10743
10744
10745 /* Vectorizer cost model target hooks.  */
10746
10747 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10748 static int
10749 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10750                                     tree vectype,
10751                                     int misalign ATTRIBUTE_UNUSED)
10752 {
10753   unsigned elements;
10754   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10755   bool fp = false;
10756
10757   if (vectype != NULL)
10758     fp = FLOAT_TYPE_P (vectype);
10759
10760   switch (type_of_cost)
10761     {
10762       case scalar_stmt:
10763         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10764
10765       case scalar_load:
10766         return costs->scalar_load_cost;
10767
10768       case scalar_store:
10769         return costs->scalar_store_cost;
10770
10771       case vector_stmt:
10772         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10773
10774       case vector_load:
10775         return costs->vec_align_load_cost;
10776
10777       case vector_store:
10778         return costs->vec_store_cost;
10779
10780       case vec_to_scalar:
10781         return costs->vec_to_scalar_cost;
10782
10783       case scalar_to_vec:
10784         return costs->scalar_to_vec_cost;
10785
10786       case unaligned_load:
10787       case vector_gather_load:
10788         return costs->vec_unalign_load_cost;
10789
10790       case unaligned_store:
10791       case vector_scatter_store:
10792         return costs->vec_unalign_store_cost;
10793
10794       case cond_branch_taken:
10795         return costs->cond_taken_branch_cost;
10796
10797       case cond_branch_not_taken:
10798         return costs->cond_not_taken_branch_cost;
10799
10800       case vec_perm:
10801         return costs->vec_permute_cost;
10802
10803       case vec_promote_demote:
10804         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10805
10806       case vec_construct:
10807         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10808         return elements / 2 + 1;
10809
10810       default:
10811         gcc_unreachable ();
10812     }
10813 }
10814
10815 /* Implement targetm.vectorize.add_stmt_cost.  */
10816 static unsigned
10817 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10818                        struct _stmt_vec_info *stmt_info, int misalign,
10819                        enum vect_cost_model_location where)
10820 {
10821   unsigned *cost = (unsigned *) data;
10822   unsigned retval = 0;
10823
10824   if (flag_vect_cost_model)
10825     {
10826       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10827       int stmt_cost =
10828             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10829
10830       /* Statements in an inner loop relative to the loop being
10831          vectorized are weighted more heavily.  The value here is
10832          arbitrary and could potentially be improved with analysis.  */
10833       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10834         count *= 50; /*  FIXME  */
10835
10836       retval = (unsigned) (count * stmt_cost);
10837       cost[where] += retval;
10838     }
10839
10840   return retval;
10841 }
10842
10843 static void initialize_aarch64_code_model (struct gcc_options *);
10844
10845 /* Parse the TO_PARSE string and put the architecture struct that it
10846    selects into RES and the architectural features into ISA_FLAGS.
10847    Return an aarch64_parse_opt_result describing the parse result.
10848    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10849    When the TO_PARSE string contains an invalid extension,
10850    a copy of the string is created and stored to INVALID_EXTENSION.  */
10851
10852 static enum aarch64_parse_opt_result
10853 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10854                     unsigned long *isa_flags, std::string *invalid_extension)
10855 {
10856   const char *ext;
10857   const struct processor *arch;
10858   size_t len;
10859
10860   ext = strchr (to_parse, '+');
10861
10862   if (ext != NULL)
10863     len = ext - to_parse;
10864   else
10865     len = strlen (to_parse);
10866
10867   if (len == 0)
10868     return AARCH64_PARSE_MISSING_ARG;
10869
10870
10871   /* Loop through the list of supported ARCHes to find a match.  */
10872   for (arch = all_architectures; arch->name != NULL; arch++)
10873     {
10874       if (strlen (arch->name) == len
10875           && strncmp (arch->name, to_parse, len) == 0)
10876         {
10877           unsigned long isa_temp = arch->flags;
10878
10879           if (ext != NULL)
10880             {
10881               /* TO_PARSE string contains at least one extension.  */
10882               enum aarch64_parse_opt_result ext_res
10883                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10884
10885               if (ext_res != AARCH64_PARSE_OK)
10886                 return ext_res;
10887             }
10888           /* Extension parsing was successful.  Confirm the result
10889              arch and ISA flags.  */
10890           *res = arch;
10891           *isa_flags = isa_temp;
10892           return AARCH64_PARSE_OK;
10893         }
10894     }
10895
10896   /* ARCH name not found in list.  */
10897   return AARCH64_PARSE_INVALID_ARG;
10898 }
10899
10900 /* Parse the TO_PARSE string and put the result tuning in RES and the
10901    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
10902    describing the parse result.  If there is an error parsing, RES and
10903    ISA_FLAGS are left unchanged.
10904    When the TO_PARSE string contains an invalid extension,
10905    a copy of the string is created and stored to INVALID_EXTENSION.  */
10906
10907 static enum aarch64_parse_opt_result
10908 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10909                    unsigned long *isa_flags, std::string *invalid_extension)
10910 {
10911   const char *ext;
10912   const struct processor *cpu;
10913   size_t len;
10914
10915   ext = strchr (to_parse, '+');
10916
10917   if (ext != NULL)
10918     len = ext - to_parse;
10919   else
10920     len = strlen (to_parse);
10921
10922   if (len == 0)
10923     return AARCH64_PARSE_MISSING_ARG;
10924
10925
10926   /* Loop through the list of supported CPUs to find a match.  */
10927   for (cpu = all_cores; cpu->name != NULL; cpu++)
10928     {
10929       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
10930         {
10931           unsigned long isa_temp = cpu->flags;
10932
10933
10934           if (ext != NULL)
10935             {
10936               /* TO_PARSE string contains at least one extension.  */
10937               enum aarch64_parse_opt_result ext_res
10938                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10939
10940               if (ext_res != AARCH64_PARSE_OK)
10941                 return ext_res;
10942             }
10943           /* Extension parsing was successfull.  Confirm the result
10944              cpu and ISA flags.  */
10945           *res = cpu;
10946           *isa_flags = isa_temp;
10947           return AARCH64_PARSE_OK;
10948         }
10949     }
10950
10951   /* CPU name not found in list.  */
10952   return AARCH64_PARSE_INVALID_ARG;
10953 }
10954
10955 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10956    Return an aarch64_parse_opt_result describing the parse result.
10957    If the parsing fails the RES does not change.  */
10958
10959 static enum aarch64_parse_opt_result
10960 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10961 {
10962   const struct processor *cpu;
10963
10964   /* Loop through the list of supported CPUs to find a match.  */
10965   for (cpu = all_cores; cpu->name != NULL; cpu++)
10966     {
10967       if (strcmp (cpu->name, to_parse) == 0)
10968         {
10969           *res = cpu;
10970           return AARCH64_PARSE_OK;
10971         }
10972     }
10973
10974   /* CPU name not found in list.  */
10975   return AARCH64_PARSE_INVALID_ARG;
10976 }
10977
10978 /* Parse TOKEN, which has length LENGTH to see if it is an option
10979    described in FLAG.  If it is, return the index bit for that fusion type.
10980    If not, error (printing OPTION_NAME) and return zero.  */
10981
10982 static unsigned int
10983 aarch64_parse_one_option_token (const char *token,
10984                                 size_t length,
10985                                 const struct aarch64_flag_desc *flag,
10986                                 const char *option_name)
10987 {
10988   for (; flag->name != NULL; flag++)
10989     {
10990       if (length == strlen (flag->name)
10991           && !strncmp (flag->name, token, length))
10992         return flag->flag;
10993     }
10994
10995   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10996   return 0;
10997 }
10998
10999 /* Parse OPTION which is a comma-separated list of flags to enable.
11000    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11001    default state we inherit from the CPU tuning structures.  OPTION_NAME
11002    gives the top-level option we are parsing in the -moverride string,
11003    for use in error messages.  */
11004
11005 static unsigned int
11006 aarch64_parse_boolean_options (const char *option,
11007                                const struct aarch64_flag_desc *flags,
11008                                unsigned int initial_state,
11009                                const char *option_name)
11010 {
11011   const char separator = '.';
11012   const char* specs = option;
11013   const char* ntoken = option;
11014   unsigned int found_flags = initial_state;
11015
11016   while ((ntoken = strchr (specs, separator)))
11017     {
11018       size_t token_length = ntoken - specs;
11019       unsigned token_ops = aarch64_parse_one_option_token (specs,
11020                                                            token_length,
11021                                                            flags,
11022                                                            option_name);
11023       /* If we find "none" (or, for simplicity's sake, an error) anywhere
11024          in the token stream, reset the supported operations.  So:
11025
11026            adrp+add.cmp+branch.none.adrp+add
11027
11028            would have the result of turning on only adrp+add fusion.  */
11029       if (!token_ops)
11030         found_flags = 0;
11031
11032       found_flags |= token_ops;
11033       specs = ++ntoken;
11034     }
11035
11036   /* We ended with a comma, print something.  */
11037   if (!(*specs))
11038     {
11039       error ("%s string ill-formed\n", option_name);
11040       return 0;
11041     }
11042
11043   /* We still have one more token to parse.  */
11044   size_t token_length = strlen (specs);
11045   unsigned token_ops = aarch64_parse_one_option_token (specs,
11046                                                        token_length,
11047                                                        flags,
11048                                                        option_name);
11049    if (!token_ops)
11050      found_flags = 0;
11051
11052   found_flags |= token_ops;
11053   return found_flags;
11054 }
11055
11056 /* Support for overriding instruction fusion.  */
11057
11058 static void
11059 aarch64_parse_fuse_string (const char *fuse_string,
11060                             struct tune_params *tune)
11061 {
11062   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11063                                                      aarch64_fusible_pairs,
11064                                                      tune->fusible_ops,
11065                                                      "fuse=");
11066 }
11067
11068 /* Support for overriding other tuning flags.  */
11069
11070 static void
11071 aarch64_parse_tune_string (const char *tune_string,
11072                             struct tune_params *tune)
11073 {
11074   tune->extra_tuning_flags
11075     = aarch64_parse_boolean_options (tune_string,
11076                                      aarch64_tuning_flags,
11077                                      tune->extra_tuning_flags,
11078                                      "tune=");
11079 }
11080
11081 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11082    Accept the valid SVE vector widths allowed by
11083    aarch64_sve_vector_bits_enum and use it to override sve_width
11084    in TUNE.  */
11085
11086 static void
11087 aarch64_parse_sve_width_string (const char *tune_string,
11088                                 struct tune_params *tune)
11089 {
11090   int width = -1;
11091
11092   int n = sscanf (tune_string, "%d", &width);
11093   if (n == EOF)
11094     {
11095       error ("invalid format for sve_width");
11096       return;
11097     }
11098   switch (width)
11099     {
11100     case SVE_128:
11101     case SVE_256:
11102     case SVE_512:
11103     case SVE_1024:
11104     case SVE_2048:
11105       break;
11106     default:
11107       error ("invalid sve_width value: %d", width);
11108     }
11109   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11110 }
11111
11112 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11113    we understand.  If it is, extract the option string and handoff to
11114    the appropriate function.  */
11115
11116 void
11117 aarch64_parse_one_override_token (const char* token,
11118                                   size_t length,
11119                                   struct tune_params *tune)
11120 {
11121   const struct aarch64_tuning_override_function *fn
11122     = aarch64_tuning_override_functions;
11123
11124   const char *option_part = strchr (token, '=');
11125   if (!option_part)
11126     {
11127       error ("tuning string missing in option (%s)", token);
11128       return;
11129     }
11130
11131   /* Get the length of the option name.  */
11132   length = option_part - token;
11133   /* Skip the '=' to get to the option string.  */
11134   option_part++;
11135
11136   for (; fn->name != NULL; fn++)
11137     {
11138       if (!strncmp (fn->name, token, length))
11139         {
11140           fn->parse_override (option_part, tune);
11141           return;
11142         }
11143     }
11144
11145   error ("unknown tuning option (%s)",token);
11146   return;
11147 }
11148
11149 /* A checking mechanism for the implementation of the tls size.  */
11150
11151 static void
11152 initialize_aarch64_tls_size (struct gcc_options *opts)
11153 {
11154   if (aarch64_tls_size == 0)
11155     aarch64_tls_size = 24;
11156
11157   switch (opts->x_aarch64_cmodel_var)
11158     {
11159     case AARCH64_CMODEL_TINY:
11160       /* Both the default and maximum TLS size allowed under tiny is 1M which
11161          needs two instructions to address, so we clamp the size to 24.  */
11162       if (aarch64_tls_size > 24)
11163         aarch64_tls_size = 24;
11164       break;
11165     case AARCH64_CMODEL_SMALL:
11166       /* The maximum TLS size allowed under small is 4G.  */
11167       if (aarch64_tls_size > 32)
11168         aarch64_tls_size = 32;
11169       break;
11170     case AARCH64_CMODEL_LARGE:
11171       /* The maximum TLS size allowed under large is 16E.
11172          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
11173       if (aarch64_tls_size > 48)
11174         aarch64_tls_size = 48;
11175       break;
11176     default:
11177       gcc_unreachable ();
11178     }
11179
11180   return;
11181 }
11182
11183 /* Parse STRING looking for options in the format:
11184      string     :: option:string
11185      option     :: name=substring
11186      name       :: {a-z}
11187      substring  :: defined by option.  */
11188
11189 static void
11190 aarch64_parse_override_string (const char* input_string,
11191                                struct tune_params* tune)
11192 {
11193   const char separator = ':';
11194   size_t string_length = strlen (input_string) + 1;
11195   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11196   char *string = string_root;
11197   strncpy (string, input_string, string_length);
11198   string[string_length - 1] = '\0';
11199
11200   char* ntoken = string;
11201
11202   while ((ntoken = strchr (string, separator)))
11203     {
11204       size_t token_length = ntoken - string;
11205       /* Make this substring look like a string.  */
11206       *ntoken = '\0';
11207       aarch64_parse_one_override_token (string, token_length, tune);
11208       string = ++ntoken;
11209     }
11210
11211   /* One last option to parse.  */
11212   aarch64_parse_one_override_token (string, strlen (string), tune);
11213   free (string_root);
11214 }
11215
11216
11217 static void
11218 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11219 {
11220   if (accepted_branch_protection_string)
11221     {
11222       opts->x_aarch64_branch_protection_string
11223         = xstrdup (accepted_branch_protection_string);
11224     }
11225
11226   /* PR 70044: We have to be careful about being called multiple times for the
11227      same function.  This means all changes should be repeatable.  */
11228
11229   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11230      Disable the frame pointer flag so the mid-end will not use a frame
11231      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11232      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11233      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
11234   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11235   if (opts->x_flag_omit_frame_pointer == 0)
11236     opts->x_flag_omit_frame_pointer = 2;
11237
11238   /* If not optimizing for size, set the default
11239      alignment to what the target wants.  */
11240   if (!opts->x_optimize_size)
11241     {
11242       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11243         opts->x_str_align_loops = aarch64_tune_params.loop_align;
11244       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11245         opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11246       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11247         opts->x_str_align_functions = aarch64_tune_params.function_align;
11248     }
11249
11250   /* We default to no pc-relative literal loads.  */
11251
11252   aarch64_pcrelative_literal_loads = false;
11253
11254   /* If -mpc-relative-literal-loads is set on the command line, this
11255      implies that the user asked for PC relative literal loads.  */
11256   if (opts->x_pcrelative_literal_loads == 1)
11257     aarch64_pcrelative_literal_loads = true;
11258
11259   /* In the tiny memory model it makes no sense to disallow PC relative
11260      literal pool loads.  */
11261   if (aarch64_cmodel == AARCH64_CMODEL_TINY
11262       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11263     aarch64_pcrelative_literal_loads = true;
11264
11265   /* When enabling the lower precision Newton series for the square root, also
11266      enable it for the reciprocal square root, since the latter is an
11267      intermediary step for the former.  */
11268   if (flag_mlow_precision_sqrt)
11269     flag_mrecip_low_precision_sqrt = true;
11270 }
11271
11272 /* 'Unpack' up the internal tuning structs and update the options
11273     in OPTS.  The caller must have set up selected_tune and selected_arch
11274     as all the other target-specific codegen decisions are
11275     derived from them.  */
11276
11277 void
11278 aarch64_override_options_internal (struct gcc_options *opts)
11279 {
11280   aarch64_tune_flags = selected_tune->flags;
11281   aarch64_tune = selected_tune->sched_core;
11282   /* Make a copy of the tuning parameters attached to the core, which
11283      we may later overwrite.  */
11284   aarch64_tune_params = *(selected_tune->tune);
11285   aarch64_architecture_version = selected_arch->architecture_version;
11286
11287   if (opts->x_aarch64_override_tune_string)
11288     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11289                                   &aarch64_tune_params);
11290
11291   /* This target defaults to strict volatile bitfields.  */
11292   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11293     opts->x_flag_strict_volatile_bitfields = 1;
11294
11295   initialize_aarch64_code_model (opts);
11296   initialize_aarch64_tls_size (opts);
11297
11298   int queue_depth = 0;
11299   switch (aarch64_tune_params.autoprefetcher_model)
11300     {
11301       case tune_params::AUTOPREFETCHER_OFF:
11302         queue_depth = -1;
11303         break;
11304       case tune_params::AUTOPREFETCHER_WEAK:
11305         queue_depth = 0;
11306         break;
11307       case tune_params::AUTOPREFETCHER_STRONG:
11308         queue_depth = max_insn_queue_index + 1;
11309         break;
11310       default:
11311         gcc_unreachable ();
11312     }
11313
11314   /* We don't mind passing in global_options_set here as we don't use
11315      the *options_set structs anyway.  */
11316   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11317                          queue_depth,
11318                          opts->x_param_values,
11319                          global_options_set.x_param_values);
11320
11321   /* Set up parameters to be used in prefetching algorithm.  Do not
11322      override the defaults unless we are tuning for a core we have
11323      researched values for.  */
11324   if (aarch64_tune_params.prefetch->num_slots > 0)
11325     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11326                            aarch64_tune_params.prefetch->num_slots,
11327                            opts->x_param_values,
11328                            global_options_set.x_param_values);
11329   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11330     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11331                            aarch64_tune_params.prefetch->l1_cache_size,
11332                            opts->x_param_values,
11333                            global_options_set.x_param_values);
11334   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11335     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11336                            aarch64_tune_params.prefetch->l1_cache_line_size,
11337                            opts->x_param_values,
11338                            global_options_set.x_param_values);
11339   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11340     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11341                            aarch64_tune_params.prefetch->l2_cache_size,
11342                            opts->x_param_values,
11343                            global_options_set.x_param_values);
11344   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11345     maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11346                            0,
11347                            opts->x_param_values,
11348                            global_options_set.x_param_values);
11349   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11350     maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11351                            aarch64_tune_params.prefetch->minimum_stride,
11352                            opts->x_param_values,
11353                            global_options_set.x_param_values);
11354
11355   /* Use the alternative scheduling-pressure algorithm by default.  */
11356   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11357                          opts->x_param_values,
11358                          global_options_set.x_param_values);
11359
11360   /* If the user hasn't changed it via configure then set the default to 64 KB
11361      for the backend.  */
11362   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11363                          DEFAULT_STK_CLASH_GUARD_SIZE == 0
11364                            ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11365                          opts->x_param_values,
11366                          global_options_set.x_param_values);
11367
11368   /* Validate the guard size.  */
11369   int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11370
11371   /* Enforce that interval is the same size as size so the mid-end does the
11372      right thing.  */
11373   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11374                          guard_size,
11375                          opts->x_param_values,
11376                          global_options_set.x_param_values);
11377
11378   /* The maybe_set calls won't update the value if the user has explicitly set
11379      one.  Which means we need to validate that probing interval and guard size
11380      are equal.  */
11381   int probe_interval
11382     = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11383   if (guard_size != probe_interval)
11384     error ("stack clash guard size '%d' must be equal to probing interval "
11385            "'%d'", guard_size, probe_interval);
11386
11387   /* Enable sw prefetching at specified optimization level for
11388      CPUS that have prefetch.  Lower optimization level threshold by 1
11389      when profiling is enabled.  */
11390   if (opts->x_flag_prefetch_loop_arrays < 0
11391       && !opts->x_optimize_size
11392       && aarch64_tune_params.prefetch->default_opt_level >= 0
11393       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11394     opts->x_flag_prefetch_loop_arrays = 1;
11395
11396   if (opts->x_aarch64_arch_string == NULL)
11397     opts->x_aarch64_arch_string = selected_arch->name;
11398   if (opts->x_aarch64_cpu_string == NULL)
11399     opts->x_aarch64_cpu_string = selected_cpu->name;
11400   if (opts->x_aarch64_tune_string == NULL)
11401     opts->x_aarch64_tune_string = selected_tune->name;
11402
11403   aarch64_override_options_after_change_1 (opts);
11404 }
11405
11406 /* Print a hint with a suggestion for a core or architecture name that
11407    most closely resembles what the user passed in STR.  ARCH is true if
11408    the user is asking for an architecture name.  ARCH is false if the user
11409    is asking for a core name.  */
11410
11411 static void
11412 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11413 {
11414   auto_vec<const char *> candidates;
11415   const struct processor *entry = arch ? all_architectures : all_cores;
11416   for (; entry->name != NULL; entry++)
11417     candidates.safe_push (entry->name);
11418
11419 #ifdef HAVE_LOCAL_CPU_DETECT
11420   /* Add also "native" as possible value.  */
11421   if (arch)
11422     candidates.safe_push ("native");
11423 #endif
11424
11425   char *s;
11426   const char *hint = candidates_list_and_hint (str, s, candidates);
11427   if (hint)
11428     inform (input_location, "valid arguments are: %s;"
11429                              " did you mean %qs?", s, hint);
11430   else
11431     inform (input_location, "valid arguments are: %s", s);
11432
11433   XDELETEVEC (s);
11434 }
11435
11436 /* Print a hint with a suggestion for a core name that most closely resembles
11437    what the user passed in STR.  */
11438
11439 inline static void
11440 aarch64_print_hint_for_core (const char *str)
11441 {
11442   aarch64_print_hint_for_core_or_arch (str, false);
11443 }
11444
11445 /* Print a hint with a suggestion for an architecture name that most closely
11446    resembles what the user passed in STR.  */
11447
11448 inline static void
11449 aarch64_print_hint_for_arch (const char *str)
11450 {
11451   aarch64_print_hint_for_core_or_arch (str, true);
11452 }
11453
11454
11455 /* Print a hint with a suggestion for an extension name
11456    that most closely resembles what the user passed in STR.  */
11457
11458 void
11459 aarch64_print_hint_for_extensions (const std::string &str)
11460 {
11461   auto_vec<const char *> candidates;
11462   aarch64_get_all_extension_candidates (&candidates);
11463   char *s;
11464   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11465   if (hint)
11466     inform (input_location, "valid arguments are: %s;"
11467                              " did you mean %qs?", s, hint);
11468   else
11469     inform (input_location, "valid arguments are: %s;", s);
11470
11471   XDELETEVEC (s);
11472 }
11473
11474 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
11475    specified in STR and throw errors if appropriate.  Put the results if
11476    they are valid in RES and ISA_FLAGS.  Return whether the option is
11477    valid.  */
11478
11479 static bool
11480 aarch64_validate_mcpu (const char *str, const struct processor **res,
11481                        unsigned long *isa_flags)
11482 {
11483   std::string invalid_extension;
11484   enum aarch64_parse_opt_result parse_res
11485     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11486
11487   if (parse_res == AARCH64_PARSE_OK)
11488     return true;
11489
11490   switch (parse_res)
11491     {
11492       case AARCH64_PARSE_MISSING_ARG:
11493         error ("missing cpu name in %<-mcpu=%s%>", str);
11494         break;
11495       case AARCH64_PARSE_INVALID_ARG:
11496         error ("unknown value %qs for -mcpu", str);
11497         aarch64_print_hint_for_core (str);
11498         break;
11499       case AARCH64_PARSE_INVALID_FEATURE:
11500         error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11501                invalid_extension.c_str (), str);
11502         aarch64_print_hint_for_extensions (invalid_extension);
11503         break;
11504       default:
11505         gcc_unreachable ();
11506     }
11507
11508   return false;
11509 }
11510
11511 /* Parses CONST_STR for branch protection features specified in
11512    aarch64_branch_protect_types, and set any global variables required.  Returns
11513    the parsing result and assigns LAST_STR to the last processed token from
11514    CONST_STR so that it can be used for error reporting.  */
11515
11516 static enum
11517 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11518                                                           char** last_str)
11519 {
11520   char *str_root = xstrdup (const_str);
11521   char* token_save = NULL;
11522   char *str = strtok_r (str_root, "+", &token_save);
11523   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11524   if (!str)
11525     res = AARCH64_PARSE_MISSING_ARG;
11526   else
11527     {
11528       char *next_str = strtok_r (NULL, "+", &token_save);
11529       /* Reset the branch protection features to their defaults.  */
11530       aarch64_handle_no_branch_protection (NULL, NULL);
11531
11532       while (str && res == AARCH64_PARSE_OK)
11533         {
11534           const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11535           bool found = false;
11536           /* Search for this type.  */
11537           while (type && type->name && !found && res == AARCH64_PARSE_OK)
11538             {
11539               if (strcmp (str, type->name) == 0)
11540                 {
11541                   found = true;
11542                   res = type->handler (str, next_str);
11543                   str = next_str;
11544                   next_str = strtok_r (NULL, "+", &token_save);
11545                 }
11546               else
11547                 type++;
11548             }
11549           if (found && res == AARCH64_PARSE_OK)
11550             {
11551               bool found_subtype = true;
11552               /* Loop through each token until we find one that isn't a
11553                  subtype.  */
11554               while (found_subtype)
11555                 {
11556                   found_subtype = false;
11557                   const aarch64_branch_protect_type *subtype = type->subtypes;
11558                   /* Search for the subtype.  */
11559                   while (str && subtype && subtype->name && !found_subtype
11560                           && res == AARCH64_PARSE_OK)
11561                     {
11562                       if (strcmp (str, subtype->name) == 0)
11563                         {
11564                           found_subtype = true;
11565                           res = subtype->handler (str, next_str);
11566                           str = next_str;
11567                           next_str = strtok_r (NULL, "+", &token_save);
11568                         }
11569                       else
11570                         subtype++;
11571                     }
11572                 }
11573             }
11574           else if (!found)
11575             res = AARCH64_PARSE_INVALID_ARG;
11576         }
11577     }
11578   /* Copy the last processed token into the argument to pass it back.
11579     Used by option and attribute validation to print the offending token.  */
11580   if (last_str)
11581     {
11582       if (str) strcpy (*last_str, str);
11583       else *last_str = NULL;
11584     }
11585   if (res == AARCH64_PARSE_OK)
11586     {
11587       /* If needed, alloc the accepted string then copy in const_str.
11588         Used by override_option_after_change_1.  */
11589       if (!accepted_branch_protection_string)
11590         accepted_branch_protection_string = (char *) xmalloc (
11591                                                       BRANCH_PROTECT_STR_MAX
11592                                                         + 1);
11593       strncpy (accepted_branch_protection_string, const_str,
11594                 BRANCH_PROTECT_STR_MAX + 1);
11595       /* Forcibly null-terminate.  */
11596       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11597     }
11598   return res;
11599 }
11600
11601 static bool
11602 aarch64_validate_mbranch_protection (const char *const_str)
11603 {
11604   char *str = (char *) xmalloc (strlen (const_str));
11605   enum aarch64_parse_opt_result res =
11606     aarch64_parse_branch_protection (const_str, &str);
11607   if (res == AARCH64_PARSE_INVALID_ARG)
11608     error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11609   else if (res == AARCH64_PARSE_MISSING_ARG)
11610     error ("missing arg for %<-mbranch-protection=%>");
11611   free (str);
11612   return res == AARCH64_PARSE_OK;
11613 }
11614
11615 /* Validate a command-line -march option.  Parse the arch and extensions
11616    (if any) specified in STR and throw errors if appropriate.  Put the
11617    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
11618    option is valid.  */
11619
11620 static bool
11621 aarch64_validate_march (const char *str, const struct processor **res,
11622                          unsigned long *isa_flags)
11623 {
11624   std::string invalid_extension;
11625   enum aarch64_parse_opt_result parse_res
11626     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11627
11628   if (parse_res == AARCH64_PARSE_OK)
11629     return true;
11630
11631   switch (parse_res)
11632     {
11633       case AARCH64_PARSE_MISSING_ARG:
11634         error ("missing arch name in %<-march=%s%>", str);
11635         break;
11636       case AARCH64_PARSE_INVALID_ARG:
11637         error ("unknown value %qs for -march", str);
11638         aarch64_print_hint_for_arch (str);
11639         break;
11640       case AARCH64_PARSE_INVALID_FEATURE:
11641         error ("invalid feature modifier %qs in %<-march=%s%>",
11642                invalid_extension.c_str (), str);
11643         aarch64_print_hint_for_extensions (invalid_extension);
11644         break;
11645       default:
11646         gcc_unreachable ();
11647     }
11648
11649   return false;
11650 }
11651
11652 /* Validate a command-line -mtune option.  Parse the cpu
11653    specified in STR and throw errors if appropriate.  Put the
11654    result, if it is valid, in RES.  Return whether the option is
11655    valid.  */
11656
11657 static bool
11658 aarch64_validate_mtune (const char *str, const struct processor **res)
11659 {
11660   enum aarch64_parse_opt_result parse_res
11661     = aarch64_parse_tune (str, res);
11662
11663   if (parse_res == AARCH64_PARSE_OK)
11664     return true;
11665
11666   switch (parse_res)
11667     {
11668       case AARCH64_PARSE_MISSING_ARG:
11669         error ("missing cpu name in %<-mtune=%s%>", str);
11670         break;
11671       case AARCH64_PARSE_INVALID_ARG:
11672         error ("unknown value %qs for -mtune", str);
11673         aarch64_print_hint_for_core (str);
11674         break;
11675       default:
11676         gcc_unreachable ();
11677     }
11678   return false;
11679 }
11680
11681 /* Return the CPU corresponding to the enum CPU.
11682    If it doesn't specify a cpu, return the default.  */
11683
11684 static const struct processor *
11685 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11686 {
11687   if (cpu != aarch64_none)
11688     return &all_cores[cpu];
11689
11690   /* The & 0x3f is to extract the bottom 6 bits that encode the
11691      default cpu as selected by the --with-cpu GCC configure option
11692      in config.gcc.
11693      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11694      flags mechanism should be reworked to make it more sane.  */
11695   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11696 }
11697
11698 /* Return the architecture corresponding to the enum ARCH.
11699    If it doesn't specify a valid architecture, return the default.  */
11700
11701 static const struct processor *
11702 aarch64_get_arch (enum aarch64_arch arch)
11703 {
11704   if (arch != aarch64_no_arch)
11705     return &all_architectures[arch];
11706
11707   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11708
11709   return &all_architectures[cpu->arch];
11710 }
11711
11712 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
11713
11714 static poly_uint16
11715 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11716 {
11717   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11718      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11719      deciding which .md file patterns to use and when deciding whether
11720      something is a legitimate address or constant.  */
11721   if (value == SVE_SCALABLE || value == SVE_128)
11722     return poly_uint16 (2, 2);
11723   else
11724     return (int) value / 64;
11725 }
11726
11727 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
11728    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11729    tuning structs.  In particular it must set selected_tune and
11730    aarch64_isa_flags that define the available ISA features and tuning
11731    decisions.  It must also set selected_arch as this will be used to
11732    output the .arch asm tags for each function.  */
11733
11734 static void
11735 aarch64_override_options (void)
11736 {
11737   unsigned long cpu_isa = 0;
11738   unsigned long arch_isa = 0;
11739   aarch64_isa_flags = 0;
11740
11741   bool valid_cpu = true;
11742   bool valid_tune = true;
11743   bool valid_arch = true;
11744
11745   selected_cpu = NULL;
11746   selected_arch = NULL;
11747   selected_tune = NULL;
11748
11749   if (aarch64_branch_protection_string)
11750     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11751
11752   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11753      If either of -march or -mtune is given, they override their
11754      respective component of -mcpu.  */
11755   if (aarch64_cpu_string)
11756     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11757                                         &cpu_isa);
11758
11759   if (aarch64_arch_string)
11760     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11761                                           &arch_isa);
11762
11763   if (aarch64_tune_string)
11764     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11765
11766 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11767   SUBTARGET_OVERRIDE_OPTIONS;
11768 #endif
11769
11770   /* If the user did not specify a processor, choose the default
11771      one for them.  This will be the CPU set during configuration using
11772      --with-cpu, otherwise it is "generic".  */
11773   if (!selected_cpu)
11774     {
11775       if (selected_arch)
11776         {
11777           selected_cpu = &all_cores[selected_arch->ident];
11778           aarch64_isa_flags = arch_isa;
11779           explicit_arch = selected_arch->arch;
11780         }
11781       else
11782         {
11783           /* Get default configure-time CPU.  */
11784           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11785           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11786         }
11787
11788       if (selected_tune)
11789         explicit_tune_core = selected_tune->ident;
11790     }
11791   /* If both -mcpu and -march are specified check that they are architecturally
11792      compatible, warn if they're not and prefer the -march ISA flags.  */
11793   else if (selected_arch)
11794     {
11795       if (selected_arch->arch != selected_cpu->arch)
11796         {
11797           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11798                        all_architectures[selected_cpu->arch].name,
11799                        selected_arch->name);
11800         }
11801       aarch64_isa_flags = arch_isa;
11802       explicit_arch = selected_arch->arch;
11803       explicit_tune_core = selected_tune ? selected_tune->ident
11804                                           : selected_cpu->ident;
11805     }
11806   else
11807     {
11808       /* -mcpu but no -march.  */
11809       aarch64_isa_flags = cpu_isa;
11810       explicit_tune_core = selected_tune ? selected_tune->ident
11811                                           : selected_cpu->ident;
11812       gcc_assert (selected_cpu);
11813       selected_arch = &all_architectures[selected_cpu->arch];
11814       explicit_arch = selected_arch->arch;
11815     }
11816
11817   /* Set the arch as well as we will need it when outputing
11818      the .arch directive in assembly.  */
11819   if (!selected_arch)
11820     {
11821       gcc_assert (selected_cpu);
11822       selected_arch = &all_architectures[selected_cpu->arch];
11823     }
11824
11825   if (!selected_tune)
11826     selected_tune = selected_cpu;
11827
11828   if (aarch64_enable_bti == 2)
11829     {
11830 #ifdef TARGET_ENABLE_BTI
11831       aarch64_enable_bti = 1;
11832 #else
11833       aarch64_enable_bti = 0;
11834 #endif
11835     }
11836
11837   /* Return address signing is currently not supported for ILP32 targets.  For
11838      LP64 targets use the configured option in the absence of a command-line
11839      option for -mbranch-protection.  */
11840   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
11841     {
11842 #ifdef TARGET_ENABLE_PAC_RET
11843       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
11844       aarch64_ra_sign_key = AARCH64_KEY_A;
11845 #else
11846       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
11847 #endif
11848     }
11849
11850 #ifndef HAVE_AS_MABI_OPTION
11851   /* The compiler may have been configured with 2.23.* binutils, which does
11852      not have support for ILP32.  */
11853   if (TARGET_ILP32)
11854     error ("assembler does not support -mabi=ilp32");
11855 #endif
11856
11857   /* Convert -msve-vector-bits to a VG count.  */
11858   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11859
11860   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11861     sorry ("return address signing is only supported for -mabi=lp64");
11862
11863   /* Make sure we properly set up the explicit options.  */
11864   if ((aarch64_cpu_string && valid_cpu)
11865        || (aarch64_tune_string && valid_tune))
11866     gcc_assert (explicit_tune_core != aarch64_none);
11867
11868   if ((aarch64_cpu_string && valid_cpu)
11869        || (aarch64_arch_string && valid_arch))
11870     gcc_assert (explicit_arch != aarch64_no_arch);
11871
11872   /* The pass to insert speculation tracking runs before
11873      shrink-wrapping and the latter does not know how to update the
11874      tracking status.  So disable it in this case.  */
11875   if (aarch64_track_speculation)
11876     flag_shrink_wrap = 0;
11877
11878   aarch64_override_options_internal (&global_options);
11879
11880   /* Save these options as the default ones in case we push and pop them later
11881      while processing functions with potential target attributes.  */
11882   target_option_default_node = target_option_current_node
11883       = build_target_option_node (&global_options);
11884 }
11885
11886 /* Implement targetm.override_options_after_change.  */
11887
11888 static void
11889 aarch64_override_options_after_change (void)
11890 {
11891   aarch64_override_options_after_change_1 (&global_options);
11892 }
11893
11894 static struct machine_function *
11895 aarch64_init_machine_status (void)
11896 {
11897   struct machine_function *machine;
11898   machine = ggc_cleared_alloc<machine_function> ();
11899   return machine;
11900 }
11901
11902 void
11903 aarch64_init_expanders (void)
11904 {
11905   init_machine_status = aarch64_init_machine_status;
11906 }
11907
11908 /* A checking mechanism for the implementation of the various code models.  */
11909 static void
11910 initialize_aarch64_code_model (struct gcc_options *opts)
11911 {
11912    if (opts->x_flag_pic)
11913      {
11914        switch (opts->x_aarch64_cmodel_var)
11915          {
11916          case AARCH64_CMODEL_TINY:
11917            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11918            break;
11919          case AARCH64_CMODEL_SMALL:
11920 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11921            aarch64_cmodel = (flag_pic == 2
11922                              ? AARCH64_CMODEL_SMALL_PIC
11923                              : AARCH64_CMODEL_SMALL_SPIC);
11924 #else
11925            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11926 #endif
11927            break;
11928          case AARCH64_CMODEL_LARGE:
11929            sorry ("code model %qs with -f%s", "large",
11930                   opts->x_flag_pic > 1 ? "PIC" : "pic");
11931            break;
11932          default:
11933            gcc_unreachable ();
11934          }
11935      }
11936    else
11937      aarch64_cmodel = opts->x_aarch64_cmodel_var;
11938 }
11939
11940 /* Implement TARGET_OPTION_SAVE.  */
11941
11942 static void
11943 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11944 {
11945   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11946   ptr->x_aarch64_branch_protection_string
11947     = opts->x_aarch64_branch_protection_string;
11948 }
11949
11950 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
11951    using the information saved in PTR.  */
11952
11953 static void
11954 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11955 {
11956   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11957   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11958   opts->x_explicit_arch = ptr->x_explicit_arch;
11959   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11960   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11961   opts->x_aarch64_branch_protection_string
11962     = ptr->x_aarch64_branch_protection_string;
11963   if (opts->x_aarch64_branch_protection_string)
11964     {
11965       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
11966                                         NULL);
11967     }
11968
11969   aarch64_override_options_internal (opts);
11970 }
11971
11972 /* Implement TARGET_OPTION_PRINT.  */
11973
11974 static void
11975 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11976 {
11977   const struct processor *cpu
11978     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11979   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11980   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11981   std::string extension
11982     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11983
11984   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11985   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11986            arch->name, extension.c_str ());
11987 }
11988
11989 static GTY(()) tree aarch64_previous_fndecl;
11990
11991 void
11992 aarch64_reset_previous_fndecl (void)
11993 {
11994   aarch64_previous_fndecl = NULL;
11995 }
11996
11997 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11998    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11999    make sure optab availability predicates are recomputed when necessary.  */
12000
12001 void
12002 aarch64_save_restore_target_globals (tree new_tree)
12003 {
12004   if (TREE_TARGET_GLOBALS (new_tree))
12005     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12006   else if (new_tree == target_option_default_node)
12007     restore_target_globals (&default_target_globals);
12008   else
12009     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12010 }
12011
12012 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
12013    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12014    of the function, if such exists.  This function may be called multiple
12015    times on a single function so use aarch64_previous_fndecl to avoid
12016    setting up identical state.  */
12017
12018 static void
12019 aarch64_set_current_function (tree fndecl)
12020 {
12021   if (!fndecl || fndecl == aarch64_previous_fndecl)
12022     return;
12023
12024   tree old_tree = (aarch64_previous_fndecl
12025                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12026                    : NULL_TREE);
12027
12028   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12029
12030   /* If current function has no attributes but the previous one did,
12031      use the default node.  */
12032   if (!new_tree && old_tree)
12033     new_tree = target_option_default_node;
12034
12035   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
12036      the default have been handled by aarch64_save_restore_target_globals from
12037      aarch64_pragma_target_parse.  */
12038   if (old_tree == new_tree)
12039     return;
12040
12041   aarch64_previous_fndecl = fndecl;
12042
12043   /* First set the target options.  */
12044   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12045
12046   aarch64_save_restore_target_globals (new_tree);
12047 }
12048
12049 /* Enum describing the various ways we can handle attributes.
12050    In many cases we can reuse the generic option handling machinery.  */
12051
12052 enum aarch64_attr_opt_type
12053 {
12054   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
12055   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
12056   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
12057   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
12058 };
12059
12060 /* All the information needed to handle a target attribute.
12061    NAME is the name of the attribute.
12062    ATTR_TYPE specifies the type of behavior of the attribute as described
12063    in the definition of enum aarch64_attr_opt_type.
12064    ALLOW_NEG is true if the attribute supports a "no-" form.
12065    HANDLER is the function that takes the attribute string as an argument
12066    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12067    OPT_NUM is the enum specifying the option that the attribute modifies.
12068    This is needed for attributes that mirror the behavior of a command-line
12069    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12070    aarch64_attr_enum.  */
12071
12072 struct aarch64_attribute_info
12073 {
12074   const char *name;
12075   enum aarch64_attr_opt_type attr_type;
12076   bool allow_neg;
12077   bool (*handler) (const char *);
12078   enum opt_code opt_num;
12079 };
12080
12081 /* Handle the ARCH_STR argument to the arch= target attribute.  */
12082
12083 static bool
12084 aarch64_handle_attr_arch (const char *str)
12085 {
12086   const struct processor *tmp_arch = NULL;
12087   std::string invalid_extension;
12088   enum aarch64_parse_opt_result parse_res
12089     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12090
12091   if (parse_res == AARCH64_PARSE_OK)
12092     {
12093       gcc_assert (tmp_arch);
12094       selected_arch = tmp_arch;
12095       explicit_arch = selected_arch->arch;
12096       return true;
12097     }
12098
12099   switch (parse_res)
12100     {
12101       case AARCH64_PARSE_MISSING_ARG:
12102         error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12103         break;
12104       case AARCH64_PARSE_INVALID_ARG:
12105         error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12106         aarch64_print_hint_for_arch (str);
12107         break;
12108       case AARCH64_PARSE_INVALID_FEATURE:
12109         error ("invalid feature modifier %s of value (\"%s\") in "
12110                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12111         aarch64_print_hint_for_extensions (invalid_extension);
12112         break;
12113       default:
12114         gcc_unreachable ();
12115     }
12116
12117   return false;
12118 }
12119
12120 /* Handle the argument CPU_STR to the cpu= target attribute.  */
12121
12122 static bool
12123 aarch64_handle_attr_cpu (const char *str)
12124 {
12125   const struct processor *tmp_cpu = NULL;
12126   std::string invalid_extension;
12127   enum aarch64_parse_opt_result parse_res
12128     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12129
12130   if (parse_res == AARCH64_PARSE_OK)
12131     {
12132       gcc_assert (tmp_cpu);
12133       selected_tune = tmp_cpu;
12134       explicit_tune_core = selected_tune->ident;
12135
12136       selected_arch = &all_architectures[tmp_cpu->arch];
12137       explicit_arch = selected_arch->arch;
12138       return true;
12139     }
12140
12141   switch (parse_res)
12142     {
12143       case AARCH64_PARSE_MISSING_ARG:
12144         error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12145         break;
12146       case AARCH64_PARSE_INVALID_ARG:
12147         error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12148         aarch64_print_hint_for_core (str);
12149         break;
12150       case AARCH64_PARSE_INVALID_FEATURE:
12151         error ("invalid feature modifier %s of value (\"%s\") in "
12152                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12153         aarch64_print_hint_for_extensions (invalid_extension);
12154         break;
12155       default:
12156         gcc_unreachable ();
12157     }
12158
12159   return false;
12160 }
12161
12162 /* Handle the argument STR to the branch-protection= attribute.  */
12163
12164  static bool
12165  aarch64_handle_attr_branch_protection (const char* str)
12166  {
12167   char *err_str = (char *) xmalloc (strlen (str));
12168   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12169                                                                       &err_str);
12170   bool success = false;
12171   switch (res)
12172     {
12173      case AARCH64_PARSE_MISSING_ARG:
12174        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12175               " attribute");
12176        break;
12177      case AARCH64_PARSE_INVALID_ARG:
12178        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12179               "=\")%> pragma or attribute", err_str);
12180        break;
12181      case AARCH64_PARSE_OK:
12182        success = true;
12183       /* Fall through.  */
12184      case AARCH64_PARSE_INVALID_FEATURE:
12185        break;
12186      default:
12187        gcc_unreachable ();
12188     }
12189   free (err_str);
12190   return success;
12191  }
12192
12193 /* Handle the argument STR to the tune= target attribute.  */
12194
12195 static bool
12196 aarch64_handle_attr_tune (const char *str)
12197 {
12198   const struct processor *tmp_tune = NULL;
12199   enum aarch64_parse_opt_result parse_res
12200     = aarch64_parse_tune (str, &tmp_tune);
12201
12202   if (parse_res == AARCH64_PARSE_OK)
12203     {
12204       gcc_assert (tmp_tune);
12205       selected_tune = tmp_tune;
12206       explicit_tune_core = selected_tune->ident;
12207       return true;
12208     }
12209
12210   switch (parse_res)
12211     {
12212       case AARCH64_PARSE_INVALID_ARG:
12213         error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12214         aarch64_print_hint_for_core (str);
12215         break;
12216       default:
12217         gcc_unreachable ();
12218     }
12219
12220   return false;
12221 }
12222
12223 /* Parse an architecture extensions target attribute string specified in STR.
12224    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
12225    if successful.  Update aarch64_isa_flags to reflect the ISA features
12226    modified.  */
12227
12228 static bool
12229 aarch64_handle_attr_isa_flags (char *str)
12230 {
12231   enum aarch64_parse_opt_result parse_res;
12232   unsigned long isa_flags = aarch64_isa_flags;
12233
12234   /* We allow "+nothing" in the beginning to clear out all architectural
12235      features if the user wants to handpick specific features.  */
12236   if (strncmp ("+nothing", str, 8) == 0)
12237     {
12238       isa_flags = 0;
12239       str += 8;
12240     }
12241
12242   std::string invalid_extension;
12243   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12244
12245   if (parse_res == AARCH64_PARSE_OK)
12246     {
12247       aarch64_isa_flags = isa_flags;
12248       return true;
12249     }
12250
12251   switch (parse_res)
12252     {
12253       case AARCH64_PARSE_MISSING_ARG:
12254         error ("missing value in %<target()%> pragma or attribute");
12255         break;
12256
12257       case AARCH64_PARSE_INVALID_FEATURE:
12258         error ("invalid feature modifier %s of value (\"%s\") in "
12259                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12260         break;
12261
12262       default:
12263         gcc_unreachable ();
12264     }
12265
12266  return false;
12267 }
12268
12269 /* The target attributes that we support.  On top of these we also support just
12270    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
12271    handled explicitly in aarch64_process_one_target_attr.  */
12272
12273 static const struct aarch64_attribute_info aarch64_attributes[] =
12274 {
12275   { "general-regs-only", aarch64_attr_mask, false, NULL,
12276      OPT_mgeneral_regs_only },
12277   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12278      OPT_mfix_cortex_a53_835769 },
12279   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12280      OPT_mfix_cortex_a53_843419 },
12281   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12282   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12283   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12284      OPT_momit_leaf_frame_pointer },
12285   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12286   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12287      OPT_march_ },
12288   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12289   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12290      OPT_mtune_ },
12291   { "branch-protection", aarch64_attr_custom, false,
12292      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12293   { "sign-return-address", aarch64_attr_enum, false, NULL,
12294      OPT_msign_return_address_ },
12295   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12296 };
12297
12298 /* Parse ARG_STR which contains the definition of one target attribute.
12299    Show appropriate errors if any or return true if the attribute is valid.  */
12300
12301 static bool
12302 aarch64_process_one_target_attr (char *arg_str)
12303 {
12304   bool invert = false;
12305
12306   size_t len = strlen (arg_str);
12307
12308   if (len == 0)
12309     {
12310       error ("malformed %<target()%> pragma or attribute");
12311       return false;
12312     }
12313
12314   char *str_to_check = (char *) alloca (len + 1);
12315   strcpy (str_to_check, arg_str);
12316
12317   /* Skip leading whitespace.  */
12318   while (*str_to_check == ' ' || *str_to_check == '\t')
12319     str_to_check++;
12320
12321   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12322      It is easier to detect and handle it explicitly here rather than going
12323      through the machinery for the rest of the target attributes in this
12324      function.  */
12325   if (*str_to_check == '+')
12326     return aarch64_handle_attr_isa_flags (str_to_check);
12327
12328   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12329     {
12330       invert = true;
12331       str_to_check += 3;
12332     }
12333   char *arg = strchr (str_to_check, '=');
12334
12335   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12336      and point ARG to "foo".  */
12337   if (arg)
12338     {
12339       *arg = '\0';
12340       arg++;
12341     }
12342   const struct aarch64_attribute_info *p_attr;
12343   bool found = false;
12344   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12345     {
12346       /* If the names don't match up, or the user has given an argument
12347          to an attribute that doesn't accept one, or didn't give an argument
12348          to an attribute that expects one, fail to match.  */
12349       if (strcmp (str_to_check, p_attr->name) != 0)
12350         continue;
12351
12352       found = true;
12353       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12354                               || p_attr->attr_type == aarch64_attr_enum;
12355
12356       if (attr_need_arg_p ^ (arg != NULL))
12357         {
12358           error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12359           return false;
12360         }
12361
12362       /* If the name matches but the attribute does not allow "no-" versions
12363          then we can't match.  */
12364       if (invert && !p_attr->allow_neg)
12365         {
12366           error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12367           return false;
12368         }
12369
12370       switch (p_attr->attr_type)
12371         {
12372         /* Has a custom handler registered.
12373            For example, cpu=, arch=, tune=.  */
12374           case aarch64_attr_custom:
12375             gcc_assert (p_attr->handler);
12376             if (!p_attr->handler (arg))
12377               return false;
12378             break;
12379
12380           /* Either set or unset a boolean option.  */
12381           case aarch64_attr_bool:
12382             {
12383               struct cl_decoded_option decoded;
12384
12385               generate_option (p_attr->opt_num, NULL, !invert,
12386                                CL_TARGET, &decoded);
12387               aarch64_handle_option (&global_options, &global_options_set,
12388                                       &decoded, input_location);
12389               break;
12390             }
12391           /* Set or unset a bit in the target_flags.  aarch64_handle_option
12392              should know what mask to apply given the option number.  */
12393           case aarch64_attr_mask:
12394             {
12395               struct cl_decoded_option decoded;
12396               /* We only need to specify the option number.
12397                  aarch64_handle_option will know which mask to apply.  */
12398               decoded.opt_index = p_attr->opt_num;
12399               decoded.value = !invert;
12400               aarch64_handle_option (&global_options, &global_options_set,
12401                                       &decoded, input_location);
12402               break;
12403             }
12404           /* Use the option setting machinery to set an option to an enum.  */
12405           case aarch64_attr_enum:
12406             {
12407               gcc_assert (arg);
12408               bool valid;
12409               int value;
12410               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12411                                               &value, CL_TARGET);
12412               if (valid)
12413                 {
12414                   set_option (&global_options, NULL, p_attr->opt_num, value,
12415                               NULL, DK_UNSPECIFIED, input_location,
12416                               global_dc);
12417                 }
12418               else
12419                 {
12420                   error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12421                 }
12422               break;
12423             }
12424           default:
12425             gcc_unreachable ();
12426         }
12427     }
12428
12429   /* If we reached here we either have found an attribute and validated
12430      it or didn't match any.  If we matched an attribute but its arguments
12431      were malformed we will have returned false already.  */
12432   return found;
12433 }
12434
12435 /* Count how many times the character C appears in
12436    NULL-terminated string STR.  */
12437
12438 static unsigned int
12439 num_occurences_in_str (char c, char *str)
12440 {
12441   unsigned int res = 0;
12442   while (*str != '\0')
12443     {
12444       if (*str == c)
12445         res++;
12446
12447       str++;
12448     }
12449
12450   return res;
12451 }
12452
12453 /* Parse the tree in ARGS that contains the target attribute information
12454    and update the global target options space.  */
12455
12456 bool
12457 aarch64_process_target_attr (tree args)
12458 {
12459   if (TREE_CODE (args) == TREE_LIST)
12460     {
12461       do
12462         {
12463           tree head = TREE_VALUE (args);
12464           if (head)
12465             {
12466               if (!aarch64_process_target_attr (head))
12467                 return false;
12468             }
12469           args = TREE_CHAIN (args);
12470         } while (args);
12471
12472       return true;
12473     }
12474
12475   if (TREE_CODE (args) != STRING_CST)
12476     {
12477       error ("attribute %<target%> argument not a string");
12478       return false;
12479     }
12480
12481   size_t len = strlen (TREE_STRING_POINTER (args));
12482   char *str_to_check = (char *) alloca (len + 1);
12483   strcpy (str_to_check, TREE_STRING_POINTER (args));
12484
12485   if (len == 0)
12486     {
12487       error ("malformed %<target()%> pragma or attribute");
12488       return false;
12489     }
12490
12491   /* Used to catch empty spaces between commas i.e.
12492      attribute ((target ("attr1,,attr2"))).  */
12493   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12494
12495   /* Handle multiple target attributes separated by ','.  */
12496   char *token = strtok_r (str_to_check, ",", &str_to_check);
12497
12498   unsigned int num_attrs = 0;
12499   while (token)
12500     {
12501       num_attrs++;
12502       if (!aarch64_process_one_target_attr (token))
12503         {
12504           error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12505           return false;
12506         }
12507
12508       token = strtok_r (NULL, ",", &str_to_check);
12509     }
12510
12511   if (num_attrs != num_commas + 1)
12512     {
12513       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12514       return false;
12515     }
12516
12517   return true;
12518 }
12519
12520 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
12521    process attribute ((target ("..."))).  */
12522
12523 static bool
12524 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12525 {
12526   struct cl_target_option cur_target;
12527   bool ret;
12528   tree old_optimize;
12529   tree new_target, new_optimize;
12530   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12531
12532   /* If what we're processing is the current pragma string then the
12533      target option node is already stored in target_option_current_node
12534      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
12535      having to re-parse the string.  This is especially useful to keep
12536      arm_neon.h compile times down since that header contains a lot
12537      of intrinsics enclosed in pragmas.  */
12538   if (!existing_target && args == current_target_pragma)
12539     {
12540       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12541       return true;
12542     }
12543   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12544
12545   old_optimize = build_optimization_node (&global_options);
12546   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12547
12548   /* If the function changed the optimization levels as well as setting
12549      target options, start with the optimizations specified.  */
12550   if (func_optimize && func_optimize != old_optimize)
12551     cl_optimization_restore (&global_options,
12552                              TREE_OPTIMIZATION (func_optimize));
12553
12554   /* Save the current target options to restore at the end.  */
12555   cl_target_option_save (&cur_target, &global_options);
12556
12557   /* If fndecl already has some target attributes applied to it, unpack
12558      them so that we add this attribute on top of them, rather than
12559      overwriting them.  */
12560   if (existing_target)
12561     {
12562       struct cl_target_option *existing_options
12563         = TREE_TARGET_OPTION (existing_target);
12564
12565       if (existing_options)
12566         cl_target_option_restore (&global_options, existing_options);
12567     }
12568   else
12569     cl_target_option_restore (&global_options,
12570                         TREE_TARGET_OPTION (target_option_current_node));
12571
12572   ret = aarch64_process_target_attr (args);
12573
12574   /* Set up any additional state.  */
12575   if (ret)
12576     {
12577       aarch64_override_options_internal (&global_options);
12578       /* Initialize SIMD builtins if we haven't already.
12579          Set current_target_pragma to NULL for the duration so that
12580          the builtin initialization code doesn't try to tag the functions
12581          being built with the attributes specified by any current pragma, thus
12582          going into an infinite recursion.  */
12583       if (TARGET_SIMD)
12584         {
12585           tree saved_current_target_pragma = current_target_pragma;
12586           current_target_pragma = NULL;
12587           aarch64_init_simd_builtins ();
12588           current_target_pragma = saved_current_target_pragma;
12589         }
12590       new_target = build_target_option_node (&global_options);
12591     }
12592   else
12593     new_target = NULL;
12594
12595   new_optimize = build_optimization_node (&global_options);
12596
12597   if (fndecl && ret)
12598     {
12599       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12600
12601       if (old_optimize != new_optimize)
12602         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12603     }
12604
12605   cl_target_option_restore (&global_options, &cur_target);
12606
12607   if (old_optimize != new_optimize)
12608     cl_optimization_restore (&global_options,
12609                              TREE_OPTIMIZATION (old_optimize));
12610   return ret;
12611 }
12612
12613 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
12614    tri-bool options (yes, no, don't care) and the default value is
12615    DEF, determine whether to reject inlining.  */
12616
12617 static bool
12618 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12619                                      int dont_care, int def)
12620 {
12621   /* If the callee doesn't care, always allow inlining.  */
12622   if (callee == dont_care)
12623     return true;
12624
12625   /* If the caller doesn't care, always allow inlining.  */
12626   if (caller == dont_care)
12627     return true;
12628
12629   /* Otherwise, allow inlining if either the callee and caller values
12630      agree, or if the callee is using the default value.  */
12631   return (callee == caller || callee == def);
12632 }
12633
12634 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
12635    to inline CALLEE into CALLER based on target-specific info.
12636    Make sure that the caller and callee have compatible architectural
12637    features.  Then go through the other possible target attributes
12638    and see if they can block inlining.  Try not to reject always_inline
12639    callees unless they are incompatible architecturally.  */
12640
12641 static bool
12642 aarch64_can_inline_p (tree caller, tree callee)
12643 {
12644   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12645   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12646
12647   struct cl_target_option *caller_opts
12648         = TREE_TARGET_OPTION (caller_tree ? caller_tree
12649                                            : target_option_default_node);
12650
12651   struct cl_target_option *callee_opts
12652         = TREE_TARGET_OPTION (callee_tree ? callee_tree
12653                                            : target_option_default_node);
12654
12655   /* Callee's ISA flags should be a subset of the caller's.  */
12656   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12657        != callee_opts->x_aarch64_isa_flags)
12658     return false;
12659
12660   /* Allow non-strict aligned functions inlining into strict
12661      aligned ones.  */
12662   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12663        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12664       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12665            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12666     return false;
12667
12668   bool always_inline = lookup_attribute ("always_inline",
12669                                           DECL_ATTRIBUTES (callee));
12670
12671   /* If the architectural features match up and the callee is always_inline
12672      then the other attributes don't matter.  */
12673   if (always_inline)
12674     return true;
12675
12676   if (caller_opts->x_aarch64_cmodel_var
12677       != callee_opts->x_aarch64_cmodel_var)
12678     return false;
12679
12680   if (caller_opts->x_aarch64_tls_dialect
12681       != callee_opts->x_aarch64_tls_dialect)
12682     return false;
12683
12684   /* Honour explicit requests to workaround errata.  */
12685   if (!aarch64_tribools_ok_for_inlining_p (
12686           caller_opts->x_aarch64_fix_a53_err835769,
12687           callee_opts->x_aarch64_fix_a53_err835769,
12688           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12689     return false;
12690
12691   if (!aarch64_tribools_ok_for_inlining_p (
12692           caller_opts->x_aarch64_fix_a53_err843419,
12693           callee_opts->x_aarch64_fix_a53_err843419,
12694           2, TARGET_FIX_ERR_A53_843419))
12695     return false;
12696
12697   /* If the user explicitly specified -momit-leaf-frame-pointer for the
12698      caller and calle and they don't match up, reject inlining.  */
12699   if (!aarch64_tribools_ok_for_inlining_p (
12700           caller_opts->x_flag_omit_leaf_frame_pointer,
12701           callee_opts->x_flag_omit_leaf_frame_pointer,
12702           2, 1))
12703     return false;
12704
12705   /* If the callee has specific tuning overrides, respect them.  */
12706   if (callee_opts->x_aarch64_override_tune_string != NULL
12707       && caller_opts->x_aarch64_override_tune_string == NULL)
12708     return false;
12709
12710   /* If the user specified tuning override strings for the
12711      caller and callee and they don't match up, reject inlining.
12712      We just do a string compare here, we don't analyze the meaning
12713      of the string, as it would be too costly for little gain.  */
12714   if (callee_opts->x_aarch64_override_tune_string
12715       && caller_opts->x_aarch64_override_tune_string
12716       && (strcmp (callee_opts->x_aarch64_override_tune_string,
12717                   caller_opts->x_aarch64_override_tune_string) != 0))
12718     return false;
12719
12720   return true;
12721 }
12722
12723 /* Return true if SYMBOL_REF X binds locally.  */
12724
12725 static bool
12726 aarch64_symbol_binds_local_p (const_rtx x)
12727 {
12728   return (SYMBOL_REF_DECL (x)
12729           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12730           : SYMBOL_REF_LOCAL_P (x));
12731 }
12732
12733 /* Return true if SYMBOL_REF X is thread local */
12734 static bool
12735 aarch64_tls_symbol_p (rtx x)
12736 {
12737   if (! TARGET_HAVE_TLS)
12738     return false;
12739
12740   if (GET_CODE (x) != SYMBOL_REF)
12741     return false;
12742
12743   return SYMBOL_REF_TLS_MODEL (x) != 0;
12744 }
12745
12746 /* Classify a TLS symbol into one of the TLS kinds.  */
12747 enum aarch64_symbol_type
12748 aarch64_classify_tls_symbol (rtx x)
12749 {
12750   enum tls_model tls_kind = tls_symbolic_operand_type (x);
12751
12752   switch (tls_kind)
12753     {
12754     case TLS_MODEL_GLOBAL_DYNAMIC:
12755     case TLS_MODEL_LOCAL_DYNAMIC:
12756       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12757
12758     case TLS_MODEL_INITIAL_EXEC:
12759       switch (aarch64_cmodel)
12760         {
12761         case AARCH64_CMODEL_TINY:
12762         case AARCH64_CMODEL_TINY_PIC:
12763           return SYMBOL_TINY_TLSIE;
12764         default:
12765           return SYMBOL_SMALL_TLSIE;
12766         }
12767
12768     case TLS_MODEL_LOCAL_EXEC:
12769       if (aarch64_tls_size == 12)
12770         return SYMBOL_TLSLE12;
12771       else if (aarch64_tls_size == 24)
12772         return SYMBOL_TLSLE24;
12773       else if (aarch64_tls_size == 32)
12774         return SYMBOL_TLSLE32;
12775       else if (aarch64_tls_size == 48)
12776         return SYMBOL_TLSLE48;
12777       else
12778         gcc_unreachable ();
12779
12780     case TLS_MODEL_EMULATED:
12781     case TLS_MODEL_NONE:
12782       return SYMBOL_FORCE_TO_MEM;
12783
12784     default:
12785       gcc_unreachable ();
12786     }
12787 }
12788
12789 /* Return the correct method for accessing X + OFFSET, where X is either
12790    a SYMBOL_REF or LABEL_REF.  */
12791
12792 enum aarch64_symbol_type
12793 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12794 {
12795   if (GET_CODE (x) == LABEL_REF)
12796     {
12797       switch (aarch64_cmodel)
12798         {
12799         case AARCH64_CMODEL_LARGE:
12800           return SYMBOL_FORCE_TO_MEM;
12801
12802         case AARCH64_CMODEL_TINY_PIC:
12803         case AARCH64_CMODEL_TINY:
12804           return SYMBOL_TINY_ABSOLUTE;
12805
12806         case AARCH64_CMODEL_SMALL_SPIC:
12807         case AARCH64_CMODEL_SMALL_PIC:
12808         case AARCH64_CMODEL_SMALL:
12809           return SYMBOL_SMALL_ABSOLUTE;
12810
12811         default:
12812           gcc_unreachable ();
12813         }
12814     }
12815
12816   if (GET_CODE (x) == SYMBOL_REF)
12817     {
12818       if (aarch64_tls_symbol_p (x))
12819         return aarch64_classify_tls_symbol (x);
12820
12821       switch (aarch64_cmodel)
12822         {
12823         case AARCH64_CMODEL_TINY:
12824           /* When we retrieve symbol + offset address, we have to make sure
12825              the offset does not cause overflow of the final address.  But
12826              we have no way of knowing the address of symbol at compile time
12827              so we can't accurately say if the distance between the PC and
12828              symbol + offset is outside the addressible range of +/-1M in the
12829              TINY code model.  So we rely on images not being greater than
12830              1M and cap the offset at 1M and anything beyond 1M will have to
12831              be loaded using an alternative mechanism.  Furthermore if the
12832              symbol is a weak reference to something that isn't known to
12833              resolve to a symbol in this module, then force to memory.  */
12834           if ((SYMBOL_REF_WEAK (x)
12835                && !aarch64_symbol_binds_local_p (x))
12836               || !IN_RANGE (offset, -1048575, 1048575))
12837             return SYMBOL_FORCE_TO_MEM;
12838           return SYMBOL_TINY_ABSOLUTE;
12839
12840         case AARCH64_CMODEL_SMALL:
12841           /* Same reasoning as the tiny code model, but the offset cap here is
12842              4G.  */
12843           if ((SYMBOL_REF_WEAK (x)
12844                && !aarch64_symbol_binds_local_p (x))
12845               || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12846                             HOST_WIDE_INT_C (4294967264)))
12847             return SYMBOL_FORCE_TO_MEM;
12848           return SYMBOL_SMALL_ABSOLUTE;
12849
12850         case AARCH64_CMODEL_TINY_PIC:
12851           if (!aarch64_symbol_binds_local_p (x))
12852             return SYMBOL_TINY_GOT;
12853           return SYMBOL_TINY_ABSOLUTE;
12854
12855         case AARCH64_CMODEL_SMALL_SPIC:
12856         case AARCH64_CMODEL_SMALL_PIC:
12857           if (!aarch64_symbol_binds_local_p (x))
12858             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
12859                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
12860           return SYMBOL_SMALL_ABSOLUTE;
12861
12862         case AARCH64_CMODEL_LARGE:
12863           /* This is alright even in PIC code as the constant
12864              pool reference is always PC relative and within
12865              the same translation unit.  */
12866           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
12867             return SYMBOL_SMALL_ABSOLUTE;
12868           else
12869             return SYMBOL_FORCE_TO_MEM;
12870
12871         default:
12872           gcc_unreachable ();
12873         }
12874     }
12875
12876   /* By default push everything into the constant pool.  */
12877   return SYMBOL_FORCE_TO_MEM;
12878 }
12879
12880 bool
12881 aarch64_constant_address_p (rtx x)
12882 {
12883   return (CONSTANT_P (x) && memory_address_p (DImode, x));
12884 }
12885
12886 bool
12887 aarch64_legitimate_pic_operand_p (rtx x)
12888 {
12889   if (GET_CODE (x) == SYMBOL_REF
12890       || (GET_CODE (x) == CONST
12891           && GET_CODE (XEXP (x, 0)) == PLUS
12892           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
12893      return false;
12894
12895   return true;
12896 }
12897
12898 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
12899    that should be rematerialized rather than spilled.  */
12900
12901 static bool
12902 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12903 {
12904   /* Support CSE and rematerialization of common constants.  */
12905   if (CONST_INT_P (x)
12906       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12907       || GET_CODE (x) == CONST_VECTOR)
12908     return true;
12909
12910   /* Do not allow vector struct mode constants for Advanced SIMD.
12911      We could support 0 and -1 easily, but they need support in
12912      aarch64-simd.md.  */
12913   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12914   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12915     return false;
12916
12917   /* Only accept variable-length vector constants if they can be
12918      handled directly.
12919
12920      ??? It would be possible to handle rematerialization of other
12921      constants via secondary reloads.  */
12922   if (vec_flags & VEC_ANY_SVE)
12923     return aarch64_simd_valid_immediate (x, NULL);
12924
12925   if (GET_CODE (x) == HIGH)
12926     x = XEXP (x, 0);
12927
12928   /* Accept polynomial constants that can be calculated by using the
12929      destination of a move as the sole temporary.  Constants that
12930      require a second temporary cannot be rematerialized (they can't be
12931      forced to memory and also aren't legitimate constants).  */
12932   poly_int64 offset;
12933   if (poly_int_rtx_p (x, &offset))
12934     return aarch64_offset_temporaries (false, offset) <= 1;
12935
12936   /* If an offset is being added to something else, we need to allow the
12937      base to be moved into the destination register, meaning that there
12938      are no free temporaries for the offset.  */
12939   x = strip_offset (x, &offset);
12940   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12941     return false;
12942
12943   /* Do not allow const (plus (anchor_symbol, const_int)).  */
12944   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12945     return false;
12946
12947   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
12948      so spilling them is better than rematerialization.  */
12949   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12950     return true;
12951
12952   /* Label references are always constant.  */
12953   if (GET_CODE (x) == LABEL_REF)
12954     return true;
12955
12956   return false;
12957 }
12958
12959 rtx
12960 aarch64_load_tp (rtx target)
12961 {
12962   if (!target
12963       || GET_MODE (target) != Pmode
12964       || !register_operand (target, Pmode))
12965     target = gen_reg_rtx (Pmode);
12966
12967   /* Can return in any reg.  */
12968   emit_insn (gen_aarch64_load_tp_hard (target));
12969   return target;
12970 }
12971
12972 /* On AAPCS systems, this is the "struct __va_list".  */
12973 static GTY(()) tree va_list_type;
12974
12975 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12976    Return the type to use as __builtin_va_list.
12977
12978    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12979
12980    struct __va_list
12981    {
12982      void *__stack;
12983      void *__gr_top;
12984      void *__vr_top;
12985      int   __gr_offs;
12986      int   __vr_offs;
12987    };  */
12988
12989 static tree
12990 aarch64_build_builtin_va_list (void)
12991 {
12992   tree va_list_name;
12993   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12994
12995   /* Create the type.  */
12996   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12997   /* Give it the required name.  */
12998   va_list_name = build_decl (BUILTINS_LOCATION,
12999                              TYPE_DECL,
13000                              get_identifier ("__va_list"),
13001                              va_list_type);
13002   DECL_ARTIFICIAL (va_list_name) = 1;
13003   TYPE_NAME (va_list_type) = va_list_name;
13004   TYPE_STUB_DECL (va_list_type) = va_list_name;
13005
13006   /* Create the fields.  */
13007   f_stack = build_decl (BUILTINS_LOCATION,
13008                         FIELD_DECL, get_identifier ("__stack"),
13009                         ptr_type_node);
13010   f_grtop = build_decl (BUILTINS_LOCATION,
13011                         FIELD_DECL, get_identifier ("__gr_top"),
13012                         ptr_type_node);
13013   f_vrtop = build_decl (BUILTINS_LOCATION,
13014                         FIELD_DECL, get_identifier ("__vr_top"),
13015                         ptr_type_node);
13016   f_groff = build_decl (BUILTINS_LOCATION,
13017                         FIELD_DECL, get_identifier ("__gr_offs"),
13018                         integer_type_node);
13019   f_vroff = build_decl (BUILTINS_LOCATION,
13020                         FIELD_DECL, get_identifier ("__vr_offs"),
13021                         integer_type_node);
13022
13023   /* Tell tree-stdarg pass about our internal offset fields.
13024      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13025      purpose to identify whether the code is updating va_list internal
13026      offset fields through irregular way.  */
13027   va_list_gpr_counter_field = f_groff;
13028   va_list_fpr_counter_field = f_vroff;
13029
13030   DECL_ARTIFICIAL (f_stack) = 1;
13031   DECL_ARTIFICIAL (f_grtop) = 1;
13032   DECL_ARTIFICIAL (f_vrtop) = 1;
13033   DECL_ARTIFICIAL (f_groff) = 1;
13034   DECL_ARTIFICIAL (f_vroff) = 1;
13035
13036   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13037   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13038   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13039   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13040   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13041
13042   TYPE_FIELDS (va_list_type) = f_stack;
13043   DECL_CHAIN (f_stack) = f_grtop;
13044   DECL_CHAIN (f_grtop) = f_vrtop;
13045   DECL_CHAIN (f_vrtop) = f_groff;
13046   DECL_CHAIN (f_groff) = f_vroff;
13047
13048   /* Compute its layout.  */
13049   layout_type (va_list_type);
13050
13051   return va_list_type;
13052 }
13053
13054 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
13055 static void
13056 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13057 {
13058   const CUMULATIVE_ARGS *cum;
13059   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13060   tree stack, grtop, vrtop, groff, vroff;
13061   tree t;
13062   int gr_save_area_size = cfun->va_list_gpr_size;
13063   int vr_save_area_size = cfun->va_list_fpr_size;
13064   int vr_offset;
13065
13066   cum = &crtl->args.info;
13067   if (cfun->va_list_gpr_size)
13068     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13069                              cfun->va_list_gpr_size);
13070   if (cfun->va_list_fpr_size)
13071     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13072                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
13073
13074   if (!TARGET_FLOAT)
13075     {
13076       gcc_assert (cum->aapcs_nvrn == 0);
13077       vr_save_area_size = 0;
13078     }
13079
13080   f_stack = TYPE_FIELDS (va_list_type_node);
13081   f_grtop = DECL_CHAIN (f_stack);
13082   f_vrtop = DECL_CHAIN (f_grtop);
13083   f_groff = DECL_CHAIN (f_vrtop);
13084   f_vroff = DECL_CHAIN (f_groff);
13085
13086   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13087                   NULL_TREE);
13088   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13089                   NULL_TREE);
13090   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13091                   NULL_TREE);
13092   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13093                   NULL_TREE);
13094   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13095                   NULL_TREE);
13096
13097   /* Emit code to initialize STACK, which points to the next varargs stack
13098      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
13099      by named arguments.  STACK is 8-byte aligned.  */
13100   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13101   if (cum->aapcs_stack_size > 0)
13102     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13103   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13104   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13105
13106   /* Emit code to initialize GRTOP, the top of the GR save area.
13107      virtual_incoming_args_rtx should have been 16 byte aligned.  */
13108   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13109   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13110   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13111
13112   /* Emit code to initialize VRTOP, the top of the VR save area.
13113      This address is gr_save_area_bytes below GRTOP, rounded
13114      down to the next 16-byte boundary.  */
13115   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13116   vr_offset = ROUND_UP (gr_save_area_size,
13117                         STACK_BOUNDARY / BITS_PER_UNIT);
13118
13119   if (vr_offset)
13120     t = fold_build_pointer_plus_hwi (t, -vr_offset);
13121   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13122   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13123
13124   /* Emit code to initialize GROFF, the offset from GRTOP of the
13125      next GPR argument.  */
13126   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13127               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13128   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13129
13130   /* Likewise emit code to initialize VROFF, the offset from FTOP
13131      of the next VR argument.  */
13132   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13133               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13134   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13135 }
13136
13137 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
13138
13139 static tree
13140 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13141                               gimple_seq *post_p ATTRIBUTE_UNUSED)
13142 {
13143   tree addr;
13144   bool indirect_p;
13145   bool is_ha;           /* is HFA or HVA.  */
13146   bool dw_align;        /* double-word align.  */
13147   machine_mode ag_mode = VOIDmode;
13148   int nregs;
13149   machine_mode mode;
13150
13151   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13152   tree stack, f_top, f_off, off, arg, roundup, on_stack;
13153   HOST_WIDE_INT size, rsize, adjust, align;
13154   tree t, u, cond1, cond2;
13155
13156   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13157   if (indirect_p)
13158     type = build_pointer_type (type);
13159
13160   mode = TYPE_MODE (type);
13161
13162   f_stack = TYPE_FIELDS (va_list_type_node);
13163   f_grtop = DECL_CHAIN (f_stack);
13164   f_vrtop = DECL_CHAIN (f_grtop);
13165   f_groff = DECL_CHAIN (f_vrtop);
13166   f_vroff = DECL_CHAIN (f_groff);
13167
13168   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13169                   f_stack, NULL_TREE);
13170   size = int_size_in_bytes (type);
13171   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13172
13173   dw_align = false;
13174   adjust = 0;
13175   if (aarch64_vfp_is_call_or_return_candidate (mode,
13176                                                type,
13177                                                &ag_mode,
13178                                                &nregs,
13179                                                &is_ha))
13180     {
13181       /* No frontends can create types with variable-sized modes, so we
13182          shouldn't be asked to pass or return them.  */
13183       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13184
13185       /* TYPE passed in fp/simd registers.  */
13186       if (!TARGET_FLOAT)
13187         aarch64_err_no_fpadvsimd (mode);
13188
13189       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13190                       unshare_expr (valist), f_vrtop, NULL_TREE);
13191       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13192                       unshare_expr (valist), f_vroff, NULL_TREE);
13193
13194       rsize = nregs * UNITS_PER_VREG;
13195
13196       if (is_ha)
13197         {
13198           if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13199             adjust = UNITS_PER_VREG - ag_size;
13200         }
13201       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13202                && size < UNITS_PER_VREG)
13203         {
13204           adjust = UNITS_PER_VREG - size;
13205         }
13206     }
13207   else
13208     {
13209       /* TYPE passed in general registers.  */
13210       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13211                       unshare_expr (valist), f_grtop, NULL_TREE);
13212       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13213                       unshare_expr (valist), f_groff, NULL_TREE);
13214       rsize = ROUND_UP (size, UNITS_PER_WORD);
13215       nregs = rsize / UNITS_PER_WORD;
13216
13217       if (align > 8)
13218         dw_align = true;
13219
13220       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13221           && size < UNITS_PER_WORD)
13222         {
13223           adjust = UNITS_PER_WORD  - size;
13224         }
13225     }
13226
13227   /* Get a local temporary for the field value.  */
13228   off = get_initialized_tmp_var (f_off, pre_p, NULL);
13229
13230   /* Emit code to branch if off >= 0.  */
13231   t = build2 (GE_EXPR, boolean_type_node, off,
13232               build_int_cst (TREE_TYPE (off), 0));
13233   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13234
13235   if (dw_align)
13236     {
13237       /* Emit: offs = (offs + 15) & -16.  */
13238       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13239                   build_int_cst (TREE_TYPE (off), 15));
13240       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13241                   build_int_cst (TREE_TYPE (off), -16));
13242       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13243     }
13244   else
13245     roundup = NULL;
13246
13247   /* Update ap.__[g|v]r_offs  */
13248   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13249               build_int_cst (TREE_TYPE (off), rsize));
13250   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13251
13252   /* String up.  */
13253   if (roundup)
13254     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13255
13256   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
13257   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13258               build_int_cst (TREE_TYPE (f_off), 0));
13259   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13260
13261   /* String up: make sure the assignment happens before the use.  */
13262   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13263   COND_EXPR_ELSE (cond1) = t;
13264
13265   /* Prepare the trees handling the argument that is passed on the stack;
13266      the top level node will store in ON_STACK.  */
13267   arg = get_initialized_tmp_var (stack, pre_p, NULL);
13268   if (align > 8)
13269     {
13270       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
13271       t = fold_build_pointer_plus_hwi (arg, 15);
13272       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13273                   build_int_cst (TREE_TYPE (t), -16));
13274       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13275     }
13276   else
13277     roundup = NULL;
13278   /* Advance ap.__stack  */
13279   t = fold_build_pointer_plus_hwi (arg, size + 7);
13280   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13281               build_int_cst (TREE_TYPE (t), -8));
13282   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13283   /* String up roundup and advance.  */
13284   if (roundup)
13285     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13286   /* String up with arg */
13287   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13288   /* Big-endianness related address adjustment.  */
13289   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13290       && size < UNITS_PER_WORD)
13291   {
13292     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13293                 size_int (UNITS_PER_WORD - size));
13294     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13295   }
13296
13297   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13298   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13299
13300   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
13301   t = off;
13302   if (adjust)
13303     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13304                 build_int_cst (TREE_TYPE (off), adjust));
13305
13306   t = fold_convert (sizetype, t);
13307   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13308
13309   if (is_ha)
13310     {
13311       /* type ha; // treat as "struct {ftype field[n];}"
13312          ... [computing offs]
13313          for (i = 0; i <nregs; ++i, offs += 16)
13314            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13315          return ha;  */
13316       int i;
13317       tree tmp_ha, field_t, field_ptr_t;
13318
13319       /* Declare a local variable.  */
13320       tmp_ha = create_tmp_var_raw (type, "ha");
13321       gimple_add_tmp_var (tmp_ha);
13322
13323       /* Establish the base type.  */
13324       switch (ag_mode)
13325         {
13326         case E_SFmode:
13327           field_t = float_type_node;
13328           field_ptr_t = float_ptr_type_node;
13329           break;
13330         case E_DFmode:
13331           field_t = double_type_node;
13332           field_ptr_t = double_ptr_type_node;
13333           break;
13334         case E_TFmode:
13335           field_t = long_double_type_node;
13336           field_ptr_t = long_double_ptr_type_node;
13337           break;
13338         case E_HFmode:
13339           field_t = aarch64_fp16_type_node;
13340           field_ptr_t = aarch64_fp16_ptr_type_node;
13341           break;
13342         case E_V2SImode:
13343         case E_V4SImode:
13344             {
13345               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13346               field_t = build_vector_type_for_mode (innertype, ag_mode);
13347               field_ptr_t = build_pointer_type (field_t);
13348             }
13349           break;
13350         default:
13351           gcc_assert (0);
13352         }
13353
13354       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
13355       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13356       addr = t;
13357       t = fold_convert (field_ptr_t, addr);
13358       t = build2 (MODIFY_EXPR, field_t,
13359                   build1 (INDIRECT_REF, field_t, tmp_ha),
13360                   build1 (INDIRECT_REF, field_t, t));
13361
13362       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
13363       for (i = 1; i < nregs; ++i)
13364         {
13365           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13366           u = fold_convert (field_ptr_t, addr);
13367           u = build2 (MODIFY_EXPR, field_t,
13368                       build2 (MEM_REF, field_t, tmp_ha,
13369                               build_int_cst (field_ptr_t,
13370                                              (i *
13371                                               int_size_in_bytes (field_t)))),
13372                       build1 (INDIRECT_REF, field_t, u));
13373           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13374         }
13375
13376       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13377       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13378     }
13379
13380   COND_EXPR_ELSE (cond2) = t;
13381   addr = fold_convert (build_pointer_type (type), cond1);
13382   addr = build_va_arg_indirect_ref (addr);
13383
13384   if (indirect_p)
13385     addr = build_va_arg_indirect_ref (addr);
13386
13387   return addr;
13388 }
13389
13390 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
13391
13392 static void
13393 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13394                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13395                                 int no_rtl)
13396 {
13397   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13398   CUMULATIVE_ARGS local_cum;
13399   int gr_saved = cfun->va_list_gpr_size;
13400   int vr_saved = cfun->va_list_fpr_size;
13401
13402   /* The caller has advanced CUM up to, but not beyond, the last named
13403      argument.  Advance a local copy of CUM past the last "real" named
13404      argument, to find out how many registers are left over.  */
13405   local_cum = *cum;
13406   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13407
13408   /* Found out how many registers we need to save.
13409      Honor tree-stdvar analysis results.  */
13410   if (cfun->va_list_gpr_size)
13411     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13412                     cfun->va_list_gpr_size / UNITS_PER_WORD);
13413   if (cfun->va_list_fpr_size)
13414     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13415                     cfun->va_list_fpr_size / UNITS_PER_VREG);
13416
13417   if (!TARGET_FLOAT)
13418     {
13419       gcc_assert (local_cum.aapcs_nvrn == 0);
13420       vr_saved = 0;
13421     }
13422
13423   if (!no_rtl)
13424     {
13425       if (gr_saved > 0)
13426         {
13427           rtx ptr, mem;
13428
13429           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
13430           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13431                                - gr_saved * UNITS_PER_WORD);
13432           mem = gen_frame_mem (BLKmode, ptr);
13433           set_mem_alias_set (mem, get_varargs_alias_set ());
13434
13435           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13436                                mem, gr_saved);
13437         }
13438       if (vr_saved > 0)
13439         {
13440           /* We can't use move_block_from_reg, because it will use
13441              the wrong mode, storing D regs only.  */
13442           machine_mode mode = TImode;
13443           int off, i, vr_start;
13444
13445           /* Set OFF to the offset from virtual_incoming_args_rtx of
13446              the first vector register.  The VR save area lies below
13447              the GR one, and is aligned to 16 bytes.  */
13448           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13449                            STACK_BOUNDARY / BITS_PER_UNIT);
13450           off -= vr_saved * UNITS_PER_VREG;
13451
13452           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13453           for (i = 0; i < vr_saved; ++i)
13454             {
13455               rtx ptr, mem;
13456
13457               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13458               mem = gen_frame_mem (mode, ptr);
13459               set_mem_alias_set (mem, get_varargs_alias_set ());
13460               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13461               off += UNITS_PER_VREG;
13462             }
13463         }
13464     }
13465
13466   /* We don't save the size into *PRETEND_SIZE because we want to avoid
13467      any complication of having crtl->args.pretend_args_size changed.  */
13468   cfun->machine->frame.saved_varargs_size
13469     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13470                  STACK_BOUNDARY / BITS_PER_UNIT)
13471        + vr_saved * UNITS_PER_VREG);
13472 }
13473
13474 static void
13475 aarch64_conditional_register_usage (void)
13476 {
13477   int i;
13478   if (!TARGET_FLOAT)
13479     {
13480       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13481         {
13482           fixed_regs[i] = 1;
13483           call_used_regs[i] = 1;
13484         }
13485     }
13486   if (!TARGET_SVE)
13487     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13488       {
13489         fixed_regs[i] = 1;
13490         call_used_regs[i] = 1;
13491       }
13492
13493   /* When tracking speculation, we need a couple of call-clobbered registers
13494      to track the speculation state.  It would be nice to just use
13495      IP0 and IP1, but currently there are numerous places that just
13496      assume these registers are free for other uses (eg pointer
13497      authentication).  */
13498   if (aarch64_track_speculation)
13499     {
13500       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13501       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13502       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13503       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13504     }
13505 }
13506
13507 /* Walk down the type tree of TYPE counting consecutive base elements.
13508    If *MODEP is VOIDmode, then set it to the first valid floating point
13509    type.  If a non-floating point type is found, or if a floating point
13510    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13511    otherwise return the count in the sub-tree.  */
13512 static int
13513 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13514 {
13515   machine_mode mode;
13516   HOST_WIDE_INT size;
13517
13518   switch (TREE_CODE (type))
13519     {
13520     case REAL_TYPE:
13521       mode = TYPE_MODE (type);
13522       if (mode != DFmode && mode != SFmode
13523           && mode != TFmode && mode != HFmode)
13524         return -1;
13525
13526       if (*modep == VOIDmode)
13527         *modep = mode;
13528
13529       if (*modep == mode)
13530         return 1;
13531
13532       break;
13533
13534     case COMPLEX_TYPE:
13535       mode = TYPE_MODE (TREE_TYPE (type));
13536       if (mode != DFmode && mode != SFmode
13537           && mode != TFmode && mode != HFmode)
13538         return -1;
13539
13540       if (*modep == VOIDmode)
13541         *modep = mode;
13542
13543       if (*modep == mode)
13544         return 2;
13545
13546       break;
13547
13548     case VECTOR_TYPE:
13549       /* Use V2SImode and V4SImode as representatives of all 64-bit
13550          and 128-bit vector types.  */
13551       size = int_size_in_bytes (type);
13552       switch (size)
13553         {
13554         case 8:
13555           mode = V2SImode;
13556           break;
13557         case 16:
13558           mode = V4SImode;
13559           break;
13560         default:
13561           return -1;
13562         }
13563
13564       if (*modep == VOIDmode)
13565         *modep = mode;
13566
13567       /* Vector modes are considered to be opaque: two vectors are
13568          equivalent for the purposes of being homogeneous aggregates
13569          if they are the same size.  */
13570       if (*modep == mode)
13571         return 1;
13572
13573       break;
13574
13575     case ARRAY_TYPE:
13576       {
13577         int count;
13578         tree index = TYPE_DOMAIN (type);
13579
13580         /* Can't handle incomplete types nor sizes that are not
13581            fixed.  */
13582         if (!COMPLETE_TYPE_P (type)
13583             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13584           return -1;
13585
13586         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13587         if (count == -1
13588             || !index
13589             || !TYPE_MAX_VALUE (index)
13590             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13591             || !TYPE_MIN_VALUE (index)
13592             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13593             || count < 0)
13594           return -1;
13595
13596         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13597                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13598
13599         /* There must be no padding.  */
13600         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13601                       count * GET_MODE_BITSIZE (*modep)))
13602           return -1;
13603
13604         return count;
13605       }
13606
13607     case RECORD_TYPE:
13608       {
13609         int count = 0;
13610         int sub_count;
13611         tree field;
13612
13613         /* Can't handle incomplete types nor sizes that are not
13614            fixed.  */
13615         if (!COMPLETE_TYPE_P (type)
13616             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13617           return -1;
13618
13619         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13620           {
13621             if (TREE_CODE (field) != FIELD_DECL)
13622               continue;
13623
13624             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13625             if (sub_count < 0)
13626               return -1;
13627             count += sub_count;
13628           }
13629
13630         /* There must be no padding.  */
13631         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13632                       count * GET_MODE_BITSIZE (*modep)))
13633           return -1;
13634
13635         return count;
13636       }
13637
13638     case UNION_TYPE:
13639     case QUAL_UNION_TYPE:
13640       {
13641         /* These aren't very interesting except in a degenerate case.  */
13642         int count = 0;
13643         int sub_count;
13644         tree field;
13645
13646         /* Can't handle incomplete types nor sizes that are not
13647            fixed.  */
13648         if (!COMPLETE_TYPE_P (type)
13649             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13650           return -1;
13651
13652         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13653           {
13654             if (TREE_CODE (field) != FIELD_DECL)
13655               continue;
13656
13657             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13658             if (sub_count < 0)
13659               return -1;
13660             count = count > sub_count ? count : sub_count;
13661           }
13662
13663         /* There must be no padding.  */
13664         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13665                       count * GET_MODE_BITSIZE (*modep)))
13666           return -1;
13667
13668         return count;
13669       }
13670
13671     default:
13672       break;
13673     }
13674
13675   return -1;
13676 }
13677
13678 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13679    type as described in AAPCS64 \S 4.1.2.
13680
13681    See the comment above aarch64_composite_type_p for the notes on MODE.  */
13682
13683 static bool
13684 aarch64_short_vector_p (const_tree type,
13685                         machine_mode mode)
13686 {
13687   poly_int64 size = -1;
13688
13689   if (type && TREE_CODE (type) == VECTOR_TYPE)
13690     size = int_size_in_bytes (type);
13691   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13692             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13693     size = GET_MODE_SIZE (mode);
13694
13695   return known_eq (size, 8) || known_eq (size, 16);
13696 }
13697
13698 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13699    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
13700    array types.  The C99 floating-point complex types are also considered
13701    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
13702    types, which are GCC extensions and out of the scope of AAPCS64, are
13703    treated as composite types here as well.
13704
13705    Note that MODE itself is not sufficient in determining whether a type
13706    is such a composite type or not.  This is because
13707    stor-layout.c:compute_record_mode may have already changed the MODE
13708    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
13709    structure with only one field may have its MODE set to the mode of the
13710    field.  Also an integer mode whose size matches the size of the
13711    RECORD_TYPE type may be used to substitute the original mode
13712    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
13713    solely relied on.  */
13714
13715 static bool
13716 aarch64_composite_type_p (const_tree type,
13717                           machine_mode mode)
13718 {
13719   if (aarch64_short_vector_p (type, mode))
13720     return false;
13721
13722   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13723     return true;
13724
13725   if (mode == BLKmode
13726       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13727       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13728     return true;
13729
13730   return false;
13731 }
13732
13733 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13734    shall be passed or returned in simd/fp register(s) (providing these
13735    parameter passing registers are available).
13736
13737    Upon successful return, *COUNT returns the number of needed registers,
13738    *BASE_MODE returns the mode of the individual register and when IS_HAF
13739    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13740    floating-point aggregate or a homogeneous short-vector aggregate.  */
13741
13742 static bool
13743 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13744                                          const_tree type,
13745                                          machine_mode *base_mode,
13746                                          int *count,
13747                                          bool *is_ha)
13748 {
13749   machine_mode new_mode = VOIDmode;
13750   bool composite_p = aarch64_composite_type_p (type, mode);
13751
13752   if (is_ha != NULL) *is_ha = false;
13753
13754   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13755       || aarch64_short_vector_p (type, mode))
13756     {
13757       *count = 1;
13758       new_mode = mode;
13759     }
13760   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13761     {
13762       if (is_ha != NULL) *is_ha = true;
13763       *count = 2;
13764       new_mode = GET_MODE_INNER (mode);
13765     }
13766   else if (type && composite_p)
13767     {
13768       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13769
13770       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13771         {
13772           if (is_ha != NULL) *is_ha = true;
13773           *count = ag_count;
13774         }
13775       else
13776         return false;
13777     }
13778   else
13779     return false;
13780
13781   *base_mode = new_mode;
13782   return true;
13783 }
13784
13785 /* Implement TARGET_STRUCT_VALUE_RTX.  */
13786
13787 static rtx
13788 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13789                           int incoming ATTRIBUTE_UNUSED)
13790 {
13791   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13792 }
13793
13794 /* Implements target hook vector_mode_supported_p.  */
13795 static bool
13796 aarch64_vector_mode_supported_p (machine_mode mode)
13797 {
13798   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13799   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13800 }
13801
13802 /* Return appropriate SIMD container
13803    for MODE within a vector of WIDTH bits.  */
13804 static machine_mode
13805 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13806 {
13807   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13808     switch (mode)
13809       {
13810       case E_DFmode:
13811         return VNx2DFmode;
13812       case E_SFmode:
13813         return VNx4SFmode;
13814       case E_HFmode:
13815         return VNx8HFmode;
13816       case E_DImode:
13817         return VNx2DImode;
13818       case E_SImode:
13819         return VNx4SImode;
13820       case E_HImode:
13821         return VNx8HImode;
13822       case E_QImode:
13823         return VNx16QImode;
13824       default:
13825         return word_mode;
13826       }
13827
13828   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13829   if (TARGET_SIMD)
13830     {
13831       if (known_eq (width, 128))
13832         switch (mode)
13833           {
13834           case E_DFmode:
13835             return V2DFmode;
13836           case E_SFmode:
13837             return V4SFmode;
13838           case E_HFmode:
13839             return V8HFmode;
13840           case E_SImode:
13841             return V4SImode;
13842           case E_HImode:
13843             return V8HImode;
13844           case E_QImode:
13845             return V16QImode;
13846           case E_DImode:
13847             return V2DImode;
13848           default:
13849             break;
13850           }
13851       else
13852         switch (mode)
13853           {
13854           case E_SFmode:
13855             return V2SFmode;
13856           case E_HFmode:
13857             return V4HFmode;
13858           case E_SImode:
13859             return V2SImode;
13860           case E_HImode:
13861             return V4HImode;
13862           case E_QImode:
13863             return V8QImode;
13864           default:
13865             break;
13866           }
13867     }
13868   return word_mode;
13869 }
13870
13871 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
13872 static machine_mode
13873 aarch64_preferred_simd_mode (scalar_mode mode)
13874 {
13875   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
13876   return aarch64_simd_container_mode (mode, bits);
13877 }
13878
13879 /* Return a list of possible vector sizes for the vectorizer
13880    to iterate over.  */
13881 static void
13882 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
13883 {
13884   if (TARGET_SVE)
13885     sizes->safe_push (BYTES_PER_SVE_VECTOR);
13886   sizes->safe_push (16);
13887   sizes->safe_push (8);
13888 }
13889
13890 /* Implement TARGET_MANGLE_TYPE.  */
13891
13892 static const char *
13893 aarch64_mangle_type (const_tree type)
13894 {
13895   /* The AArch64 ABI documents say that "__va_list" has to be
13896      mangled as if it is in the "std" namespace.  */
13897   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
13898     return "St9__va_list";
13899
13900   /* Half-precision float.  */
13901   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
13902     return "Dh";
13903
13904   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
13905      builtin types.  */
13906   if (TYPE_NAME (type) != NULL)
13907     return aarch64_mangle_builtin_type (type);
13908
13909   /* Use the default mangling.  */
13910   return NULL;
13911 }
13912
13913 /* Find the first rtx_insn before insn that will generate an assembly
13914    instruction.  */
13915
13916 static rtx_insn *
13917 aarch64_prev_real_insn (rtx_insn *insn)
13918 {
13919   if (!insn)
13920     return NULL;
13921
13922   do
13923     {
13924       insn = prev_real_insn (insn);
13925     }
13926   while (insn && recog_memoized (insn) < 0);
13927
13928   return insn;
13929 }
13930
13931 static bool
13932 is_madd_op (enum attr_type t1)
13933 {
13934   unsigned int i;
13935   /* A number of these may be AArch32 only.  */
13936   enum attr_type mlatypes[] = {
13937     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13938     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13939     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13940   };
13941
13942   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13943     {
13944       if (t1 == mlatypes[i])
13945         return true;
13946     }
13947
13948   return false;
13949 }
13950
13951 /* Check if there is a register dependency between a load and the insn
13952    for which we hold recog_data.  */
13953
13954 static bool
13955 dep_between_memop_and_curr (rtx memop)
13956 {
13957   rtx load_reg;
13958   int opno;
13959
13960   gcc_assert (GET_CODE (memop) == SET);
13961
13962   if (!REG_P (SET_DEST (memop)))
13963     return false;
13964
13965   load_reg = SET_DEST (memop);
13966   for (opno = 1; opno < recog_data.n_operands; opno++)
13967     {
13968       rtx operand = recog_data.operand[opno];
13969       if (REG_P (operand)
13970           && reg_overlap_mentioned_p (load_reg, operand))
13971         return true;
13972
13973     }
13974   return false;
13975 }
13976
13977
13978 /* When working around the Cortex-A53 erratum 835769,
13979    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13980    instruction and has a preceding memory instruction such that a NOP
13981    should be inserted between them.  */
13982
13983 bool
13984 aarch64_madd_needs_nop (rtx_insn* insn)
13985 {
13986   enum attr_type attr_type;
13987   rtx_insn *prev;
13988   rtx body;
13989
13990   if (!TARGET_FIX_ERR_A53_835769)
13991     return false;
13992
13993   if (!INSN_P (insn) || recog_memoized (insn) < 0)
13994     return false;
13995
13996   attr_type = get_attr_type (insn);
13997   if (!is_madd_op (attr_type))
13998     return false;
13999
14000   prev = aarch64_prev_real_insn (insn);
14001   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14002      Restore recog state to INSN to avoid state corruption.  */
14003   extract_constrain_insn_cached (insn);
14004
14005   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14006     return false;
14007
14008   body = single_set (prev);
14009
14010   /* If the previous insn is a memory op and there is no dependency between
14011      it and the DImode madd, emit a NOP between them.  If body is NULL then we
14012      have a complex memory operation, probably a load/store pair.
14013      Be conservative for now and emit a NOP.  */
14014   if (GET_MODE (recog_data.operand[0]) == DImode
14015       && (!body || !dep_between_memop_and_curr (body)))
14016     return true;
14017
14018   return false;
14019
14020 }
14021
14022
14023 /* Implement FINAL_PRESCAN_INSN.  */
14024
14025 void
14026 aarch64_final_prescan_insn (rtx_insn *insn)
14027 {
14028   if (aarch64_madd_needs_nop (insn))
14029     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14030 }
14031
14032
14033 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14034    instruction.  */
14035
14036 bool
14037 aarch64_sve_index_immediate_p (rtx base_or_step)
14038 {
14039   return (CONST_INT_P (base_or_step)
14040           && IN_RANGE (INTVAL (base_or_step), -16, 15));
14041 }
14042
14043 /* Return true if X is a valid immediate for the SVE ADD and SUB
14044    instructions.  Negate X first if NEGATE_P is true.  */
14045
14046 bool
14047 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14048 {
14049   rtx elt;
14050
14051   if (!const_vec_duplicate_p (x, &elt)
14052       || !CONST_INT_P (elt))
14053     return false;
14054
14055   HOST_WIDE_INT val = INTVAL (elt);
14056   if (negate_p)
14057     val = -val;
14058   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14059
14060   if (val & 0xff)
14061     return IN_RANGE (val, 0, 0xff);
14062   return IN_RANGE (val, 0, 0xff00);
14063 }
14064
14065 /* Return true if X is a valid immediate operand for an SVE logical
14066    instruction such as AND.  */
14067
14068 bool
14069 aarch64_sve_bitmask_immediate_p (rtx x)
14070 {
14071   rtx elt;
14072
14073   return (const_vec_duplicate_p (x, &elt)
14074           && CONST_INT_P (elt)
14075           && aarch64_bitmask_imm (INTVAL (elt),
14076                                   GET_MODE_INNER (GET_MODE (x))));
14077 }
14078
14079 /* Return true if X is a valid immediate for the SVE DUP and CPY
14080    instructions.  */
14081
14082 bool
14083 aarch64_sve_dup_immediate_p (rtx x)
14084 {
14085   rtx elt;
14086
14087   if (!const_vec_duplicate_p (x, &elt)
14088       || !CONST_INT_P (elt))
14089     return false;
14090
14091   HOST_WIDE_INT val = INTVAL (elt);
14092   if (val & 0xff)
14093     return IN_RANGE (val, -0x80, 0x7f);
14094   return IN_RANGE (val, -0x8000, 0x7f00);
14095 }
14096
14097 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14098    SIGNED_P says whether the operand is signed rather than unsigned.  */
14099
14100 bool
14101 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14102 {
14103   rtx elt;
14104
14105   return (const_vec_duplicate_p (x, &elt)
14106           && CONST_INT_P (elt)
14107           && (signed_p
14108               ? IN_RANGE (INTVAL (elt), -16, 15)
14109               : IN_RANGE (INTVAL (elt), 0, 127)));
14110 }
14111
14112 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14113    instruction.  Negate X first if NEGATE_P is true.  */
14114
14115 bool
14116 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14117 {
14118   rtx elt;
14119   REAL_VALUE_TYPE r;
14120
14121   if (!const_vec_duplicate_p (x, &elt)
14122       || GET_CODE (elt) != CONST_DOUBLE)
14123     return false;
14124
14125   r = *CONST_DOUBLE_REAL_VALUE (elt);
14126
14127   if (negate_p)
14128     r = real_value_negate (&r);
14129
14130   if (real_equal (&r, &dconst1))
14131     return true;
14132   if (real_equal (&r, &dconsthalf))
14133     return true;
14134   return false;
14135 }
14136
14137 /* Return true if X is a valid immediate operand for an SVE FMUL
14138    instruction.  */
14139
14140 bool
14141 aarch64_sve_float_mul_immediate_p (rtx x)
14142 {
14143   rtx elt;
14144
14145   /* GCC will never generate a multiply with an immediate of 2, so there is no
14146      point testing for it (even though it is a valid constant).  */
14147   return (const_vec_duplicate_p (x, &elt)
14148           && GET_CODE (elt) == CONST_DOUBLE
14149           && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14150 }
14151
14152 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14153    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
14154    is nonnull, use it to describe valid immediates.  */
14155 static bool
14156 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14157                                     simd_immediate_info *info,
14158                                     enum simd_immediate_check which,
14159                                     simd_immediate_info::insn_type insn)
14160 {
14161   /* Try a 4-byte immediate with LSL.  */
14162   for (unsigned int shift = 0; shift < 32; shift += 8)
14163     if ((val32 & (0xff << shift)) == val32)
14164       {
14165         if (info)
14166           *info = simd_immediate_info (SImode, val32 >> shift, insn,
14167                                        simd_immediate_info::LSL, shift);
14168         return true;
14169       }
14170
14171   /* Try a 2-byte immediate with LSL.  */
14172   unsigned int imm16 = val32 & 0xffff;
14173   if (imm16 == (val32 >> 16))
14174     for (unsigned int shift = 0; shift < 16; shift += 8)
14175       if ((imm16 & (0xff << shift)) == imm16)
14176         {
14177           if (info)
14178             *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14179                                          simd_immediate_info::LSL, shift);
14180           return true;
14181         }
14182
14183   /* Try a 4-byte immediate with MSL, except for cases that MVN
14184      can handle.  */
14185   if (which == AARCH64_CHECK_MOV)
14186     for (unsigned int shift = 8; shift < 24; shift += 8)
14187       {
14188         unsigned int low = (1 << shift) - 1;
14189         if (((val32 & (0xff << shift)) | low) == val32)
14190           {
14191             if (info)
14192               *info = simd_immediate_info (SImode, val32 >> shift, insn,
14193                                            simd_immediate_info::MSL, shift);
14194             return true;
14195           }
14196       }
14197
14198   return false;
14199 }
14200
14201 /* Return true if replicating VAL64 is a valid immediate for the
14202    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
14203    use it to describe valid immediates.  */
14204 static bool
14205 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14206                                  simd_immediate_info *info,
14207                                  enum simd_immediate_check which)
14208 {
14209   unsigned int val32 = val64 & 0xffffffff;
14210   unsigned int val16 = val64 & 0xffff;
14211   unsigned int val8 = val64 & 0xff;
14212
14213   if (val32 == (val64 >> 32))
14214     {
14215       if ((which & AARCH64_CHECK_ORR) != 0
14216           && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14217                                                  simd_immediate_info::MOV))
14218         return true;
14219
14220       if ((which & AARCH64_CHECK_BIC) != 0
14221           && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14222                                                  simd_immediate_info::MVN))
14223         return true;
14224
14225       /* Try using a replicated byte.  */
14226       if (which == AARCH64_CHECK_MOV
14227           && val16 == (val32 >> 16)
14228           && val8 == (val16 >> 8))
14229         {
14230           if (info)
14231             *info = simd_immediate_info (QImode, val8);
14232           return true;
14233         }
14234     }
14235
14236   /* Try using a bit-to-bytemask.  */
14237   if (which == AARCH64_CHECK_MOV)
14238     {
14239       unsigned int i;
14240       for (i = 0; i < 64; i += 8)
14241         {
14242           unsigned char byte = (val64 >> i) & 0xff;
14243           if (byte != 0 && byte != 0xff)
14244             break;
14245         }
14246       if (i == 64)
14247         {
14248           if (info)
14249             *info = simd_immediate_info (DImode, val64);
14250           return true;
14251         }
14252     }
14253   return false;
14254 }
14255
14256 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14257    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
14258
14259 static bool
14260 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14261                              simd_immediate_info *info)
14262 {
14263   scalar_int_mode mode = DImode;
14264   unsigned int val32 = val64 & 0xffffffff;
14265   if (val32 == (val64 >> 32))
14266     {
14267       mode = SImode;
14268       unsigned int val16 = val32 & 0xffff;
14269       if (val16 == (val32 >> 16))
14270         {
14271           mode = HImode;
14272           unsigned int val8 = val16 & 0xff;
14273           if (val8 == (val16 >> 8))
14274             mode = QImode;
14275         }
14276     }
14277   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14278   if (IN_RANGE (val, -0x80, 0x7f))
14279     {
14280       /* DUP with no shift.  */
14281       if (info)
14282         *info = simd_immediate_info (mode, val);
14283       return true;
14284     }
14285   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14286     {
14287       /* DUP with LSL #8.  */
14288       if (info)
14289         *info = simd_immediate_info (mode, val);
14290       return true;
14291     }
14292   if (aarch64_bitmask_imm (val64, mode))
14293     {
14294       /* DUPM.  */
14295       if (info)
14296         *info = simd_immediate_info (mode, val);
14297       return true;
14298     }
14299   return false;
14300 }
14301
14302 /* Return true if OP is a valid SIMD immediate for the operation
14303    described by WHICH.  If INFO is nonnull, use it to describe valid
14304    immediates.  */
14305 bool
14306 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14307                               enum simd_immediate_check which)
14308 {
14309   machine_mode mode = GET_MODE (op);
14310   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14311   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14312     return false;
14313
14314   scalar_mode elt_mode = GET_MODE_INNER (mode);
14315   rtx base, step;
14316   unsigned int n_elts;
14317   if (GET_CODE (op) == CONST_VECTOR
14318       && CONST_VECTOR_DUPLICATE_P (op))
14319     n_elts = CONST_VECTOR_NPATTERNS (op);
14320   else if ((vec_flags & VEC_SVE_DATA)
14321            && const_vec_series_p (op, &base, &step))
14322     {
14323       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14324       if (!aarch64_sve_index_immediate_p (base)
14325           || !aarch64_sve_index_immediate_p (step))
14326         return false;
14327
14328       if (info)
14329         *info = simd_immediate_info (elt_mode, base, step);
14330       return true;
14331     }
14332   else if (GET_CODE (op) == CONST_VECTOR
14333            && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14334     /* N_ELTS set above.  */;
14335   else
14336     return false;
14337
14338   /* Handle PFALSE and PTRUE.  */
14339   if (vec_flags & VEC_SVE_PRED)
14340     return (op == CONST0_RTX (mode)
14341             || op == CONSTM1_RTX (mode));
14342
14343   scalar_float_mode elt_float_mode;
14344   if (n_elts == 1
14345       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14346     {
14347       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14348       if (aarch64_float_const_zero_rtx_p (elt)
14349           || aarch64_float_const_representable_p (elt))
14350         {
14351           if (info)
14352             *info = simd_immediate_info (elt_float_mode, elt);
14353           return true;
14354         }
14355     }
14356
14357   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14358   if (elt_size > 8)
14359     return false;
14360
14361   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14362
14363   /* Expand the vector constant out into a byte vector, with the least
14364      significant byte of the register first.  */
14365   auto_vec<unsigned char, 16> bytes;
14366   bytes.reserve (n_elts * elt_size);
14367   for (unsigned int i = 0; i < n_elts; i++)
14368     {
14369       /* The vector is provided in gcc endian-neutral fashion.
14370          For aarch64_be Advanced SIMD, it must be laid out in the vector
14371          register in reverse order.  */
14372       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14373       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14374
14375       if (elt_mode != elt_int_mode)
14376         elt = gen_lowpart (elt_int_mode, elt);
14377
14378       if (!CONST_INT_P (elt))
14379         return false;
14380
14381       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14382       for (unsigned int byte = 0; byte < elt_size; byte++)
14383         {
14384           bytes.quick_push (elt_val & 0xff);
14385           elt_val >>= BITS_PER_UNIT;
14386         }
14387     }
14388
14389   /* The immediate must repeat every eight bytes.  */
14390   unsigned int nbytes = bytes.length ();
14391   for (unsigned i = 8; i < nbytes; ++i)
14392     if (bytes[i] != bytes[i - 8])
14393       return false;
14394
14395   /* Get the repeating 8-byte value as an integer.  No endian correction
14396      is needed here because bytes is already in lsb-first order.  */
14397   unsigned HOST_WIDE_INT val64 = 0;
14398   for (unsigned int i = 0; i < 8; i++)
14399     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14400               << (i * BITS_PER_UNIT));
14401
14402   if (vec_flags & VEC_SVE_DATA)
14403     return aarch64_sve_valid_immediate (val64, info);
14404   else
14405     return aarch64_advsimd_valid_immediate (val64, info, which);
14406 }
14407
14408 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14409    has a step in the range of INDEX.  Return the index expression if so,
14410    otherwise return null.  */
14411 rtx
14412 aarch64_check_zero_based_sve_index_immediate (rtx x)
14413 {
14414   rtx base, step;
14415   if (const_vec_series_p (x, &base, &step)
14416       && base == const0_rtx
14417       && aarch64_sve_index_immediate_p (step))
14418     return step;
14419   return NULL_RTX;
14420 }
14421
14422 /* Check of immediate shift constants are within range.  */
14423 bool
14424 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14425 {
14426   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14427   if (left)
14428     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14429   else
14430     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14431 }
14432
14433 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14434    operation of width WIDTH at bit position POS.  */
14435
14436 rtx
14437 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14438 {
14439   gcc_assert (CONST_INT_P (width));
14440   gcc_assert (CONST_INT_P (pos));
14441
14442   unsigned HOST_WIDE_INT mask
14443     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14444   return GEN_INT (mask << UINTVAL (pos));
14445 }
14446
14447 bool
14448 aarch64_mov_operand_p (rtx x, machine_mode mode)
14449 {
14450   if (GET_CODE (x) == HIGH
14451       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14452     return true;
14453
14454   if (CONST_INT_P (x))
14455     return true;
14456
14457   if (VECTOR_MODE_P (GET_MODE (x)))
14458     return aarch64_simd_valid_immediate (x, NULL);
14459
14460   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14461     return true;
14462
14463   if (aarch64_sve_cnt_immediate_p (x))
14464     return true;
14465
14466   return aarch64_classify_symbolic_expression (x)
14467     == SYMBOL_TINY_ABSOLUTE;
14468 }
14469
14470 /* Return a const_int vector of VAL.  */
14471 rtx
14472 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14473 {
14474   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14475   return gen_const_vec_duplicate (mode, c);
14476 }
14477
14478 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
14479
14480 bool
14481 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14482 {
14483   machine_mode vmode;
14484
14485   vmode = aarch64_simd_container_mode (mode, 64);
14486   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14487   return aarch64_simd_valid_immediate (op_v, NULL);
14488 }
14489
14490 /* Construct and return a PARALLEL RTX vector with elements numbering the
14491    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14492    the vector - from the perspective of the architecture.  This does not
14493    line up with GCC's perspective on lane numbers, so we end up with
14494    different masks depending on our target endian-ness.  The diagram
14495    below may help.  We must draw the distinction when building masks
14496    which select one half of the vector.  An instruction selecting
14497    architectural low-lanes for a big-endian target, must be described using
14498    a mask selecting GCC high-lanes.
14499
14500                  Big-Endian             Little-Endian
14501
14502 GCC             0   1   2   3           3   2   1   0
14503               | x | x | x | x |       | x | x | x | x |
14504 Architecture    3   2   1   0           3   2   1   0
14505
14506 Low Mask:         { 2, 3 }                { 0, 1 }
14507 High Mask:        { 0, 1 }                { 2, 3 }
14508
14509    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
14510
14511 rtx
14512 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14513 {
14514   rtvec v = rtvec_alloc (nunits / 2);
14515   int high_base = nunits / 2;
14516   int low_base = 0;
14517   int base;
14518   rtx t1;
14519   int i;
14520
14521   if (BYTES_BIG_ENDIAN)
14522     base = high ? low_base : high_base;
14523   else
14524     base = high ? high_base : low_base;
14525
14526   for (i = 0; i < nunits / 2; i++)
14527     RTVEC_ELT (v, i) = GEN_INT (base + i);
14528
14529   t1 = gen_rtx_PARALLEL (mode, v);
14530   return t1;
14531 }
14532
14533 /* Check OP for validity as a PARALLEL RTX vector with elements
14534    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14535    from the perspective of the architecture.  See the diagram above
14536    aarch64_simd_vect_par_cnst_half for more details.  */
14537
14538 bool
14539 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14540                                        bool high)
14541 {
14542   int nelts;
14543   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14544     return false;
14545
14546   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14547   HOST_WIDE_INT count_op = XVECLEN (op, 0);
14548   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14549   int i = 0;
14550
14551   if (count_op != count_ideal)
14552     return false;
14553
14554   for (i = 0; i < count_ideal; i++)
14555     {
14556       rtx elt_op = XVECEXP (op, 0, i);
14557       rtx elt_ideal = XVECEXP (ideal, 0, i);
14558
14559       if (!CONST_INT_P (elt_op)
14560           || INTVAL (elt_ideal) != INTVAL (elt_op))
14561         return false;
14562     }
14563   return true;
14564 }
14565
14566 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
14567    HIGH (exclusive).  */
14568 void
14569 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14570                           const_tree exp)
14571 {
14572   HOST_WIDE_INT lane;
14573   gcc_assert (CONST_INT_P (operand));
14574   lane = INTVAL (operand);
14575
14576   if (lane < low || lane >= high)
14577   {
14578     if (exp)
14579       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14580     else
14581       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14582   }
14583 }
14584
14585 /* Peform endian correction on lane number N, which indexes a vector
14586    of mode MODE, and return the result as an SImode rtx.  */
14587
14588 rtx
14589 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14590 {
14591   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14592 }
14593
14594 /* Return TRUE if OP is a valid vector addressing mode.  */
14595
14596 bool
14597 aarch64_simd_mem_operand_p (rtx op)
14598 {
14599   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14600                         || REG_P (XEXP (op, 0)));
14601 }
14602
14603 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
14604
14605 bool
14606 aarch64_sve_ld1r_operand_p (rtx op)
14607 {
14608   struct aarch64_address_info addr;
14609   scalar_mode mode;
14610
14611   return (MEM_P (op)
14612           && is_a <scalar_mode> (GET_MODE (op), &mode)
14613           && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14614           && addr.type == ADDRESS_REG_IMM
14615           && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14616 }
14617
14618 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14619    The conditions for STR are the same.  */
14620 bool
14621 aarch64_sve_ldr_operand_p (rtx op)
14622 {
14623   struct aarch64_address_info addr;
14624
14625   return (MEM_P (op)
14626           && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14627                                        false, ADDR_QUERY_ANY)
14628           && addr.type == ADDRESS_REG_IMM);
14629 }
14630
14631 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14632    We need to be able to access the individual pieces, so the range
14633    is different from LD[234] and ST[234].  */
14634 bool
14635 aarch64_sve_struct_memory_operand_p (rtx op)
14636 {
14637   if (!MEM_P (op))
14638     return false;
14639
14640   machine_mode mode = GET_MODE (op);
14641   struct aarch64_address_info addr;
14642   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14643                                  ADDR_QUERY_ANY)
14644       || addr.type != ADDRESS_REG_IMM)
14645     return false;
14646
14647   poly_int64 first = addr.const_offset;
14648   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14649   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14650           && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14651 }
14652
14653 /* Emit a register copy from operand to operand, taking care not to
14654    early-clobber source registers in the process.
14655
14656    COUNT is the number of components into which the copy needs to be
14657    decomposed.  */
14658 void
14659 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14660                                 unsigned int count)
14661 {
14662   unsigned int i;
14663   int rdest = REGNO (operands[0]);
14664   int rsrc = REGNO (operands[1]);
14665
14666   if (!reg_overlap_mentioned_p (operands[0], operands[1])
14667       || rdest < rsrc)
14668     for (i = 0; i < count; i++)
14669       emit_move_insn (gen_rtx_REG (mode, rdest + i),
14670                       gen_rtx_REG (mode, rsrc + i));
14671   else
14672     for (i = 0; i < count; i++)
14673       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14674                       gen_rtx_REG (mode, rsrc + count - i - 1));
14675 }
14676
14677 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14678    one of VSTRUCT modes: OI, CI, or XI.  */
14679 int
14680 aarch64_simd_attr_length_rglist (machine_mode mode)
14681 {
14682   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
14683   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14684 }
14685
14686 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
14687    alignment of a vector to 128 bits.  SVE predicates have an alignment of
14688    16 bits.  */
14689 static HOST_WIDE_INT
14690 aarch64_simd_vector_alignment (const_tree type)
14691 {
14692   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14693     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14694        be set for non-predicate vectors of booleans.  Modes are the most
14695        direct way we have of identifying real SVE predicate types.  */
14696     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14697   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14698   return MIN (align, 128);
14699 }
14700
14701 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
14702 static poly_uint64
14703 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14704 {
14705   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14706     {
14707       /* If the length of the vector is fixed, try to align to that length,
14708          otherwise don't try to align at all.  */
14709       HOST_WIDE_INT result;
14710       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14711         result = TYPE_ALIGN (TREE_TYPE (type));
14712       return result;
14713     }
14714   return TYPE_ALIGN (type);
14715 }
14716
14717 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
14718 static bool
14719 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14720 {
14721   if (is_packed)
14722     return false;
14723
14724   /* For fixed-length vectors, check that the vectorizer will aim for
14725      full-vector alignment.  This isn't true for generic GCC vectors
14726      that are wider than the ABI maximum of 128 bits.  */
14727   poly_uint64 preferred_alignment =
14728     aarch64_vectorize_preferred_vector_alignment (type);
14729   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14730       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14731                    preferred_alignment))
14732     return false;
14733
14734   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
14735   return true;
14736 }
14737
14738 /* Return true if the vector misalignment factor is supported by the
14739    target.  */
14740 static bool
14741 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14742                                              const_tree type, int misalignment,
14743                                              bool is_packed)
14744 {
14745   if (TARGET_SIMD && STRICT_ALIGNMENT)
14746     {
14747       /* Return if movmisalign pattern is not supported for this mode.  */
14748       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14749         return false;
14750
14751       /* Misalignment factor is unknown at compile time.  */
14752       if (misalignment == -1)
14753         return false;
14754     }
14755   return default_builtin_support_vector_misalignment (mode, type, misalignment,
14756                                                       is_packed);
14757 }
14758
14759 /* If VALS is a vector constant that can be loaded into a register
14760    using DUP, generate instructions to do so and return an RTX to
14761    assign to the register.  Otherwise return NULL_RTX.  */
14762 static rtx
14763 aarch64_simd_dup_constant (rtx vals)
14764 {
14765   machine_mode mode = GET_MODE (vals);
14766   machine_mode inner_mode = GET_MODE_INNER (mode);
14767   rtx x;
14768
14769   if (!const_vec_duplicate_p (vals, &x))
14770     return NULL_RTX;
14771
14772   /* We can load this constant by using DUP and a constant in a
14773      single ARM register.  This will be cheaper than a vector
14774      load.  */
14775   x = copy_to_mode_reg (inner_mode, x);
14776   return gen_vec_duplicate (mode, x);
14777 }
14778
14779
14780 /* Generate code to load VALS, which is a PARALLEL containing only
14781    constants (for vec_init) or CONST_VECTOR, efficiently into a
14782    register.  Returns an RTX to copy into the register, or NULL_RTX
14783    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
14784 static rtx
14785 aarch64_simd_make_constant (rtx vals)
14786 {
14787   machine_mode mode = GET_MODE (vals);
14788   rtx const_dup;
14789   rtx const_vec = NULL_RTX;
14790   int n_const = 0;
14791   int i;
14792
14793   if (GET_CODE (vals) == CONST_VECTOR)
14794     const_vec = vals;
14795   else if (GET_CODE (vals) == PARALLEL)
14796     {
14797       /* A CONST_VECTOR must contain only CONST_INTs and
14798          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14799          Only store valid constants in a CONST_VECTOR.  */
14800       int n_elts = XVECLEN (vals, 0);
14801       for (i = 0; i < n_elts; ++i)
14802         {
14803           rtx x = XVECEXP (vals, 0, i);
14804           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14805             n_const++;
14806         }
14807       if (n_const == n_elts)
14808         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14809     }
14810   else
14811     gcc_unreachable ();
14812
14813   if (const_vec != NULL_RTX
14814       && aarch64_simd_valid_immediate (const_vec, NULL))
14815     /* Load using MOVI/MVNI.  */
14816     return const_vec;
14817   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14818     /* Loaded using DUP.  */
14819     return const_dup;
14820   else if (const_vec != NULL_RTX)
14821     /* Load from constant pool. We cannot take advantage of single-cycle
14822        LD1 because we need a PC-relative addressing mode.  */
14823     return const_vec;
14824   else
14825     /* A PARALLEL containing something not valid inside CONST_VECTOR.
14826        We cannot construct an initializer.  */
14827     return NULL_RTX;
14828 }
14829
14830 /* Expand a vector initialisation sequence, such that TARGET is
14831    initialised to contain VALS.  */
14832
14833 void
14834 aarch64_expand_vector_init (rtx target, rtx vals)
14835 {
14836   machine_mode mode = GET_MODE (target);
14837   scalar_mode inner_mode = GET_MODE_INNER (mode);
14838   /* The number of vector elements.  */
14839   int n_elts = XVECLEN (vals, 0);
14840   /* The number of vector elements which are not constant.  */
14841   int n_var = 0;
14842   rtx any_const = NULL_RTX;
14843   /* The first element of vals.  */
14844   rtx v0 = XVECEXP (vals, 0, 0);
14845   bool all_same = true;
14846
14847   /* Count the number of variable elements to initialise.  */
14848   for (int i = 0; i < n_elts; ++i)
14849     {
14850       rtx x = XVECEXP (vals, 0, i);
14851       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
14852         ++n_var;
14853       else
14854         any_const = x;
14855
14856       all_same &= rtx_equal_p (x, v0);
14857     }
14858
14859   /* No variable elements, hand off to aarch64_simd_make_constant which knows
14860      how best to handle this.  */
14861   if (n_var == 0)
14862     {
14863       rtx constant = aarch64_simd_make_constant (vals);
14864       if (constant != NULL_RTX)
14865         {
14866           emit_move_insn (target, constant);
14867           return;
14868         }
14869     }
14870
14871   /* Splat a single non-constant element if we can.  */
14872   if (all_same)
14873     {
14874       rtx x = copy_to_mode_reg (inner_mode, v0);
14875       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14876       return;
14877     }
14878
14879   enum insn_code icode = optab_handler (vec_set_optab, mode);
14880   gcc_assert (icode != CODE_FOR_nothing);
14881
14882   /* If there are only variable elements, try to optimize
14883      the insertion using dup for the most common element
14884      followed by insertions.  */
14885
14886   /* The algorithm will fill matches[*][0] with the earliest matching element,
14887      and matches[X][1] with the count of duplicate elements (if X is the
14888      earliest element which has duplicates).  */
14889
14890   if (n_var == n_elts && n_elts <= 16)
14891     {
14892       int matches[16][2] = {0};
14893       for (int i = 0; i < n_elts; i++)
14894         {
14895           for (int j = 0; j <= i; j++)
14896             {
14897               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
14898                 {
14899                   matches[i][0] = j;
14900                   matches[j][1]++;
14901                   break;
14902                 }
14903             }
14904         }
14905       int maxelement = 0;
14906       int maxv = 0;
14907       for (int i = 0; i < n_elts; i++)
14908         if (matches[i][1] > maxv)
14909           {
14910             maxelement = i;
14911             maxv = matches[i][1];
14912           }
14913
14914       /* Create a duplicate of the most common element, unless all elements
14915          are equally useless to us, in which case just immediately set the
14916          vector register using the first element.  */
14917
14918       if (maxv == 1)
14919         {
14920           /* For vectors of two 64-bit elements, we can do even better.  */
14921           if (n_elts == 2
14922               && (inner_mode == E_DImode
14923                   || inner_mode == E_DFmode))
14924
14925             {
14926               rtx x0 = XVECEXP (vals, 0, 0);
14927               rtx x1 = XVECEXP (vals, 0, 1);
14928               /* Combine can pick up this case, but handling it directly
14929                  here leaves clearer RTL.
14930
14931                  This is load_pair_lanes<mode>, and also gives us a clean-up
14932                  for store_pair_lanes<mode>.  */
14933               if (memory_operand (x0, inner_mode)
14934                   && memory_operand (x1, inner_mode)
14935                   && !STRICT_ALIGNMENT
14936                   && rtx_equal_p (XEXP (x1, 0),
14937                                   plus_constant (Pmode,
14938                                                  XEXP (x0, 0),
14939                                                  GET_MODE_SIZE (inner_mode))))
14940                 {
14941                   rtx t;
14942                   if (inner_mode == DFmode)
14943                     t = gen_load_pair_lanesdf (target, x0, x1);
14944                   else
14945                     t = gen_load_pair_lanesdi (target, x0, x1);
14946                   emit_insn (t);
14947                   return;
14948                 }
14949             }
14950           /* The subreg-move sequence below will move into lane zero of the
14951              vector register.  For big-endian we want that position to hold
14952              the last element of VALS.  */
14953           maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14954           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14955           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14956         }
14957       else
14958         {
14959           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14960           aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14961         }
14962
14963       /* Insert the rest.  */
14964       for (int i = 0; i < n_elts; i++)
14965         {
14966           rtx x = XVECEXP (vals, 0, i);
14967           if (matches[i][0] == maxelement)
14968             continue;
14969           x = copy_to_mode_reg (inner_mode, x);
14970           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14971         }
14972       return;
14973     }
14974
14975   /* Initialise a vector which is part-variable.  We want to first try
14976      to build those lanes which are constant in the most efficient way we
14977      can.  */
14978   if (n_var != n_elts)
14979     {
14980       rtx copy = copy_rtx (vals);
14981
14982       /* Load constant part of vector.  We really don't care what goes into the
14983          parts we will overwrite, but we're more likely to be able to load the
14984          constant efficiently if it has fewer, larger, repeating parts
14985          (see aarch64_simd_valid_immediate).  */
14986       for (int i = 0; i < n_elts; i++)
14987         {
14988           rtx x = XVECEXP (vals, 0, i);
14989           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14990             continue;
14991           rtx subst = any_const;
14992           for (int bit = n_elts / 2; bit > 0; bit /= 2)
14993             {
14994               /* Look in the copied vector, as more elements are const.  */
14995               rtx test = XVECEXP (copy, 0, i ^ bit);
14996               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14997                 {
14998                   subst = test;
14999                   break;
15000                 }
15001             }
15002           XVECEXP (copy, 0, i) = subst;
15003         }
15004       aarch64_expand_vector_init (target, copy);
15005     }
15006
15007   /* Insert the variable lanes directly.  */
15008   for (int i = 0; i < n_elts; i++)
15009     {
15010       rtx x = XVECEXP (vals, 0, i);
15011       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15012         continue;
15013       x = copy_to_mode_reg (inner_mode, x);
15014       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15015     }
15016 }
15017
15018 static unsigned HOST_WIDE_INT
15019 aarch64_shift_truncation_mask (machine_mode mode)
15020 {
15021   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15022     return 0;
15023   return GET_MODE_UNIT_BITSIZE (mode) - 1;
15024 }
15025
15026 /* Select a format to encode pointers in exception handling data.  */
15027 int
15028 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15029 {
15030    int type;
15031    switch (aarch64_cmodel)
15032      {
15033      case AARCH64_CMODEL_TINY:
15034      case AARCH64_CMODEL_TINY_PIC:
15035      case AARCH64_CMODEL_SMALL:
15036      case AARCH64_CMODEL_SMALL_PIC:
15037      case AARCH64_CMODEL_SMALL_SPIC:
15038        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
15039           for everything.  */
15040        type = DW_EH_PE_sdata4;
15041        break;
15042      default:
15043        /* No assumptions here.  8-byte relocs required.  */
15044        type = DW_EH_PE_sdata8;
15045        break;
15046      }
15047    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15048 }
15049
15050 /* The last .arch and .tune assembly strings that we printed.  */
15051 static std::string aarch64_last_printed_arch_string;
15052 static std::string aarch64_last_printed_tune_string;
15053
15054 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
15055    by the function fndecl.  */
15056
15057 void
15058 aarch64_declare_function_name (FILE *stream, const char* name,
15059                                 tree fndecl)
15060 {
15061   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15062
15063   struct cl_target_option *targ_options;
15064   if (target_parts)
15065     targ_options = TREE_TARGET_OPTION (target_parts);
15066   else
15067     targ_options = TREE_TARGET_OPTION (target_option_current_node);
15068   gcc_assert (targ_options);
15069
15070   const struct processor *this_arch
15071     = aarch64_get_arch (targ_options->x_explicit_arch);
15072
15073   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15074   std::string extension
15075     = aarch64_get_extension_string_for_isa_flags (isa_flags,
15076                                                   this_arch->flags);
15077   /* Only update the assembler .arch string if it is distinct from the last
15078      such string we printed.  */
15079   std::string to_print = this_arch->name + extension;
15080   if (to_print != aarch64_last_printed_arch_string)
15081     {
15082       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15083       aarch64_last_printed_arch_string = to_print;
15084     }
15085
15086   /* Print the cpu name we're tuning for in the comments, might be
15087      useful to readers of the generated asm.  Do it only when it changes
15088      from function to function and verbose assembly is requested.  */
15089   const struct processor *this_tune
15090     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15091
15092   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15093     {
15094       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15095                    this_tune->name);
15096       aarch64_last_printed_tune_string = this_tune->name;
15097     }
15098
15099   /* Don't forget the type directive for ELF.  */
15100   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15101   ASM_OUTPUT_LABEL (stream, name);
15102 }
15103
15104 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
15105
15106 static void
15107 aarch64_start_file (void)
15108 {
15109   struct cl_target_option *default_options
15110     = TREE_TARGET_OPTION (target_option_default_node);
15111
15112   const struct processor *default_arch
15113     = aarch64_get_arch (default_options->x_explicit_arch);
15114   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15115   std::string extension
15116     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15117                                                   default_arch->flags);
15118
15119    aarch64_last_printed_arch_string = default_arch->name + extension;
15120    aarch64_last_printed_tune_string = "";
15121    asm_fprintf (asm_out_file, "\t.arch %s\n",
15122                 aarch64_last_printed_arch_string.c_str ());
15123
15124    default_file_start ();
15125 }
15126
15127 /* Emit load exclusive.  */
15128
15129 static void
15130 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15131                              rtx mem, rtx model_rtx)
15132 {
15133   emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15134 }
15135
15136 /* Emit store exclusive.  */
15137
15138 static void
15139 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15140                               rtx rval, rtx mem, rtx model_rtx)
15141 {
15142   emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15143 }
15144
15145 /* Mark the previous jump instruction as unlikely.  */
15146
15147 static void
15148 aarch64_emit_unlikely_jump (rtx insn)
15149 {
15150   rtx_insn *jump = emit_jump_insn (insn);
15151   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15152 }
15153
15154 /* Expand a compare and swap pattern.  */
15155
15156 void
15157 aarch64_expand_compare_and_swap (rtx operands[])
15158 {
15159   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15160   machine_mode mode, r_mode;
15161
15162   bval = operands[0];
15163   rval = operands[1];
15164   mem = operands[2];
15165   oldval = operands[3];
15166   newval = operands[4];
15167   is_weak = operands[5];
15168   mod_s = operands[6];
15169   mod_f = operands[7];
15170   mode = GET_MODE (mem);
15171
15172   /* Normally the succ memory model must be stronger than fail, but in the
15173      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15174      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
15175   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15176       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15177     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15178
15179   r_mode = mode;
15180   if (mode == QImode || mode == HImode)
15181     {
15182       r_mode = SImode;
15183       rval = gen_reg_rtx (r_mode);
15184     }
15185
15186   if (TARGET_LSE)
15187     {
15188       /* The CAS insn requires oldval and rval overlap, but we need to
15189          have a copy of oldval saved across the operation to tell if
15190          the operation is successful.  */
15191       if (reg_overlap_mentioned_p (rval, oldval))
15192         rval = copy_to_mode_reg (r_mode, oldval);
15193       else
15194         emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15195
15196       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15197                                                    newval, mod_s));
15198       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15199     }
15200   else
15201     {
15202       /* The oldval predicate varies by mode.  Test it and force to reg.  */
15203       insn_code code = code_for_aarch64_compare_and_swap (mode);
15204       if (!insn_data[code].operand[2].predicate (oldval, mode))
15205         oldval = force_reg (mode, oldval);
15206
15207       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15208                                  is_weak, mod_s, mod_f));
15209       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15210     }
15211
15212   if (r_mode != mode)
15213     rval = gen_lowpart (mode, rval);
15214   emit_move_insn (operands[1], rval);
15215
15216   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15217   emit_insn (gen_rtx_SET (bval, x));
15218 }
15219
15220 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15221    sequence implementing an atomic operation.  */
15222
15223 static void
15224 aarch64_emit_post_barrier (enum memmodel model)
15225 {
15226   const enum memmodel base_model = memmodel_base (model);
15227
15228   if (is_mm_sync (model)
15229       && (base_model == MEMMODEL_ACQUIRE
15230           || base_model == MEMMODEL_ACQ_REL
15231           || base_model == MEMMODEL_SEQ_CST))
15232     {
15233       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15234     }
15235 }
15236
15237 /* Split a compare and swap pattern.  */
15238
15239 void
15240 aarch64_split_compare_and_swap (rtx operands[])
15241 {
15242   rtx rval, mem, oldval, newval, scratch;
15243   machine_mode mode;
15244   bool is_weak;
15245   rtx_code_label *label1, *label2;
15246   rtx x, cond;
15247   enum memmodel model;
15248   rtx model_rtx;
15249
15250   rval = operands[0];
15251   mem = operands[1];
15252   oldval = operands[2];
15253   newval = operands[3];
15254   is_weak = (operands[4] != const0_rtx);
15255   model_rtx = operands[5];
15256   scratch = operands[7];
15257   mode = GET_MODE (mem);
15258   model = memmodel_from_int (INTVAL (model_rtx));
15259
15260   /* When OLDVAL is zero and we want the strong version we can emit a tighter
15261     loop:
15262     .label1:
15263         LD[A]XR rval, [mem]
15264         CBNZ    rval, .label2
15265         ST[L]XR scratch, newval, [mem]
15266         CBNZ    scratch, .label1
15267     .label2:
15268         CMP     rval, 0.  */
15269   bool strong_zero_p = !is_weak && oldval == const0_rtx;
15270
15271   label1 = NULL;
15272   if (!is_weak)
15273     {
15274       label1 = gen_label_rtx ();
15275       emit_label (label1);
15276     }
15277   label2 = gen_label_rtx ();
15278
15279   /* The initial load can be relaxed for a __sync operation since a final
15280      barrier will be emitted to stop code hoisting.  */
15281   if (is_mm_sync (model))
15282     aarch64_emit_load_exclusive (mode, rval, mem,
15283                                  GEN_INT (MEMMODEL_RELAXED));
15284   else
15285     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15286
15287   if (strong_zero_p)
15288     {
15289       if (aarch64_track_speculation)
15290         {
15291           /* Emit an explicit compare instruction, so that we can correctly
15292              track the condition codes.  */
15293           rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15294           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15295         }
15296       else
15297         x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15298
15299       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15300                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15301       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15302     }
15303   else
15304     {
15305       cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15306       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15307       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15308                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15309       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15310     }
15311
15312   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15313
15314   if (!is_weak)
15315     {
15316       if (aarch64_track_speculation)
15317         {
15318           /* Emit an explicit compare instruction, so that we can correctly
15319              track the condition codes.  */
15320           rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15321           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15322         }
15323       else
15324         x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15325
15326       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15327                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15328       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15329     }
15330   else
15331     {
15332       cond = gen_rtx_REG (CCmode, CC_REGNUM);
15333       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15334       emit_insn (gen_rtx_SET (cond, x));
15335     }
15336
15337   emit_label (label2);
15338   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15339      to set the condition flags.  If this is not used it will be removed by
15340      later passes.  */
15341   if (strong_zero_p)
15342     {
15343       cond = gen_rtx_REG (CCmode, CC_REGNUM);
15344       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15345       emit_insn (gen_rtx_SET (cond, x));
15346     }
15347   /* Emit any final barrier needed for a __sync operation.  */
15348   if (is_mm_sync (model))
15349     aarch64_emit_post_barrier (model);
15350 }
15351
15352 /* Split an atomic operation.  */
15353
15354 void
15355 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15356                          rtx value, rtx model_rtx, rtx cond)
15357 {
15358   machine_mode mode = GET_MODE (mem);
15359   machine_mode wmode = (mode == DImode ? DImode : SImode);
15360   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15361   const bool is_sync = is_mm_sync (model);
15362   rtx_code_label *label;
15363   rtx x;
15364
15365   /* Split the atomic operation into a sequence.  */
15366   label = gen_label_rtx ();
15367   emit_label (label);
15368
15369   if (new_out)
15370     new_out = gen_lowpart (wmode, new_out);
15371   if (old_out)
15372     old_out = gen_lowpart (wmode, old_out);
15373   else
15374     old_out = new_out;
15375   value = simplify_gen_subreg (wmode, value, mode, 0);
15376
15377   /* The initial load can be relaxed for a __sync operation since a final
15378      barrier will be emitted to stop code hoisting.  */
15379  if (is_sync)
15380     aarch64_emit_load_exclusive (mode, old_out, mem,
15381                                  GEN_INT (MEMMODEL_RELAXED));
15382   else
15383     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15384
15385   switch (code)
15386     {
15387     case SET:
15388       new_out = value;
15389       break;
15390
15391     case NOT:
15392       x = gen_rtx_AND (wmode, old_out, value);
15393       emit_insn (gen_rtx_SET (new_out, x));
15394       x = gen_rtx_NOT (wmode, new_out);
15395       emit_insn (gen_rtx_SET (new_out, x));
15396       break;
15397
15398     case MINUS:
15399       if (CONST_INT_P (value))
15400         {
15401           value = GEN_INT (-INTVAL (value));
15402           code = PLUS;
15403         }
15404       /* Fall through.  */
15405
15406     default:
15407       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15408       emit_insn (gen_rtx_SET (new_out, x));
15409       break;
15410     }
15411
15412   aarch64_emit_store_exclusive (mode, cond, mem,
15413                                 gen_lowpart (mode, new_out), model_rtx);
15414
15415   if (aarch64_track_speculation)
15416     {
15417       /* Emit an explicit compare instruction, so that we can correctly
15418          track the condition codes.  */
15419       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15420       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15421     }
15422   else
15423     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15424
15425   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15426                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15427   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15428
15429   /* Emit any final barrier needed for a __sync operation.  */
15430   if (is_sync)
15431     aarch64_emit_post_barrier (model);
15432 }
15433
15434 static void
15435 aarch64_init_libfuncs (void)
15436 {
15437    /* Half-precision float operations.  The compiler handles all operations
15438      with NULL libfuncs by converting to SFmode.  */
15439
15440   /* Conversions.  */
15441   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15442   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15443
15444   /* Arithmetic.  */
15445   set_optab_libfunc (add_optab, HFmode, NULL);
15446   set_optab_libfunc (sdiv_optab, HFmode, NULL);
15447   set_optab_libfunc (smul_optab, HFmode, NULL);
15448   set_optab_libfunc (neg_optab, HFmode, NULL);
15449   set_optab_libfunc (sub_optab, HFmode, NULL);
15450
15451   /* Comparisons.  */
15452   set_optab_libfunc (eq_optab, HFmode, NULL);
15453   set_optab_libfunc (ne_optab, HFmode, NULL);
15454   set_optab_libfunc (lt_optab, HFmode, NULL);
15455   set_optab_libfunc (le_optab, HFmode, NULL);
15456   set_optab_libfunc (ge_optab, HFmode, NULL);
15457   set_optab_libfunc (gt_optab, HFmode, NULL);
15458   set_optab_libfunc (unord_optab, HFmode, NULL);
15459 }
15460
15461 /* Target hook for c_mode_for_suffix.  */
15462 static machine_mode
15463 aarch64_c_mode_for_suffix (char suffix)
15464 {
15465   if (suffix == 'q')
15466     return TFmode;
15467
15468   return VOIDmode;
15469 }
15470
15471 /* We can only represent floating point constants which will fit in
15472    "quarter-precision" values.  These values are characterised by
15473    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
15474    by:
15475
15476    (-1)^s * (n/16) * 2^r
15477
15478    Where:
15479      's' is the sign bit.
15480      'n' is an integer in the range 16 <= n <= 31.
15481      'r' is an integer in the range -3 <= r <= 4.  */
15482
15483 /* Return true iff X can be represented by a quarter-precision
15484    floating point immediate operand X.  Note, we cannot represent 0.0.  */
15485 bool
15486 aarch64_float_const_representable_p (rtx x)
15487 {
15488   /* This represents our current view of how many bits
15489      make up the mantissa.  */
15490   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15491   int exponent;
15492   unsigned HOST_WIDE_INT mantissa, mask;
15493   REAL_VALUE_TYPE r, m;
15494   bool fail;
15495
15496   if (!CONST_DOUBLE_P (x))
15497     return false;
15498
15499   if (GET_MODE (x) == VOIDmode
15500       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15501     return false;
15502
15503   r = *CONST_DOUBLE_REAL_VALUE (x);
15504
15505   /* We cannot represent infinities, NaNs or +/-zero.  We won't
15506      know if we have +zero until we analyse the mantissa, but we
15507      can reject the other invalid values.  */
15508   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15509       || REAL_VALUE_MINUS_ZERO (r))
15510     return false;
15511
15512   /* Extract exponent.  */
15513   r = real_value_abs (&r);
15514   exponent = REAL_EXP (&r);
15515
15516   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15517      highest (sign) bit, with a fixed binary point at bit point_pos.
15518      m1 holds the low part of the mantissa, m2 the high part.
15519      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15520      bits for the mantissa, this can fail (low bits will be lost).  */
15521   real_ldexp (&m, &r, point_pos - exponent);
15522   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15523
15524   /* If the low part of the mantissa has bits set we cannot represent
15525      the value.  */
15526   if (w.ulow () != 0)
15527     return false;
15528   /* We have rejected the lower HOST_WIDE_INT, so update our
15529      understanding of how many bits lie in the mantissa and
15530      look only at the high HOST_WIDE_INT.  */
15531   mantissa = w.elt (1);
15532   point_pos -= HOST_BITS_PER_WIDE_INT;
15533
15534   /* We can only represent values with a mantissa of the form 1.xxxx.  */
15535   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15536   if ((mantissa & mask) != 0)
15537     return false;
15538
15539   /* Having filtered unrepresentable values, we may now remove all
15540      but the highest 5 bits.  */
15541   mantissa >>= point_pos - 5;
15542
15543   /* We cannot represent the value 0.0, so reject it.  This is handled
15544      elsewhere.  */
15545   if (mantissa == 0)
15546     return false;
15547
15548   /* Then, as bit 4 is always set, we can mask it off, leaving
15549      the mantissa in the range [0, 15].  */
15550   mantissa &= ~(1 << 4);
15551   gcc_assert (mantissa <= 15);
15552
15553   /* GCC internally does not use IEEE754-like encoding (where normalized
15554      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
15555      Our mantissa values are shifted 4 places to the left relative to
15556      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15557      by 5 places to correct for GCC's representation.  */
15558   exponent = 5 - exponent;
15559
15560   return (exponent >= 0 && exponent <= 7);
15561 }
15562
15563 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15564    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
15565    output MOVI/MVNI, ORR or BIC immediate.  */
15566 char*
15567 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15568                                    enum simd_immediate_check which)
15569 {
15570   bool is_valid;
15571   static char templ[40];
15572   const char *mnemonic;
15573   const char *shift_op;
15574   unsigned int lane_count = 0;
15575   char element_char;
15576
15577   struct simd_immediate_info info;
15578
15579   /* This will return true to show const_vector is legal for use as either
15580      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15581      It will also update INFO to show how the immediate should be generated.
15582      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
15583   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15584   gcc_assert (is_valid);
15585
15586   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15587   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15588
15589   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15590     {
15591       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15592       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15593          move immediate path.  */
15594       if (aarch64_float_const_zero_rtx_p (info.value))
15595         info.value = GEN_INT (0);
15596       else
15597         {
15598           const unsigned int buf_size = 20;
15599           char float_buf[buf_size] = {'\0'};
15600           real_to_decimal_for_mode (float_buf,
15601                                     CONST_DOUBLE_REAL_VALUE (info.value),
15602                                     buf_size, buf_size, 1, info.elt_mode);
15603
15604           if (lane_count == 1)
15605             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15606           else
15607             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15608                       lane_count, element_char, float_buf);
15609           return templ;
15610         }
15611     }
15612
15613   gcc_assert (CONST_INT_P (info.value));
15614
15615   if (which == AARCH64_CHECK_MOV)
15616     {
15617       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15618       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15619       if (lane_count == 1)
15620         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15621                   mnemonic, UINTVAL (info.value));
15622       else if (info.shift)
15623         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15624                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15625                   element_char, UINTVAL (info.value), shift_op, info.shift);
15626       else
15627         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15628                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15629                   element_char, UINTVAL (info.value));
15630     }
15631   else
15632     {
15633       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
15634       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15635       if (info.shift)
15636         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15637                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15638                   element_char, UINTVAL (info.value), "lsl", info.shift);
15639       else
15640         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15641                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15642                   element_char, UINTVAL (info.value));
15643     }
15644   return templ;
15645 }
15646
15647 char*
15648 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15649 {
15650
15651   /* If a floating point number was passed and we desire to use it in an
15652      integer mode do the conversion to integer.  */
15653   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15654     {
15655       unsigned HOST_WIDE_INT ival;
15656       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15657           gcc_unreachable ();
15658       immediate = gen_int_mode (ival, mode);
15659     }
15660
15661   machine_mode vmode;
15662   /* use a 64 bit mode for everything except for DI/DF mode, where we use
15663      a 128 bit vector mode.  */
15664   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15665
15666   vmode = aarch64_simd_container_mode (mode, width);
15667   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15668   return aarch64_output_simd_mov_immediate (v_op, width);
15669 }
15670
15671 /* Return the output string to use for moving immediate CONST_VECTOR
15672    into an SVE register.  */
15673
15674 char *
15675 aarch64_output_sve_mov_immediate (rtx const_vector)
15676 {
15677   static char templ[40];
15678   struct simd_immediate_info info;
15679   char element_char;
15680
15681   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15682   gcc_assert (is_valid);
15683
15684   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15685
15686   if (info.step)
15687     {
15688       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15689                 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15690                 element_char, INTVAL (info.value), INTVAL (info.step));
15691       return templ;
15692     }
15693
15694   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15695     {
15696       if (aarch64_float_const_zero_rtx_p (info.value))
15697         info.value = GEN_INT (0);
15698       else
15699         {
15700           const int buf_size = 20;
15701           char float_buf[buf_size] = {};
15702           real_to_decimal_for_mode (float_buf,
15703                                     CONST_DOUBLE_REAL_VALUE (info.value),
15704                                     buf_size, buf_size, 1, info.elt_mode);
15705
15706           snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15707                     element_char, float_buf);
15708           return templ;
15709         }
15710     }
15711
15712   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15713             element_char, INTVAL (info.value));
15714   return templ;
15715 }
15716
15717 /* Return the asm format for a PTRUE instruction whose destination has
15718    mode MODE.  SUFFIX is the element size suffix.  */
15719
15720 char *
15721 aarch64_output_ptrue (machine_mode mode, char suffix)
15722 {
15723   unsigned int nunits;
15724   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15725   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15726     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15727   else
15728     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15729   return buf;
15730 }
15731
15732 /* Split operands into moves from op[1] + op[2] into op[0].  */
15733
15734 void
15735 aarch64_split_combinev16qi (rtx operands[3])
15736 {
15737   unsigned int dest = REGNO (operands[0]);
15738   unsigned int src1 = REGNO (operands[1]);
15739   unsigned int src2 = REGNO (operands[2]);
15740   machine_mode halfmode = GET_MODE (operands[1]);
15741   unsigned int halfregs = REG_NREGS (operands[1]);
15742   rtx destlo, desthi;
15743
15744   gcc_assert (halfmode == V16QImode);
15745
15746   if (src1 == dest && src2 == dest + halfregs)
15747     {
15748       /* No-op move.  Can't split to nothing; emit something.  */
15749       emit_note (NOTE_INSN_DELETED);
15750       return;
15751     }
15752
15753   /* Preserve register attributes for variable tracking.  */
15754   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15755   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15756                                GET_MODE_SIZE (halfmode));
15757
15758   /* Special case of reversed high/low parts.  */
15759   if (reg_overlap_mentioned_p (operands[2], destlo)
15760       && reg_overlap_mentioned_p (operands[1], desthi))
15761     {
15762       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15763       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15764       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15765     }
15766   else if (!reg_overlap_mentioned_p (operands[2], destlo))
15767     {
15768       /* Try to avoid unnecessary moves if part of the result
15769          is in the right place already.  */
15770       if (src1 != dest)
15771         emit_move_insn (destlo, operands[1]);
15772       if (src2 != dest + halfregs)
15773         emit_move_insn (desthi, operands[2]);
15774     }
15775   else
15776     {
15777       if (src2 != dest + halfregs)
15778         emit_move_insn (desthi, operands[2]);
15779       if (src1 != dest)
15780         emit_move_insn (destlo, operands[1]);
15781     }
15782 }
15783
15784 /* vec_perm support.  */
15785
15786 struct expand_vec_perm_d
15787 {
15788   rtx target, op0, op1;
15789   vec_perm_indices perm;
15790   machine_mode vmode;
15791   unsigned int vec_flags;
15792   bool one_vector_p;
15793   bool testing_p;
15794 };
15795
15796 /* Generate a variable permutation.  */
15797
15798 static void
15799 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15800 {
15801   machine_mode vmode = GET_MODE (target);
15802   bool one_vector_p = rtx_equal_p (op0, op1);
15803
15804   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15805   gcc_checking_assert (GET_MODE (op0) == vmode);
15806   gcc_checking_assert (GET_MODE (op1) == vmode);
15807   gcc_checking_assert (GET_MODE (sel) == vmode);
15808   gcc_checking_assert (TARGET_SIMD);
15809
15810   if (one_vector_p)
15811     {
15812       if (vmode == V8QImode)
15813         {
15814           /* Expand the argument to a V16QI mode by duplicating it.  */
15815           rtx pair = gen_reg_rtx (V16QImode);
15816           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15817           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15818         }
15819       else
15820         {
15821           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15822         }
15823     }
15824   else
15825     {
15826       rtx pair;
15827
15828       if (vmode == V8QImode)
15829         {
15830           pair = gen_reg_rtx (V16QImode);
15831           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15832           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15833         }
15834       else
15835         {
15836           pair = gen_reg_rtx (OImode);
15837           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15838           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15839         }
15840     }
15841 }
15842
15843 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15844    NELT is the number of elements in the vector.  */
15845
15846 void
15847 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15848                          unsigned int nelt)
15849 {
15850   machine_mode vmode = GET_MODE (target);
15851   bool one_vector_p = rtx_equal_p (op0, op1);
15852   rtx mask;
15853
15854   /* The TBL instruction does not use a modulo index, so we must take care
15855      of that ourselves.  */
15856   mask = aarch64_simd_gen_const_vector_dup (vmode,
15857       one_vector_p ? nelt - 1 : 2 * nelt - 1);
15858   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15859
15860   /* For big-endian, we also need to reverse the index within the vector
15861      (but not which vector).  */
15862   if (BYTES_BIG_ENDIAN)
15863     {
15864       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
15865       if (!one_vector_p)
15866         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15867       sel = expand_simple_binop (vmode, XOR, sel, mask,
15868                                  NULL, 0, OPTAB_LIB_WIDEN);
15869     }
15870   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15871 }
15872
15873 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
15874
15875 static void
15876 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15877 {
15878   emit_insn (gen_rtx_SET (target,
15879                           gen_rtx_UNSPEC (GET_MODE (target),
15880                                           gen_rtvec (2, op0, op1), code)));
15881 }
15882
15883 /* Expand an SVE vec_perm with the given operands.  */
15884
15885 void
15886 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15887 {
15888   machine_mode data_mode = GET_MODE (target);
15889   machine_mode sel_mode = GET_MODE (sel);
15890   /* Enforced by the pattern condition.  */
15891   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15892
15893   /* Note: vec_perm indices are supposed to wrap when they go beyond the
15894      size of the two value vectors, i.e. the upper bits of the indices
15895      are effectively ignored.  SVE TBL instead produces 0 for any
15896      out-of-range indices, so we need to modulo all the vec_perm indices
15897      to ensure they are all in range.  */
15898   rtx sel_reg = force_reg (sel_mode, sel);
15899
15900   /* Check if the sel only references the first values vector.  */
15901   if (GET_CODE (sel) == CONST_VECTOR
15902       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15903     {
15904       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15905       return;
15906     }
15907
15908   /* Check if the two values vectors are the same.  */
15909   if (rtx_equal_p (op0, op1))
15910     {
15911       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15912       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15913                                          NULL, 0, OPTAB_DIRECT);
15914       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15915       return;
15916     }
15917
15918   /* Run TBL on for each value vector and combine the results.  */
15919
15920   rtx res0 = gen_reg_rtx (data_mode);
15921   rtx res1 = gen_reg_rtx (data_mode);
15922   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15923   if (GET_CODE (sel) != CONST_VECTOR
15924       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15925     {
15926       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15927                                                        2 * nunits - 1);
15928       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15929                                      NULL, 0, OPTAB_DIRECT);
15930     }
15931   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15932   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15933                                      NULL, 0, OPTAB_DIRECT);
15934   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15935   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15936     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15937   else
15938     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15939 }
15940
15941 /* Recognize patterns suitable for the TRN instructions.  */
15942 static bool
15943 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15944 {
15945   HOST_WIDE_INT odd;
15946   poly_uint64 nelt = d->perm.length ();
15947   rtx out, in0, in1, x;
15948   machine_mode vmode = d->vmode;
15949
15950   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15951     return false;
15952
15953   /* Note that these are little-endian tests.
15954      We correct for big-endian later.  */
15955   if (!d->perm[0].is_constant (&odd)
15956       || (odd != 0 && odd != 1)
15957       || !d->perm.series_p (0, 2, odd, 2)
15958       || !d->perm.series_p (1, 2, nelt + odd, 2))
15959     return false;
15960
15961   /* Success!  */
15962   if (d->testing_p)
15963     return true;
15964
15965   in0 = d->op0;
15966   in1 = d->op1;
15967   /* We don't need a big-endian lane correction for SVE; see the comment
15968      at the head of aarch64-sve.md for details.  */
15969   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15970     {
15971       x = in0, in0 = in1, in1 = x;
15972       odd = !odd;
15973     }
15974   out = d->target;
15975
15976   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15977                                       odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15978   return true;
15979 }
15980
15981 /* Recognize patterns suitable for the UZP instructions.  */
15982 static bool
15983 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15984 {
15985   HOST_WIDE_INT odd;
15986   rtx out, in0, in1, x;
15987   machine_mode vmode = d->vmode;
15988
15989   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15990     return false;
15991
15992   /* Note that these are little-endian tests.
15993      We correct for big-endian later.  */
15994   if (!d->perm[0].is_constant (&odd)
15995       || (odd != 0 && odd != 1)
15996       || !d->perm.series_p (0, 1, odd, 2))
15997     return false;
15998
15999   /* Success!  */
16000   if (d->testing_p)
16001     return true;
16002
16003   in0 = d->op0;
16004   in1 = d->op1;
16005   /* We don't need a big-endian lane correction for SVE; see the comment
16006      at the head of aarch64-sve.md for details.  */
16007   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16008     {
16009       x = in0, in0 = in1, in1 = x;
16010       odd = !odd;
16011     }
16012   out = d->target;
16013
16014   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16015                                       odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16016   return true;
16017 }
16018
16019 /* Recognize patterns suitable for the ZIP instructions.  */
16020 static bool
16021 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16022 {
16023   unsigned int high;
16024   poly_uint64 nelt = d->perm.length ();
16025   rtx out, in0, in1, x;
16026   machine_mode vmode = d->vmode;
16027
16028   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16029     return false;
16030
16031   /* Note that these are little-endian tests.
16032      We correct for big-endian later.  */
16033   poly_uint64 first = d->perm[0];
16034   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16035       || !d->perm.series_p (0, 2, first, 1)
16036       || !d->perm.series_p (1, 2, first + nelt, 1))
16037     return false;
16038   high = maybe_ne (first, 0U);
16039
16040   /* Success!  */
16041   if (d->testing_p)
16042     return true;
16043
16044   in0 = d->op0;
16045   in1 = d->op1;
16046   /* We don't need a big-endian lane correction for SVE; see the comment
16047      at the head of aarch64-sve.md for details.  */
16048   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16049     {
16050       x = in0, in0 = in1, in1 = x;
16051       high = !high;
16052     }
16053   out = d->target;
16054
16055   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16056                                       high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16057   return true;
16058 }
16059
16060 /* Recognize patterns for the EXT insn.  */
16061
16062 static bool
16063 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16064 {
16065   HOST_WIDE_INT location;
16066   rtx offset;
16067
16068   /* The first element always refers to the first vector.
16069      Check if the extracted indices are increasing by one.  */
16070   if (d->vec_flags == VEC_SVE_PRED
16071       || !d->perm[0].is_constant (&location)
16072       || !d->perm.series_p (0, 1, location, 1))
16073     return false;
16074
16075   /* Success! */
16076   if (d->testing_p)
16077     return true;
16078
16079   /* The case where (location == 0) is a no-op for both big- and little-endian,
16080      and is removed by the mid-end at optimization levels -O1 and higher.
16081
16082      We don't need a big-endian lane correction for SVE; see the comment
16083      at the head of aarch64-sve.md for details.  */
16084   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16085     {
16086       /* After setup, we want the high elements of the first vector (stored
16087          at the LSB end of the register), and the low elements of the second
16088          vector (stored at the MSB end of the register). So swap.  */
16089       std::swap (d->op0, d->op1);
16090       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16091          to_constant () is safe since this is restricted to Advanced SIMD
16092          vectors.  */
16093       location = d->perm.length ().to_constant () - location;
16094     }
16095
16096   offset = GEN_INT (location);
16097   emit_set_insn (d->target,
16098                  gen_rtx_UNSPEC (d->vmode,
16099                                  gen_rtvec (3, d->op0, d->op1, offset),
16100                                  UNSPEC_EXT));
16101   return true;
16102 }
16103
16104 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16105    within each 64-bit, 32-bit or 16-bit granule.  */
16106
16107 static bool
16108 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16109 {
16110   HOST_WIDE_INT diff;
16111   unsigned int i, size, unspec;
16112   machine_mode pred_mode;
16113
16114   if (d->vec_flags == VEC_SVE_PRED
16115       || !d->one_vector_p
16116       || !d->perm[0].is_constant (&diff))
16117     return false;
16118
16119   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16120   if (size == 8)
16121     {
16122       unspec = UNSPEC_REV64;
16123       pred_mode = VNx2BImode;
16124     }
16125   else if (size == 4)
16126     {
16127       unspec = UNSPEC_REV32;
16128       pred_mode = VNx4BImode;
16129     }
16130   else if (size == 2)
16131     {
16132       unspec = UNSPEC_REV16;
16133       pred_mode = VNx8BImode;
16134     }
16135   else
16136     return false;
16137
16138   unsigned int step = diff + 1;
16139   for (i = 0; i < step; ++i)
16140     if (!d->perm.series_p (i, step, diff - i, step))
16141       return false;
16142
16143   /* Success! */
16144   if (d->testing_p)
16145     return true;
16146
16147   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16148   if (d->vec_flags == VEC_SVE_DATA)
16149     {
16150       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16151       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16152                             UNSPEC_MERGE_PTRUE);
16153     }
16154   emit_set_insn (d->target, src);
16155   return true;
16156 }
16157
16158 /* Recognize patterns for the REV insn, which reverses elements within
16159    a full vector.  */
16160
16161 static bool
16162 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16163 {
16164   poly_uint64 nelt = d->perm.length ();
16165
16166   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16167     return false;
16168
16169   if (!d->perm.series_p (0, 1, nelt - 1, -1))
16170     return false;
16171
16172   /* Success! */
16173   if (d->testing_p)
16174     return true;
16175
16176   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16177   emit_set_insn (d->target, src);
16178   return true;
16179 }
16180
16181 static bool
16182 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16183 {
16184   rtx out = d->target;
16185   rtx in0;
16186   HOST_WIDE_INT elt;
16187   machine_mode vmode = d->vmode;
16188   rtx lane;
16189
16190   if (d->vec_flags == VEC_SVE_PRED
16191       || d->perm.encoding ().encoded_nelts () != 1
16192       || !d->perm[0].is_constant (&elt))
16193     return false;
16194
16195   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16196     return false;
16197
16198   /* Success! */
16199   if (d->testing_p)
16200     return true;
16201
16202   /* The generic preparation in aarch64_expand_vec_perm_const_1
16203      swaps the operand order and the permute indices if it finds
16204      d->perm[0] to be in the second operand.  Thus, we can always
16205      use d->op0 and need not do any extra arithmetic to get the
16206      correct lane number.  */
16207   in0 = d->op0;
16208   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
16209
16210   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16211   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16212   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16213   return true;
16214 }
16215
16216 static bool
16217 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16218 {
16219   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16220   machine_mode vmode = d->vmode;
16221
16222   /* Make sure that the indices are constant.  */
16223   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16224   for (unsigned int i = 0; i < encoded_nelts; ++i)
16225     if (!d->perm[i].is_constant ())
16226       return false;
16227
16228   if (d->testing_p)
16229     return true;
16230
16231   /* Generic code will try constant permutation twice.  Once with the
16232      original mode and again with the elements lowered to QImode.
16233      So wait and don't do the selector expansion ourselves.  */
16234   if (vmode != V8QImode && vmode != V16QImode)
16235     return false;
16236
16237   /* to_constant is safe since this routine is specific to Advanced SIMD
16238      vectors.  */
16239   unsigned int nelt = d->perm.length ().to_constant ();
16240   for (unsigned int i = 0; i < nelt; ++i)
16241     /* If big-endian and two vectors we end up with a weird mixed-endian
16242        mode on NEON.  Reverse the index within each word but not the word
16243        itself.  to_constant is safe because we checked is_constant above.  */
16244     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16245                         ? d->perm[i].to_constant () ^ (nelt - 1)
16246                         : d->perm[i].to_constant ());
16247
16248   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16249   sel = force_reg (vmode, sel);
16250
16251   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16252   return true;
16253 }
16254
16255 /* Try to implement D using an SVE TBL instruction.  */
16256
16257 static bool
16258 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16259 {
16260   unsigned HOST_WIDE_INT nelt;
16261
16262   /* Permuting two variable-length vectors could overflow the
16263      index range.  */
16264   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16265     return false;
16266
16267   if (d->testing_p)
16268     return true;
16269
16270   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16271   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16272   if (d->one_vector_p)
16273     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16274   else
16275     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16276   return true;
16277 }
16278
16279 static bool
16280 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16281 {
16282   /* The pattern matching functions above are written to look for a small
16283      number to begin the sequence (0, 1, N/2).  If we begin with an index
16284      from the second operand, we can swap the operands.  */
16285   poly_int64 nelt = d->perm.length ();
16286   if (known_ge (d->perm[0], nelt))
16287     {
16288       d->perm.rotate_inputs (1);
16289       std::swap (d->op0, d->op1);
16290     }
16291
16292   if ((d->vec_flags == VEC_ADVSIMD
16293        || d->vec_flags == VEC_SVE_DATA
16294        || d->vec_flags == VEC_SVE_PRED)
16295       && known_gt (nelt, 1))
16296     {
16297       if (aarch64_evpc_rev_local (d))
16298         return true;
16299       else if (aarch64_evpc_rev_global (d))
16300         return true;
16301       else if (aarch64_evpc_ext (d))
16302         return true;
16303       else if (aarch64_evpc_dup (d))
16304         return true;
16305       else if (aarch64_evpc_zip (d))
16306         return true;
16307       else if (aarch64_evpc_uzp (d))
16308         return true;
16309       else if (aarch64_evpc_trn (d))
16310         return true;
16311       if (d->vec_flags == VEC_SVE_DATA)
16312         return aarch64_evpc_sve_tbl (d);
16313       else if (d->vec_flags == VEC_ADVSIMD)
16314         return aarch64_evpc_tbl (d);
16315     }
16316   return false;
16317 }
16318
16319 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
16320
16321 static bool
16322 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16323                                   rtx op1, const vec_perm_indices &sel)
16324 {
16325   struct expand_vec_perm_d d;
16326
16327   /* Check whether the mask can be applied to a single vector.  */
16328   if (sel.ninputs () == 1
16329       || (op0 && rtx_equal_p (op0, op1)))
16330     d.one_vector_p = true;
16331   else if (sel.all_from_input_p (0))
16332     {
16333       d.one_vector_p = true;
16334       op1 = op0;
16335     }
16336   else if (sel.all_from_input_p (1))
16337     {
16338       d.one_vector_p = true;
16339       op0 = op1;
16340     }
16341   else
16342     d.one_vector_p = false;
16343
16344   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16345                      sel.nelts_per_input ());
16346   d.vmode = vmode;
16347   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16348   d.target = target;
16349   d.op0 = op0;
16350   d.op1 = op1;
16351   d.testing_p = !target;
16352
16353   if (!d.testing_p)
16354     return aarch64_expand_vec_perm_const_1 (&d);
16355
16356   rtx_insn *last = get_last_insn ();
16357   bool ret = aarch64_expand_vec_perm_const_1 (&d);
16358   gcc_assert (last == get_last_insn ());
16359
16360   return ret;
16361 }
16362
16363 /* Generate a byte permute mask for a register of mode MODE,
16364    which has NUNITS units.  */
16365
16366 rtx
16367 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16368 {
16369   /* We have to reverse each vector because we dont have
16370      a permuted load that can reverse-load according to ABI rules.  */
16371   rtx mask;
16372   rtvec v = rtvec_alloc (16);
16373   unsigned int i, j;
16374   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16375
16376   gcc_assert (BYTES_BIG_ENDIAN);
16377   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16378
16379   for (i = 0; i < nunits; i++)
16380     for (j = 0; j < usize; j++)
16381       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16382   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16383   return force_reg (V16QImode, mask);
16384 }
16385
16386 /* Return true if X is a valid second operand for the SVE instruction
16387    that implements integer comparison OP_CODE.  */
16388
16389 static bool
16390 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16391 {
16392   if (register_operand (x, VOIDmode))
16393     return true;
16394
16395   switch (op_code)
16396     {
16397     case LTU:
16398     case LEU:
16399     case GEU:
16400     case GTU:
16401       return aarch64_sve_cmp_immediate_p (x, false);
16402     case LT:
16403     case LE:
16404     case GE:
16405     case GT:
16406     case NE:
16407     case EQ:
16408       return aarch64_sve_cmp_immediate_p (x, true);
16409     default:
16410       gcc_unreachable ();
16411     }
16412 }
16413
16414 /* Use predicated SVE instructions to implement the equivalent of:
16415
16416      (set TARGET OP)
16417
16418    given that PTRUE is an all-true predicate of the appropriate mode.  */
16419
16420 static void
16421 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16422 {
16423   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16424                                gen_rtvec (2, ptrue, op),
16425                                UNSPEC_MERGE_PTRUE);
16426   rtx_insn *insn = emit_set_insn (target, unspec);
16427   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16428 }
16429
16430 /* Likewise, but also clobber the condition codes.  */
16431
16432 static void
16433 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16434 {
16435   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16436                                gen_rtvec (2, ptrue, op),
16437                                UNSPEC_MERGE_PTRUE);
16438   rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16439   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16440 }
16441
16442 /* Return the UNSPEC_COND_* code for comparison CODE.  */
16443
16444 static unsigned int
16445 aarch64_unspec_cond_code (rtx_code code)
16446 {
16447   switch (code)
16448     {
16449     case NE:
16450       return UNSPEC_COND_NE;
16451     case EQ:
16452       return UNSPEC_COND_EQ;
16453     case LT:
16454       return UNSPEC_COND_LT;
16455     case GT:
16456       return UNSPEC_COND_GT;
16457     case LE:
16458       return UNSPEC_COND_LE;
16459     case GE:
16460       return UNSPEC_COND_GE;
16461     default:
16462       gcc_unreachable ();
16463     }
16464 }
16465
16466 /* Emit:
16467
16468       (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16469
16470    where <X> is the operation associated with comparison CODE.  This form
16471    of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16472    semantics, such as when PRED might not be all-true and when comparing
16473    inactive lanes could have side effects.  */
16474
16475 static void
16476 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16477                                   rtx pred, rtx op0, rtx op1)
16478 {
16479   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16480                                gen_rtvec (3, pred, op0, op1),
16481                                aarch64_unspec_cond_code (code));
16482   emit_set_insn (target, unspec);
16483 }
16484
16485 /* Expand an SVE integer comparison using the SVE equivalent of:
16486
16487      (set TARGET (CODE OP0 OP1)).  */
16488
16489 void
16490 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16491 {
16492   machine_mode pred_mode = GET_MODE (target);
16493   machine_mode data_mode = GET_MODE (op0);
16494
16495   if (!aarch64_sve_cmp_operand_p (code, op1))
16496     op1 = force_reg (data_mode, op1);
16497
16498   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16499   rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16500   aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16501 }
16502
16503 /* Emit the SVE equivalent of:
16504
16505       (set TMP1 (CODE1 OP0 OP1))
16506       (set TMP2 (CODE2 OP0 OP1))
16507       (set TARGET (ior:PRED_MODE TMP1 TMP2))
16508
16509    PTRUE is an all-true predicate with the same mode as TARGET.  */
16510
16511 static void
16512 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16513                            rtx ptrue, rtx op0, rtx op1)
16514 {
16515   machine_mode pred_mode = GET_MODE (ptrue);
16516   rtx tmp1 = gen_reg_rtx (pred_mode);
16517   aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16518                              gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16519   rtx tmp2 = gen_reg_rtx (pred_mode);
16520   aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16521                              gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16522   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16523 }
16524
16525 /* Emit the SVE equivalent of:
16526
16527       (set TMP (CODE OP0 OP1))
16528       (set TARGET (not TMP))
16529
16530    PTRUE is an all-true predicate with the same mode as TARGET.  */
16531
16532 static void
16533 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16534                                 rtx op0, rtx op1)
16535 {
16536   machine_mode pred_mode = GET_MODE (ptrue);
16537   rtx tmp = gen_reg_rtx (pred_mode);
16538   aarch64_emit_sve_ptrue_op (tmp, ptrue,
16539                              gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16540   aarch64_emit_unop (target, one_cmpl_optab, tmp);
16541 }
16542
16543 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16544
16545      (set TARGET (CODE OP0 OP1))
16546
16547    If CAN_INVERT_P is true, the caller can also handle inverted results;
16548    return true if the result is in fact inverted.  */
16549
16550 bool
16551 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16552                                   rtx op0, rtx op1, bool can_invert_p)
16553 {
16554   machine_mode pred_mode = GET_MODE (target);
16555   machine_mode data_mode = GET_MODE (op0);
16556
16557   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16558   switch (code)
16559     {
16560     case UNORDERED:
16561       /* UNORDERED has no immediate form.  */
16562       op1 = force_reg (data_mode, op1);
16563       /* fall through */
16564     case LT:
16565     case LE:
16566     case GT:
16567     case GE:
16568     case EQ:
16569     case NE:
16570       {
16571         /* There is native support for the comparison.  */
16572         rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16573         aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16574         return false;
16575       }
16576
16577     case LTGT:
16578       /* This is a trapping operation (LT or GT).  */
16579       aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16580       return false;
16581
16582     case UNEQ:
16583       if (!flag_trapping_math)
16584         {
16585           /* This would trap for signaling NaNs.  */
16586           op1 = force_reg (data_mode, op1);
16587           aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16588           return false;
16589         }
16590       /* fall through */
16591     case UNLT:
16592     case UNLE:
16593     case UNGT:
16594     case UNGE:
16595       if (flag_trapping_math)
16596         {
16597           /* Work out which elements are ordered.  */
16598           rtx ordered = gen_reg_rtx (pred_mode);
16599           op1 = force_reg (data_mode, op1);
16600           aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16601
16602           /* Test the opposite condition for the ordered elements,
16603              then invert the result.  */
16604           if (code == UNEQ)
16605             code = NE;
16606           else
16607             code = reverse_condition_maybe_unordered (code);
16608           if (can_invert_p)
16609             {
16610               aarch64_emit_sve_predicated_cond (target, code,
16611                                                 ordered, op0, op1);
16612               return true;
16613             }
16614           rtx tmp = gen_reg_rtx (pred_mode);
16615           aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16616           aarch64_emit_unop (target, one_cmpl_optab, tmp);
16617           return false;
16618         }
16619       break;
16620
16621     case ORDERED:
16622       /* ORDERED has no immediate form.  */
16623       op1 = force_reg (data_mode, op1);
16624       break;
16625
16626     default:
16627       gcc_unreachable ();
16628     }
16629
16630   /* There is native support for the inverse comparison.  */
16631   code = reverse_condition_maybe_unordered (code);
16632   if (can_invert_p)
16633     {
16634       rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16635       aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16636       return true;
16637     }
16638   aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16639   return false;
16640 }
16641
16642 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
16643    of the data being selected and CMP_MODE is the mode of the values being
16644    compared.  */
16645
16646 void
16647 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16648                           rtx *ops)
16649 {
16650   machine_mode pred_mode
16651     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16652                              GET_MODE_SIZE (cmp_mode)).require ();
16653   rtx pred = gen_reg_rtx (pred_mode);
16654   if (FLOAT_MODE_P (cmp_mode))
16655     {
16656       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16657                                             ops[4], ops[5], true))
16658         std::swap (ops[1], ops[2]);
16659     }
16660   else
16661     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16662
16663   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16664   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16665 }
16666
16667 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
16668    true.  However due to issues with register allocation it is preferable
16669    to avoid tieing integer scalar and FP scalar modes.  Executing integer
16670    operations in general registers is better than treating them as scalar
16671    vector operations.  This reduces latency and avoids redundant int<->FP
16672    moves.  So tie modes if they are either the same class, or vector modes
16673    with other vector modes, vector structs or any scalar mode.  */
16674
16675 static bool
16676 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16677 {
16678   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16679     return true;
16680
16681   /* We specifically want to allow elements of "structure" modes to
16682      be tieable to the structure.  This more general condition allows
16683      other rarer situations too.  The reason we don't extend this to
16684      predicate modes is that there are no predicate structure modes
16685      nor any specific instructions for extracting part of a predicate
16686      register.  */
16687   if (aarch64_vector_data_mode_p (mode1)
16688       && aarch64_vector_data_mode_p (mode2))
16689     return true;
16690
16691   /* Also allow any scalar modes with vectors.  */
16692   if (aarch64_vector_mode_supported_p (mode1)
16693       || aarch64_vector_mode_supported_p (mode2))
16694     return true;
16695
16696   return false;
16697 }
16698
16699 /* Return a new RTX holding the result of moving POINTER forward by
16700    AMOUNT bytes.  */
16701
16702 static rtx
16703 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16704 {
16705   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16706
16707   return adjust_automodify_address (pointer, GET_MODE (pointer),
16708                                     next, amount);
16709 }
16710
16711 /* Return a new RTX holding the result of moving POINTER forward by the
16712    size of the mode it points to.  */
16713
16714 static rtx
16715 aarch64_progress_pointer (rtx pointer)
16716 {
16717   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16718 }
16719
16720 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16721    MODE bytes.  */
16722
16723 static void
16724 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16725                                               machine_mode mode)
16726 {
16727   rtx reg = gen_reg_rtx (mode);
16728
16729   /* "Cast" the pointers to the correct mode.  */
16730   *src = adjust_address (*src, mode, 0);
16731   *dst = adjust_address (*dst, mode, 0);
16732   /* Emit the memcpy.  */
16733   emit_move_insn (reg, *src);
16734   emit_move_insn (*dst, reg);
16735   /* Move the pointers forward.  */
16736   *src = aarch64_progress_pointer (*src);
16737   *dst = aarch64_progress_pointer (*dst);
16738 }
16739
16740 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
16741    we succeed, otherwise return false.  */
16742
16743 bool
16744 aarch64_expand_movmem (rtx *operands)
16745 {
16746   int n, mode_bits;
16747   rtx dst = operands[0];
16748   rtx src = operands[1];
16749   rtx base;
16750   machine_mode cur_mode = BLKmode, next_mode;
16751   bool speed_p = !optimize_function_for_size_p (cfun);
16752
16753   /* When optimizing for size, give a better estimate of the length of a
16754      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
16755      will always require an even number of instructions to do now.  And each
16756      operation requires both a load+store, so devide the max number by 2.  */
16757   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16758
16759   /* We can't do anything smart if the amount to copy is not constant.  */
16760   if (!CONST_INT_P (operands[2]))
16761     return false;
16762
16763   n = INTVAL (operands[2]);
16764
16765   /* Try to keep the number of instructions low.  For all cases we will do at
16766      most two moves for the residual amount, since we'll always overlap the
16767      remainder.  */
16768   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16769     return false;
16770
16771   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16772   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16773
16774   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16775   src = adjust_automodify_address (src, VOIDmode, base, 0);
16776
16777   /* Convert n to bits to make the rest of the code simpler.  */
16778   n = n * BITS_PER_UNIT;
16779
16780   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
16781      larger than TImode, but we should not use them for loads/stores here.  */
16782   const int copy_limit = GET_MODE_BITSIZE (TImode);
16783
16784   while (n > 0)
16785     {
16786       /* Find the largest mode in which to do the copy in without over reading
16787          or writing.  */
16788       opt_scalar_int_mode mode_iter;
16789       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16790         if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16791           cur_mode = mode_iter.require ();
16792
16793       gcc_assert (cur_mode != BLKmode);
16794
16795       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16796       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16797
16798       n -= mode_bits;
16799
16800       /* Do certain trailing copies as overlapping if it's going to be
16801          cheaper.  i.e. less instructions to do so.  For instance doing a 15
16802          byte copy it's more efficient to do two overlapping 8 byte copies than
16803          8 + 6 + 1.  */
16804       if (n > 0 && n <= 8 * BITS_PER_UNIT)
16805         {
16806           next_mode = smallest_mode_for_size (n, MODE_INT);
16807           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16808           src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16809           dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16810           n = n_bits;
16811         }
16812     }
16813
16814   return true;
16815 }
16816
16817 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16818    SImode stores.  Handle the case when the constant has identical
16819    bottom and top halves.  This is beneficial when the two stores can be
16820    merged into an STP and we avoid synthesising potentially expensive
16821    immediates twice.  Return true if such a split is possible.  */
16822
16823 bool
16824 aarch64_split_dimode_const_store (rtx dst, rtx src)
16825 {
16826   rtx lo = gen_lowpart (SImode, src);
16827   rtx hi = gen_highpart_mode (SImode, DImode, src);
16828
16829   bool size_p = optimize_function_for_size_p (cfun);
16830
16831   if (!rtx_equal_p (lo, hi))
16832     return false;
16833
16834   unsigned int orig_cost
16835     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16836   unsigned int lo_cost
16837     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16838
16839   /* We want to transform:
16840      MOV        x1, 49370
16841      MOVK       x1, 0x140, lsl 16
16842      MOVK       x1, 0xc0da, lsl 32
16843      MOVK       x1, 0x140, lsl 48
16844      STR        x1, [x0]
16845    into:
16846      MOV        w1, 49370
16847      MOVK       w1, 0x140, lsl 16
16848      STP        w1, w1, [x0]
16849    So we want to perform this only when we save two instructions
16850    or more.  When optimizing for size, however, accept any code size
16851    savings we can.  */
16852   if (size_p && orig_cost <= lo_cost)
16853     return false;
16854
16855   if (!size_p
16856       && (orig_cost <= lo_cost + 1))
16857     return false;
16858
16859   rtx mem_lo = adjust_address (dst, SImode, 0);
16860   if (!aarch64_mem_pair_operand (mem_lo, SImode))
16861     return false;
16862
16863   rtx tmp_reg = gen_reg_rtx (SImode);
16864   aarch64_expand_mov_immediate (tmp_reg, lo);
16865   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16866   /* Don't emit an explicit store pair as this may not be always profitable.
16867      Let the sched-fusion logic decide whether to merge them.  */
16868   emit_move_insn (mem_lo, tmp_reg);
16869   emit_move_insn (mem_hi, tmp_reg);
16870
16871   return true;
16872 }
16873
16874 /* Generate RTL for a conditional branch with rtx comparison CODE in
16875    mode CC_MODE.  The destination of the unlikely conditional branch
16876    is LABEL_REF.  */
16877
16878 void
16879 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16880                               rtx label_ref)
16881 {
16882   rtx x;
16883   x = gen_rtx_fmt_ee (code, VOIDmode,
16884                       gen_rtx_REG (cc_mode, CC_REGNUM),
16885                       const0_rtx);
16886
16887   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16888                             gen_rtx_LABEL_REF (VOIDmode, label_ref),
16889                             pc_rtx);
16890   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16891 }
16892
16893 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16894
16895    OP1 represents the TImode destination operand 1
16896    OP2 represents the TImode destination operand 2
16897    LOW_DEST represents the low half (DImode) of TImode operand 0
16898    LOW_IN1 represents the low half (DImode) of TImode operand 1
16899    LOW_IN2 represents the low half (DImode) of TImode operand 2
16900    HIGH_DEST represents the high half (DImode) of TImode operand 0
16901    HIGH_IN1 represents the high half (DImode) of TImode operand 1
16902    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
16903
16904 void
16905 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16906                             rtx *low_in1, rtx *low_in2,
16907                             rtx *high_dest, rtx *high_in1,
16908                             rtx *high_in2)
16909 {
16910   *low_dest = gen_reg_rtx (DImode);
16911   *low_in1 = gen_lowpart (DImode, op1);
16912   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16913                                   subreg_lowpart_offset (DImode, TImode));
16914   *high_dest = gen_reg_rtx (DImode);
16915   *high_in1 = gen_highpart (DImode, op1);
16916   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16917                                    subreg_highpart_offset (DImode, TImode));
16918 }
16919
16920 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16921
16922    This function differs from 'arch64_addti_scratch_regs' in that
16923    OP1 can be an immediate constant (zero). We must call
16924    subreg_highpart_offset with DImode and TImode arguments, otherwise
16925    VOIDmode will be used for the const_int which generates an internal
16926    error from subreg_size_highpart_offset which does not expect a size of zero.
16927
16928    OP1 represents the TImode destination operand 1
16929    OP2 represents the TImode destination operand 2
16930    LOW_DEST represents the low half (DImode) of TImode operand 0
16931    LOW_IN1 represents the low half (DImode) of TImode operand 1
16932    LOW_IN2 represents the low half (DImode) of TImode operand 2
16933    HIGH_DEST represents the high half (DImode) of TImode operand 0
16934    HIGH_IN1 represents the high half (DImode) of TImode operand 1
16935    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
16936
16937
16938 void
16939 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16940                              rtx *low_in1, rtx *low_in2,
16941                              rtx *high_dest, rtx *high_in1,
16942                              rtx *high_in2)
16943 {
16944   *low_dest = gen_reg_rtx (DImode);
16945   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16946                                   subreg_lowpart_offset (DImode, TImode));
16947
16948   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16949                                   subreg_lowpart_offset (DImode, TImode));
16950   *high_dest = gen_reg_rtx (DImode);
16951
16952   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16953                                    subreg_highpart_offset (DImode, TImode));
16954   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16955                                    subreg_highpart_offset (DImode, TImode));
16956 }
16957
16958 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16959
16960    OP0 represents the TImode destination operand 0
16961    LOW_DEST represents the low half (DImode) of TImode operand 0
16962    LOW_IN1 represents the low half (DImode) of TImode operand 1
16963    LOW_IN2 represents the low half (DImode) of TImode operand 2
16964    HIGH_DEST represents the high half (DImode) of TImode operand 0
16965    HIGH_IN1 represents the high half (DImode) of TImode operand 1
16966    HIGH_IN2 represents the high half (DImode) of TImode operand 2
16967    UNSIGNED_P is true if the operation is being performed on unsigned
16968    values.  */
16969 void
16970 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16971                        rtx low_in2, rtx high_dest, rtx high_in1,
16972                        rtx high_in2, bool unsigned_p)
16973 {
16974   if (low_in2 == const0_rtx)
16975     {
16976       low_dest = low_in1;
16977       high_in2 = force_reg (DImode, high_in2);
16978       if (unsigned_p)
16979         emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
16980       else
16981         emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
16982     }
16983   else
16984     {
16985       if (CONST_INT_P (low_in2))
16986         {
16987           high_in2 = force_reg (DImode, high_in2);
16988           emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
16989                                               GEN_INT (-INTVAL (low_in2))));
16990         }
16991       else
16992         emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16993
16994       if (unsigned_p)
16995         emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
16996       else
16997         emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
16998     }
16999
17000   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17001   emit_move_insn (gen_highpart (DImode, op0), high_dest);
17002
17003 }
17004
17005 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
17006
17007 static unsigned HOST_WIDE_INT
17008 aarch64_asan_shadow_offset (void)
17009 {
17010   return (HOST_WIDE_INT_1 << 36);
17011 }
17012
17013 static rtx
17014 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17015                         int code, tree treeop0, tree treeop1)
17016 {
17017   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17018   rtx op0, op1;
17019   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17020   insn_code icode;
17021   struct expand_operand ops[4];
17022
17023   start_sequence ();
17024   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17025
17026   op_mode = GET_MODE (op0);
17027   if (op_mode == VOIDmode)
17028     op_mode = GET_MODE (op1);
17029
17030   switch (op_mode)
17031     {
17032     case E_QImode:
17033     case E_HImode:
17034     case E_SImode:
17035       cmp_mode = SImode;
17036       icode = CODE_FOR_cmpsi;
17037       break;
17038
17039     case E_DImode:
17040       cmp_mode = DImode;
17041       icode = CODE_FOR_cmpdi;
17042       break;
17043
17044     case E_SFmode:
17045       cmp_mode = SFmode;
17046       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17047       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17048       break;
17049
17050     case E_DFmode:
17051       cmp_mode = DFmode;
17052       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17053       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17054       break;
17055
17056     default:
17057       end_sequence ();
17058       return NULL_RTX;
17059     }
17060
17061   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17062   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17063   if (!op0 || !op1)
17064     {
17065       end_sequence ();
17066       return NULL_RTX;
17067     }
17068   *prep_seq = get_insns ();
17069   end_sequence ();
17070
17071   create_fixed_operand (&ops[0], op0);
17072   create_fixed_operand (&ops[1], op1);
17073
17074   start_sequence ();
17075   if (!maybe_expand_insn (icode, 2, ops))
17076     {
17077       end_sequence ();
17078       return NULL_RTX;
17079     }
17080   *gen_seq = get_insns ();
17081   end_sequence ();
17082
17083   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17084                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17085 }
17086
17087 static rtx
17088 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17089                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
17090 {
17091   rtx op0, op1, target;
17092   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17093   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17094   insn_code icode;
17095   struct expand_operand ops[6];
17096   int aarch64_cond;
17097
17098   push_to_sequence (*prep_seq);
17099   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17100
17101   op_mode = GET_MODE (op0);
17102   if (op_mode == VOIDmode)
17103     op_mode = GET_MODE (op1);
17104
17105   switch (op_mode)
17106     {
17107     case E_QImode:
17108     case E_HImode:
17109     case E_SImode:
17110       cmp_mode = SImode;
17111       icode = CODE_FOR_ccmpsi;
17112       break;
17113
17114     case E_DImode:
17115       cmp_mode = DImode;
17116       icode = CODE_FOR_ccmpdi;
17117       break;
17118
17119     case E_SFmode:
17120       cmp_mode = SFmode;
17121       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17122       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17123       break;
17124
17125     case E_DFmode:
17126       cmp_mode = DFmode;
17127       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17128       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17129       break;
17130
17131     default:
17132       end_sequence ();
17133       return NULL_RTX;
17134     }
17135
17136   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17137   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17138   if (!op0 || !op1)
17139     {
17140       end_sequence ();
17141       return NULL_RTX;
17142     }
17143   *prep_seq = get_insns ();
17144   end_sequence ();
17145
17146   target = gen_rtx_REG (cc_mode, CC_REGNUM);
17147   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17148
17149   if (bit_code != AND)
17150     {
17151       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17152                                                 GET_MODE (XEXP (prev, 0))),
17153                              VOIDmode, XEXP (prev, 0), const0_rtx);
17154       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17155     }
17156
17157   create_fixed_operand (&ops[0], XEXP (prev, 0));
17158   create_fixed_operand (&ops[1], target);
17159   create_fixed_operand (&ops[2], op0);
17160   create_fixed_operand (&ops[3], op1);
17161   create_fixed_operand (&ops[4], prev);
17162   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17163
17164   push_to_sequence (*gen_seq);
17165   if (!maybe_expand_insn (icode, 6, ops))
17166     {
17167       end_sequence ();
17168       return NULL_RTX;
17169     }
17170
17171   *gen_seq = get_insns ();
17172   end_sequence ();
17173
17174   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17175 }
17176
17177 #undef TARGET_GEN_CCMP_FIRST
17178 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17179
17180 #undef TARGET_GEN_CCMP_NEXT
17181 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17182
17183 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
17184    instruction fusion of some sort.  */
17185
17186 static bool
17187 aarch64_macro_fusion_p (void)
17188 {
17189   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17190 }
17191
17192
17193 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
17194    should be kept together during scheduling.  */
17195
17196 static bool
17197 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17198 {
17199   rtx set_dest;
17200   rtx prev_set = single_set (prev);
17201   rtx curr_set = single_set (curr);
17202   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
17203   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17204
17205   if (!aarch64_macro_fusion_p ())
17206     return false;
17207
17208   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17209     {
17210       /* We are trying to match:
17211          prev (mov)  == (set (reg r0) (const_int imm16))
17212          curr (movk) == (set (zero_extract (reg r0)
17213                                            (const_int 16)
17214                                            (const_int 16))
17215                              (const_int imm16_1))  */
17216
17217       set_dest = SET_DEST (curr_set);
17218
17219       if (GET_CODE (set_dest) == ZERO_EXTRACT
17220           && CONST_INT_P (SET_SRC (curr_set))
17221           && CONST_INT_P (SET_SRC (prev_set))
17222           && CONST_INT_P (XEXP (set_dest, 2))
17223           && INTVAL (XEXP (set_dest, 2)) == 16
17224           && REG_P (XEXP (set_dest, 0))
17225           && REG_P (SET_DEST (prev_set))
17226           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17227         {
17228           return true;
17229         }
17230     }
17231
17232   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17233     {
17234
17235       /*  We're trying to match:
17236           prev (adrp) == (set (reg r1)
17237                               (high (symbol_ref ("SYM"))))
17238           curr (add) == (set (reg r0)
17239                              (lo_sum (reg r1)
17240                                      (symbol_ref ("SYM"))))
17241           Note that r0 need not necessarily be the same as r1, especially
17242           during pre-regalloc scheduling.  */
17243
17244       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17245           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17246         {
17247           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17248               && REG_P (XEXP (SET_SRC (curr_set), 0))
17249               && REGNO (XEXP (SET_SRC (curr_set), 0))
17250                  == REGNO (SET_DEST (prev_set))
17251               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17252                               XEXP (SET_SRC (curr_set), 1)))
17253             return true;
17254         }
17255     }
17256
17257   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17258     {
17259
17260       /* We're trying to match:
17261          prev (movk) == (set (zero_extract (reg r0)
17262                                            (const_int 16)
17263                                            (const_int 32))
17264                              (const_int imm16_1))
17265          curr (movk) == (set (zero_extract (reg r0)
17266                                            (const_int 16)
17267                                            (const_int 48))
17268                              (const_int imm16_2))  */
17269
17270       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17271           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17272           && REG_P (XEXP (SET_DEST (prev_set), 0))
17273           && REG_P (XEXP (SET_DEST (curr_set), 0))
17274           && REGNO (XEXP (SET_DEST (prev_set), 0))
17275              == REGNO (XEXP (SET_DEST (curr_set), 0))
17276           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17277           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17278           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17279           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17280           && CONST_INT_P (SET_SRC (prev_set))
17281           && CONST_INT_P (SET_SRC (curr_set)))
17282         return true;
17283
17284     }
17285   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17286     {
17287       /* We're trying to match:
17288           prev (adrp) == (set (reg r0)
17289                               (high (symbol_ref ("SYM"))))
17290           curr (ldr) == (set (reg r1)
17291                              (mem (lo_sum (reg r0)
17292                                              (symbol_ref ("SYM")))))
17293                  or
17294           curr (ldr) == (set (reg r1)
17295                              (zero_extend (mem
17296                                            (lo_sum (reg r0)
17297                                                    (symbol_ref ("SYM"))))))  */
17298       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17299           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17300         {
17301           rtx curr_src = SET_SRC (curr_set);
17302
17303           if (GET_CODE (curr_src) == ZERO_EXTEND)
17304             curr_src = XEXP (curr_src, 0);
17305
17306           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17307               && REG_P (XEXP (XEXP (curr_src, 0), 0))
17308               && REGNO (XEXP (XEXP (curr_src, 0), 0))
17309                  == REGNO (SET_DEST (prev_set))
17310               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17311                               XEXP (SET_SRC (prev_set), 0)))
17312               return true;
17313         }
17314     }
17315
17316   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17317        && aarch_crypto_can_dual_issue (prev, curr))
17318     return true;
17319
17320   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17321       && any_condjump_p (curr))
17322     {
17323       unsigned int condreg1, condreg2;
17324       rtx cc_reg_1;
17325       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17326       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17327
17328       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17329           && prev
17330           && modified_in_p (cc_reg_1, prev))
17331         {
17332           enum attr_type prev_type = get_attr_type (prev);
17333
17334           /* FIXME: this misses some which is considered simple arthematic
17335              instructions for ThunderX.  Simple shifts are missed here.  */
17336           if (prev_type == TYPE_ALUS_SREG
17337               || prev_type == TYPE_ALUS_IMM
17338               || prev_type == TYPE_LOGICS_REG
17339               || prev_type == TYPE_LOGICS_IMM)
17340             return true;
17341         }
17342     }
17343
17344   if (prev_set
17345       && curr_set
17346       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17347       && any_condjump_p (curr))
17348     {
17349       /* We're trying to match:
17350           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17351           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
17352                                                          (const_int 0))
17353                                                  (label_ref ("SYM"))
17354                                                  (pc))  */
17355       if (SET_DEST (curr_set) == (pc_rtx)
17356           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17357           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17358           && REG_P (SET_DEST (prev_set))
17359           && REGNO (SET_DEST (prev_set))
17360              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17361         {
17362           /* Fuse ALU operations followed by conditional branch instruction.  */
17363           switch (get_attr_type (prev))
17364             {
17365             case TYPE_ALU_IMM:
17366             case TYPE_ALU_SREG:
17367             case TYPE_ADC_REG:
17368             case TYPE_ADC_IMM:
17369             case TYPE_ADCS_REG:
17370             case TYPE_ADCS_IMM:
17371             case TYPE_LOGIC_REG:
17372             case TYPE_LOGIC_IMM:
17373             case TYPE_CSEL:
17374             case TYPE_ADR:
17375             case TYPE_MOV_IMM:
17376             case TYPE_SHIFT_REG:
17377             case TYPE_SHIFT_IMM:
17378             case TYPE_BFM:
17379             case TYPE_RBIT:
17380             case TYPE_REV:
17381             case TYPE_EXTEND:
17382               return true;
17383
17384             default:;
17385             }
17386         }
17387     }
17388
17389   return false;
17390 }
17391
17392 /* Return true iff the instruction fusion described by OP is enabled.  */
17393
17394 bool
17395 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17396 {
17397   return (aarch64_tune_params.fusible_ops & op) != 0;
17398 }
17399
17400 /* If MEM is in the form of [base+offset], extract the two parts
17401    of address and set to BASE and OFFSET, otherwise return false
17402    after clearing BASE and OFFSET.  */
17403
17404 bool
17405 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17406 {
17407   rtx addr;
17408
17409   gcc_assert (MEM_P (mem));
17410
17411   addr = XEXP (mem, 0);
17412
17413   if (REG_P (addr))
17414     {
17415       *base = addr;
17416       *offset = const0_rtx;
17417       return true;
17418     }
17419
17420   if (GET_CODE (addr) == PLUS
17421       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17422     {
17423       *base = XEXP (addr, 0);
17424       *offset = XEXP (addr, 1);
17425       return true;
17426     }
17427
17428   *base = NULL_RTX;
17429   *offset = NULL_RTX;
17430
17431   return false;
17432 }
17433
17434 /* Types for scheduling fusion.  */
17435 enum sched_fusion_type
17436 {
17437   SCHED_FUSION_NONE = 0,
17438   SCHED_FUSION_LD_SIGN_EXTEND,
17439   SCHED_FUSION_LD_ZERO_EXTEND,
17440   SCHED_FUSION_LD,
17441   SCHED_FUSION_ST,
17442   SCHED_FUSION_NUM
17443 };
17444
17445 /* If INSN is a load or store of address in the form of [base+offset],
17446    extract the two parts and set to BASE and OFFSET.  Return scheduling
17447    fusion type this INSN is.  */
17448
17449 static enum sched_fusion_type
17450 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17451 {
17452   rtx x, dest, src;
17453   enum sched_fusion_type fusion = SCHED_FUSION_LD;
17454
17455   gcc_assert (INSN_P (insn));
17456   x = PATTERN (insn);
17457   if (GET_CODE (x) != SET)
17458     return SCHED_FUSION_NONE;
17459
17460   src = SET_SRC (x);
17461   dest = SET_DEST (x);
17462
17463   machine_mode dest_mode = GET_MODE (dest);
17464
17465   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17466     return SCHED_FUSION_NONE;
17467
17468   if (GET_CODE (src) == SIGN_EXTEND)
17469     {
17470       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17471       src = XEXP (src, 0);
17472       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17473         return SCHED_FUSION_NONE;
17474     }
17475   else if (GET_CODE (src) == ZERO_EXTEND)
17476     {
17477       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17478       src = XEXP (src, 0);
17479       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17480         return SCHED_FUSION_NONE;
17481     }
17482
17483   if (GET_CODE (src) == MEM && REG_P (dest))
17484     extract_base_offset_in_addr (src, base, offset);
17485   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17486     {
17487       fusion = SCHED_FUSION_ST;
17488       extract_base_offset_in_addr (dest, base, offset);
17489     }
17490   else
17491     return SCHED_FUSION_NONE;
17492
17493   if (*base == NULL_RTX || *offset == NULL_RTX)
17494     fusion = SCHED_FUSION_NONE;
17495
17496   return fusion;
17497 }
17498
17499 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17500
17501    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17502    and PRI are only calculated for these instructions.  For other instruction,
17503    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
17504    type instruction fusion can be added by returning different priorities.
17505
17506    It's important that irrelevant instructions get the largest FUSION_PRI.  */
17507
17508 static void
17509 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17510                                int *fusion_pri, int *pri)
17511 {
17512   int tmp, off_val;
17513   rtx base, offset;
17514   enum sched_fusion_type fusion;
17515
17516   gcc_assert (INSN_P (insn));
17517
17518   tmp = max_pri - 1;
17519   fusion = fusion_load_store (insn, &base, &offset);
17520   if (fusion == SCHED_FUSION_NONE)
17521     {
17522       *pri = tmp;
17523       *fusion_pri = tmp;
17524       return;
17525     }
17526
17527   /* Set FUSION_PRI according to fusion type and base register.  */
17528   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17529
17530   /* Calculate PRI.  */
17531   tmp /= 2;
17532
17533   /* INSN with smaller offset goes first.  */
17534   off_val = (int)(INTVAL (offset));
17535   if (off_val >= 0)
17536     tmp -= (off_val & 0xfffff);
17537   else
17538     tmp += ((- off_val) & 0xfffff);
17539
17540   *pri = tmp;
17541   return;
17542 }
17543
17544 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17545    Adjust priority of sha1h instructions so they are scheduled before
17546    other SHA1 instructions.  */
17547
17548 static int
17549 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17550 {
17551   rtx x = PATTERN (insn);
17552
17553   if (GET_CODE (x) == SET)
17554     {
17555       x = SET_SRC (x);
17556
17557       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17558         return priority + 10;
17559     }
17560
17561   return priority;
17562 }
17563
17564 /* Given OPERANDS of consecutive load/store, check if we can merge
17565    them into ldp/stp.  LOAD is true if they are load instructions.
17566    MODE is the mode of memory operands.  */
17567
17568 bool
17569 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17570                                 machine_mode mode)
17571 {
17572   HOST_WIDE_INT offval_1, offval_2, msize;
17573   enum reg_class rclass_1, rclass_2;
17574   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17575
17576   if (load)
17577     {
17578       mem_1 = operands[1];
17579       mem_2 = operands[3];
17580       reg_1 = operands[0];
17581       reg_2 = operands[2];
17582       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17583       if (REGNO (reg_1) == REGNO (reg_2))
17584         return false;
17585     }
17586   else
17587     {
17588       mem_1 = operands[0];
17589       mem_2 = operands[2];
17590       reg_1 = operands[1];
17591       reg_2 = operands[3];
17592     }
17593
17594   /* The mems cannot be volatile.  */
17595   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17596     return false;
17597
17598   /* If we have SImode and slow unaligned ldp,
17599      check the alignment to be at least 8 byte. */
17600   if (mode == SImode
17601       && (aarch64_tune_params.extra_tuning_flags
17602           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17603       && !optimize_size
17604       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17605     return false;
17606
17607   /* Check if the addresses are in the form of [base+offset].  */
17608   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17609   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17610     return false;
17611   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17612   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17613     return false;
17614
17615   /* Check if the bases are same.  */
17616   if (!rtx_equal_p (base_1, base_2))
17617     return false;
17618
17619   /* The operands must be of the same size.  */
17620   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17621                          GET_MODE_SIZE (GET_MODE (mem_2))));
17622
17623   offval_1 = INTVAL (offset_1);
17624   offval_2 = INTVAL (offset_2);
17625   /* We should only be trying this for fixed-sized modes.  There is no
17626      SVE LDP/STP instruction.  */
17627   msize = GET_MODE_SIZE (mode).to_constant ();
17628   /* Check if the offsets are consecutive.  */
17629   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17630     return false;
17631
17632   /* Check if the addresses are clobbered by load.  */
17633   if (load)
17634     {
17635       if (reg_mentioned_p (reg_1, mem_1))
17636         return false;
17637
17638       /* In increasing order, the last load can clobber the address.  */
17639       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17640         return false;
17641     }
17642
17643   /* One of the memory accesses must be a mempair operand.
17644      If it is not the first one, they need to be swapped by the
17645      peephole.  */
17646   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17647        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17648     return false;
17649
17650   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17651     rclass_1 = FP_REGS;
17652   else
17653     rclass_1 = GENERAL_REGS;
17654
17655   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17656     rclass_2 = FP_REGS;
17657   else
17658     rclass_2 = GENERAL_REGS;
17659
17660   /* Check if the registers are of same class.  */
17661   if (rclass_1 != rclass_2)
17662     return false;
17663
17664   return true;
17665 }
17666
17667 /* Given OPERANDS of consecutive load/store that can be merged,
17668    swap them if they are not in ascending order.  */
17669 void
17670 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17671 {
17672   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17673   HOST_WIDE_INT offval_1, offval_2;
17674
17675   if (load)
17676     {
17677       mem_1 = operands[1];
17678       mem_2 = operands[3];
17679     }
17680   else
17681     {
17682       mem_1 = operands[0];
17683       mem_2 = operands[2];
17684     }
17685
17686   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17687   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17688
17689   offval_1 = INTVAL (offset_1);
17690   offval_2 = INTVAL (offset_2);
17691
17692   if (offval_1 > offval_2)
17693     {
17694       /* Irrespective of whether this is a load or a store,
17695          we do the same swap.  */
17696       std::swap (operands[0], operands[2]);
17697       std::swap (operands[1], operands[3]);
17698     }
17699 }
17700
17701 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17702    comparison between the two.  */
17703 int
17704 aarch64_host_wide_int_compare (const void *x, const void *y)
17705 {
17706   return wi::cmps (* ((const HOST_WIDE_INT *) x),
17707                    * ((const HOST_WIDE_INT *) y));
17708 }
17709
17710 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17711    other pointing to a REG rtx containing an offset, compare the offsets
17712    of the two pairs.
17713
17714    Return:
17715
17716         1 iff offset (X) > offset (Y)
17717         0 iff offset (X) == offset (Y)
17718         -1 iff offset (X) < offset (Y)  */
17719 int
17720 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17721 {
17722   const rtx * operands_1 = (const rtx *) x;
17723   const rtx * operands_2 = (const rtx *) y;
17724   rtx mem_1, mem_2, base, offset_1, offset_2;
17725
17726   if (MEM_P (operands_1[0]))
17727     mem_1 = operands_1[0];
17728   else
17729     mem_1 = operands_1[1];
17730
17731   if (MEM_P (operands_2[0]))
17732     mem_2 = operands_2[0];
17733   else
17734     mem_2 = operands_2[1];
17735
17736   /* Extract the offsets.  */
17737   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17738   extract_base_offset_in_addr (mem_2, &base, &offset_2);
17739
17740   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17741
17742   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17743 }
17744
17745 /* Given OPERANDS of consecutive load/store, check if we can merge
17746    them into ldp/stp by adjusting the offset.  LOAD is true if they
17747    are load instructions.  MODE is the mode of memory operands.
17748
17749    Given below consecutive stores:
17750
17751      str  w1, [xb, 0x100]
17752      str  w1, [xb, 0x104]
17753      str  w1, [xb, 0x108]
17754      str  w1, [xb, 0x10c]
17755
17756    Though the offsets are out of the range supported by stp, we can
17757    still pair them after adjusting the offset, like:
17758
17759      add  scratch, xb, 0x100
17760      stp  w1, w1, [scratch]
17761      stp  w1, w1, [scratch, 0x8]
17762
17763    The peephole patterns detecting this opportunity should guarantee
17764    the scratch register is avaliable.  */
17765
17766 bool
17767 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17768                                        scalar_mode mode)
17769 {
17770   const int num_insns = 4;
17771   enum reg_class rclass;
17772   HOST_WIDE_INT offvals[num_insns], msize;
17773   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17774
17775   if (load)
17776     {
17777       for (int i = 0; i < num_insns; i++)
17778         {
17779           reg[i] = operands[2 * i];
17780           mem[i] = operands[2 * i + 1];
17781
17782           gcc_assert (REG_P (reg[i]));
17783         }
17784
17785       /* Do not attempt to merge the loads if the loads clobber each other.  */
17786       for (int i = 0; i < 8; i += 2)
17787         for (int j = i + 2; j < 8; j += 2)
17788           if (reg_overlap_mentioned_p (operands[i], operands[j]))
17789             return false;
17790     }
17791   else
17792     for (int i = 0; i < num_insns; i++)
17793       {
17794         mem[i] = operands[2 * i];
17795         reg[i] = operands[2 * i + 1];
17796       }
17797
17798   /* Skip if memory operand is by itself valid for ldp/stp.  */
17799   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17800     return false;
17801
17802   for (int i = 0; i < num_insns; i++)
17803     {
17804       /* The mems cannot be volatile.  */
17805       if (MEM_VOLATILE_P (mem[i]))
17806         return false;
17807
17808       /* Check if the addresses are in the form of [base+offset].  */
17809       extract_base_offset_in_addr (mem[i], base + i, offset + i);
17810       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17811         return false;
17812     }
17813
17814   /* Check if the registers are of same class.  */
17815   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17816     ? FP_REGS : GENERAL_REGS;
17817
17818   for (int i = 1; i < num_insns; i++)
17819     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17820       {
17821         if (rclass != FP_REGS)
17822           return false;
17823       }
17824     else
17825       {
17826         if (rclass != GENERAL_REGS)
17827           return false;
17828       }
17829
17830   /* Only the last register in the order in which they occur
17831      may be clobbered by the load.  */
17832   if (rclass == GENERAL_REGS && load)
17833     for (int i = 0; i < num_insns - 1; i++)
17834       if (reg_mentioned_p (reg[i], mem[i]))
17835         return false;
17836
17837   /* Check if the bases are same.  */
17838   for (int i = 0; i < num_insns - 1; i++)
17839     if (!rtx_equal_p (base[i], base[i + 1]))
17840       return false;
17841
17842   for (int i = 0; i < num_insns; i++)
17843     offvals[i] = INTVAL (offset[i]);
17844
17845   msize = GET_MODE_SIZE (mode);
17846
17847   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
17848   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17849          aarch64_host_wide_int_compare);
17850
17851   if (!(offvals[1] == offvals[0] + msize
17852         && offvals[3] == offvals[2] + msize))
17853     return false;
17854
17855   /* Check that offsets are within range of each other.  The ldp/stp
17856      instructions have 7 bit immediate offsets, so use 0x80.  */
17857   if (offvals[2] - offvals[0] >= msize * 0x80)
17858     return false;
17859
17860   /* The offsets must be aligned with respect to each other.  */
17861   if (offvals[0] % msize != offvals[2] % msize)
17862     return false;
17863
17864   /* If we have SImode and slow unaligned ldp,
17865      check the alignment to be at least 8 byte. */
17866   if (mode == SImode
17867       && (aarch64_tune_params.extra_tuning_flags
17868           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17869       && !optimize_size
17870       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17871     return false;
17872
17873   return true;
17874 }
17875
17876 /* Given OPERANDS of consecutive load/store, this function pairs them
17877    into LDP/STP after adjusting the offset.  It depends on the fact
17878    that the operands can be sorted so the offsets are correct for STP.
17879    MODE is the mode of memory operands.  CODE is the rtl operator
17880    which should be applied to all memory operands, it's SIGN_EXTEND,
17881    ZERO_EXTEND or UNKNOWN.  */
17882
17883 bool
17884 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17885                              scalar_mode mode, RTX_CODE code)
17886 {
17887   rtx base, offset_1, offset_3, t1, t2;
17888   rtx mem_1, mem_2, mem_3, mem_4;
17889   rtx temp_operands[8];
17890   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17891                 stp_off_upper_limit, stp_off_lower_limit, msize;
17892
17893   /* We make changes on a copy as we may still bail out.  */
17894   for (int i = 0; i < 8; i ++)
17895     temp_operands[i] = operands[i];
17896
17897   /* Sort the operands.  */
17898   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17899
17900   if (load)
17901     {
17902       mem_1 = temp_operands[1];
17903       mem_2 = temp_operands[3];
17904       mem_3 = temp_operands[5];
17905       mem_4 = temp_operands[7];
17906     }
17907   else
17908     {
17909       mem_1 = temp_operands[0];
17910       mem_2 = temp_operands[2];
17911       mem_3 = temp_operands[4];
17912       mem_4 = temp_operands[6];
17913       gcc_assert (code == UNKNOWN);
17914     }
17915
17916   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17917   extract_base_offset_in_addr (mem_3, &base, &offset_3);
17918   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17919               && offset_3 != NULL_RTX);
17920
17921   /* Adjust offset so it can fit in LDP/STP instruction.  */
17922   msize = GET_MODE_SIZE (mode);
17923   stp_off_upper_limit = msize * (0x40 - 1);
17924   stp_off_lower_limit = - msize * 0x40;
17925
17926   off_val_1 = INTVAL (offset_1);
17927   off_val_3 = INTVAL (offset_3);
17928
17929   /* The base offset is optimally half way between the two STP/LDP offsets.  */
17930   if (msize <= 4)
17931     base_off = (off_val_1 + off_val_3) / 2;
17932   else
17933     /* However, due to issues with negative LDP/STP offset generation for
17934        larger modes, for DF, DI and vector modes. we must not use negative
17935        addresses smaller than 9 signed unadjusted bits can store.  This
17936        provides the most range in this case.  */
17937     base_off = off_val_1;
17938
17939   /* Adjust the base so that it is aligned with the addresses but still
17940      optimal.  */
17941   if (base_off % msize != off_val_1 % msize)
17942     /* Fix the offset, bearing in mind we want to make it bigger not
17943        smaller.  */
17944     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17945   else if (msize <= 4)
17946     /* The negative range of LDP/STP is one larger than the positive range.  */
17947     base_off += msize;
17948
17949   /* Check if base offset is too big or too small.  We can attempt to resolve
17950      this issue by setting it to the maximum value and seeing if the offsets
17951      still fit.  */
17952   if (base_off >= 0x1000)
17953     {
17954       base_off = 0x1000 - 1;
17955       /* We must still make sure that the base offset is aligned with respect
17956          to the address.  But it may may not be made any bigger.  */
17957       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17958     }
17959
17960   /* Likewise for the case where the base is too small.  */
17961   if (base_off <= -0x1000)
17962     {
17963       base_off = -0x1000 + 1;
17964       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17965     }
17966
17967   /* Offset of the first STP/LDP.  */
17968   new_off_1 = off_val_1 - base_off;
17969
17970   /* Offset of the second STP/LDP.  */
17971   new_off_3 = off_val_3 - base_off;
17972
17973   /* The offsets must be within the range of the LDP/STP instructions.  */
17974   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17975       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17976     return false;
17977
17978   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17979                                                   new_off_1), true);
17980   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17981                                                   new_off_1 + msize), true);
17982   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17983                                                   new_off_3), true);
17984   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17985                                                   new_off_3 + msize), true);
17986
17987   if (!aarch64_mem_pair_operand (mem_1, mode)
17988       || !aarch64_mem_pair_operand (mem_3, mode))
17989     return false;
17990
17991   if (code == ZERO_EXTEND)
17992     {
17993       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17994       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17995       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17996       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17997     }
17998   else if (code == SIGN_EXTEND)
17999     {
18000       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18001       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18002       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18003       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18004     }
18005
18006   if (load)
18007     {
18008       operands[0] = temp_operands[0];
18009       operands[1] = mem_1;
18010       operands[2] = temp_operands[2];
18011       operands[3] = mem_2;
18012       operands[4] = temp_operands[4];
18013       operands[5] = mem_3;
18014       operands[6] = temp_operands[6];
18015       operands[7] = mem_4;
18016     }
18017   else
18018     {
18019       operands[0] = mem_1;
18020       operands[1] = temp_operands[1];
18021       operands[2] = mem_2;
18022       operands[3] = temp_operands[3];
18023       operands[4] = mem_3;
18024       operands[5] = temp_operands[5];
18025       operands[6] = mem_4;
18026       operands[7] = temp_operands[7];
18027     }
18028
18029   /* Emit adjusting instruction.  */
18030   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18031   /* Emit ldp/stp instructions.  */
18032   t1 = gen_rtx_SET (operands[0], operands[1]);
18033   t2 = gen_rtx_SET (operands[2], operands[3]);
18034   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18035   t1 = gen_rtx_SET (operands[4], operands[5]);
18036   t2 = gen_rtx_SET (operands[6], operands[7]);
18037   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18038   return true;
18039 }
18040
18041 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
18042    it isn't worth branching around empty masked ops (including masked
18043    stores).  */
18044
18045 static bool
18046 aarch64_empty_mask_is_expensive (unsigned)
18047 {
18048   return false;
18049 }
18050
18051 /* Return 1 if pseudo register should be created and used to hold
18052    GOT address for PIC code.  */
18053
18054 bool
18055 aarch64_use_pseudo_pic_reg (void)
18056 {
18057   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18058 }
18059
18060 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
18061
18062 static int
18063 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18064 {
18065   switch (XINT (x, 1))
18066     {
18067     case UNSPEC_GOTSMALLPIC:
18068     case UNSPEC_GOTSMALLPIC28K:
18069     case UNSPEC_GOTTINYPIC:
18070       return 0;
18071     default:
18072       break;
18073     }
18074
18075   return default_unspec_may_trap_p (x, flags);
18076 }
18077
18078
18079 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18080    return the log2 of that value.  Otherwise return -1.  */
18081
18082 int
18083 aarch64_fpconst_pow_of_2 (rtx x)
18084 {
18085   const REAL_VALUE_TYPE *r;
18086
18087   if (!CONST_DOUBLE_P (x))
18088     return -1;
18089
18090   r = CONST_DOUBLE_REAL_VALUE (x);
18091
18092   if (REAL_VALUE_NEGATIVE (*r)
18093       || REAL_VALUE_ISNAN (*r)
18094       || REAL_VALUE_ISINF (*r)
18095       || !real_isinteger (r, DFmode))
18096     return -1;
18097
18098   return exact_log2 (real_to_integer (r));
18099 }
18100
18101 /* If X is a vector of equal CONST_DOUBLE values and that value is
18102    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
18103
18104 int
18105 aarch64_vec_fpconst_pow_of_2 (rtx x)
18106 {
18107   int nelts;
18108   if (GET_CODE (x) != CONST_VECTOR
18109       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18110     return -1;
18111
18112   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18113     return -1;
18114
18115   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18116   if (firstval <= 0)
18117     return -1;
18118
18119   for (int i = 1; i < nelts; i++)
18120     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18121       return -1;
18122
18123   return firstval;
18124 }
18125
18126 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18127    to float.
18128
18129    __fp16 always promotes through this hook.
18130    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18131    through the generic excess precision logic rather than here.  */
18132
18133 static tree
18134 aarch64_promoted_type (const_tree t)
18135 {
18136   if (SCALAR_FLOAT_TYPE_P (t)
18137       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18138     return float_type_node;
18139
18140   return NULL_TREE;
18141 }
18142
18143 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
18144
18145 static bool
18146 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18147                            optimization_type opt_type)
18148 {
18149   switch (op)
18150     {
18151     case rsqrt_optab:
18152       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18153
18154     default:
18155       return true;
18156     }
18157 }
18158
18159 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
18160
18161 static unsigned int
18162 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18163                                         int *offset)
18164 {
18165   /* Polynomial invariant 1 == (VG / 2) - 1.  */
18166   gcc_assert (i == 1);
18167   *factor = 2;
18168   *offset = 1;
18169   return AARCH64_DWARF_VG;
18170 }
18171
18172 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18173    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18174
18175 static bool
18176 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18177 {
18178   return (mode == HFmode
18179           ? true
18180           : default_libgcc_floating_mode_supported_p (mode));
18181 }
18182
18183 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18184    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18185
18186 static bool
18187 aarch64_scalar_mode_supported_p (scalar_mode mode)
18188 {
18189   return (mode == HFmode
18190           ? true
18191           : default_scalar_mode_supported_p (mode));
18192 }
18193
18194 /* Set the value of FLT_EVAL_METHOD.
18195    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18196
18197     0: evaluate all operations and constants, whose semantic type has at
18198        most the range and precision of type float, to the range and
18199        precision of float; evaluate all other operations and constants to
18200        the range and precision of the semantic type;
18201
18202     N, where _FloatN is a supported interchange floating type
18203        evaluate all operations and constants, whose semantic type has at
18204        most the range and precision of _FloatN type, to the range and
18205        precision of the _FloatN type; evaluate all other operations and
18206        constants to the range and precision of the semantic type;
18207
18208    If we have the ARMv8.2-A extensions then we support _Float16 in native
18209    precision, so we should set this to 16.  Otherwise, we support the type,
18210    but want to evaluate expressions in float precision, so set this to
18211    0.  */
18212
18213 static enum flt_eval_method
18214 aarch64_excess_precision (enum excess_precision_type type)
18215 {
18216   switch (type)
18217     {
18218       case EXCESS_PRECISION_TYPE_FAST:
18219       case EXCESS_PRECISION_TYPE_STANDARD:
18220         /* We can calculate either in 16-bit range and precision or
18221            32-bit range and precision.  Make that decision based on whether
18222            we have native support for the ARMv8.2-A 16-bit floating-point
18223            instructions or not.  */
18224         return (TARGET_FP_F16INST
18225                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18226                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18227       case EXCESS_PRECISION_TYPE_IMPLICIT:
18228         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18229       default:
18230         gcc_unreachable ();
18231     }
18232   return FLT_EVAL_METHOD_UNPREDICTABLE;
18233 }
18234
18235 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
18236    scheduled for speculative execution.  Reject the long-running division
18237    and square-root instructions.  */
18238
18239 static bool
18240 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18241 {
18242   switch (get_attr_type (insn))
18243     {
18244       case TYPE_SDIV:
18245       case TYPE_UDIV:
18246       case TYPE_FDIVS:
18247       case TYPE_FDIVD:
18248       case TYPE_FSQRTS:
18249       case TYPE_FSQRTD:
18250       case TYPE_NEON_FP_SQRT_S:
18251       case TYPE_NEON_FP_SQRT_D:
18252       case TYPE_NEON_FP_SQRT_S_Q:
18253       case TYPE_NEON_FP_SQRT_D_Q:
18254       case TYPE_NEON_FP_DIV_S:
18255       case TYPE_NEON_FP_DIV_D:
18256       case TYPE_NEON_FP_DIV_S_Q:
18257       case TYPE_NEON_FP_DIV_D_Q:
18258         return false;
18259       default:
18260         return true;
18261     }
18262 }
18263
18264 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
18265
18266 static int
18267 aarch64_compute_pressure_classes (reg_class *classes)
18268 {
18269   int i = 0;
18270   classes[i++] = GENERAL_REGS;
18271   classes[i++] = FP_REGS;
18272   /* PR_REGS isn't a useful pressure class because many predicate pseudo
18273      registers need to go in PR_LO_REGS at some point during their
18274      lifetime.  Splitting it into two halves has the effect of making
18275      all predicates count against PR_LO_REGS, so that we try whenever
18276      possible to restrict the number of live predicates to 8.  This
18277      greatly reduces the amount of spilling in certain loops.  */
18278   classes[i++] = PR_LO_REGS;
18279   classes[i++] = PR_HI_REGS;
18280   return i;
18281 }
18282
18283 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
18284
18285 static bool
18286 aarch64_can_change_mode_class (machine_mode from,
18287                                machine_mode to, reg_class_t)
18288 {
18289   if (BYTES_BIG_ENDIAN)
18290     {
18291       bool from_sve_p = aarch64_sve_data_mode_p (from);
18292       bool to_sve_p = aarch64_sve_data_mode_p (to);
18293
18294       /* Don't allow changes between SVE data modes and non-SVE modes.
18295          See the comment at the head of aarch64-sve.md for details.  */
18296       if (from_sve_p != to_sve_p)
18297         return false;
18298
18299       /* Don't allow changes in element size: lane 0 of the new vector
18300          would not then be lane 0 of the old vector.  See the comment
18301          above aarch64_maybe_expand_sve_subreg_move for a more detailed
18302          description.
18303
18304          In the worst case, this forces a register to be spilled in
18305          one mode and reloaded in the other, which handles the
18306          endianness correctly.  */
18307       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18308         return false;
18309     }
18310   return true;
18311 }
18312
18313 /* Implement TARGET_EARLY_REMAT_MODES.  */
18314
18315 static void
18316 aarch64_select_early_remat_modes (sbitmap modes)
18317 {
18318   /* SVE values are not normally live across a call, so it should be
18319      worth doing early rematerialization even in VL-specific mode.  */
18320   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18321     {
18322       machine_mode mode = (machine_mode) i;
18323       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18324       if (vec_flags & VEC_ANY_SVE)
18325         bitmap_set_bit (modes, i);
18326     }
18327 }
18328
18329 /* Override the default target speculation_safe_value.  */
18330 static rtx
18331 aarch64_speculation_safe_value (machine_mode mode,
18332                                 rtx result, rtx val, rtx failval)
18333 {
18334   /* Maybe we should warn if falling back to hard barriers.  They are
18335      likely to be noticably more expensive than the alternative below.  */
18336   if (!aarch64_track_speculation)
18337     return default_speculation_safe_value (mode, result, val, failval);
18338
18339   if (!REG_P (val))
18340     val = copy_to_mode_reg (mode, val);
18341
18342   if (!aarch64_reg_or_zero (failval, mode))
18343     failval = copy_to_mode_reg (mode, failval);
18344
18345   emit_insn (gen_despeculate_copy (mode, result, val, failval));
18346   return result;
18347 }
18348
18349 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18350    Look into the tuning structure for an estimate.
18351    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18352    Advanced SIMD 128 bits.  */
18353
18354 static HOST_WIDE_INT
18355 aarch64_estimated_poly_value (poly_int64 val)
18356 {
18357   enum aarch64_sve_vector_bits_enum width_source
18358     = aarch64_tune_params.sve_width;
18359
18360   /* If we still don't have an estimate, use the default.  */
18361   if (width_source == SVE_SCALABLE)
18362     return default_estimated_poly_value (val);
18363
18364   HOST_WIDE_INT over_128 = width_source - 128;
18365   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18366 }
18367
18368 /* Target-specific selftests.  */
18369
18370 #if CHECKING_P
18371
18372 namespace selftest {
18373
18374 /* Selftest for the RTL loader.
18375    Verify that the RTL loader copes with a dump from
18376    print_rtx_function.  This is essentially just a test that class
18377    function_reader can handle a real dump, but it also verifies
18378    that lookup_reg_by_dump_name correctly handles hard regs.
18379    The presence of hard reg names in the dump means that the test is
18380    target-specific, hence it is in this file.  */
18381
18382 static void
18383 aarch64_test_loading_full_dump ()
18384 {
18385   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18386
18387   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18388
18389   rtx_insn *insn_1 = get_insn_by_uid (1);
18390   ASSERT_EQ (NOTE, GET_CODE (insn_1));
18391
18392   rtx_insn *insn_15 = get_insn_by_uid (15);
18393   ASSERT_EQ (INSN, GET_CODE (insn_15));
18394   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18395
18396   /* Verify crtl->return_rtx.  */
18397   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18398   ASSERT_EQ (0, REGNO (crtl->return_rtx));
18399   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18400 }
18401
18402 /* Run all target-specific selftests.  */
18403
18404 static void
18405 aarch64_run_selftests (void)
18406 {
18407   aarch64_test_loading_full_dump ();
18408 }
18409
18410 } // namespace selftest
18411
18412 #endif /* #if CHECKING_P */
18413
18414 #undef TARGET_ADDRESS_COST
18415 #define TARGET_ADDRESS_COST aarch64_address_cost
18416
18417 /* This hook will determines whether unnamed bitfields affect the alignment
18418    of the containing structure.  The hook returns true if the structure
18419    should inherit the alignment requirements of an unnamed bitfield's
18420    type.  */
18421 #undef TARGET_ALIGN_ANON_BITFIELD
18422 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18423
18424 #undef TARGET_ASM_ALIGNED_DI_OP
18425 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18426
18427 #undef TARGET_ASM_ALIGNED_HI_OP
18428 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18429
18430 #undef TARGET_ASM_ALIGNED_SI_OP
18431 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18432
18433 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18434 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18435   hook_bool_const_tree_hwi_hwi_const_tree_true
18436
18437 #undef TARGET_ASM_FILE_START
18438 #define TARGET_ASM_FILE_START aarch64_start_file
18439
18440 #undef TARGET_ASM_OUTPUT_MI_THUNK
18441 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18442
18443 #undef TARGET_ASM_SELECT_RTX_SECTION
18444 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18445
18446 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18447 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18448
18449 #undef TARGET_BUILD_BUILTIN_VA_LIST
18450 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18451
18452 #undef TARGET_CALLEE_COPIES
18453 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18454
18455 #undef TARGET_CAN_ELIMINATE
18456 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18457
18458 #undef TARGET_CAN_INLINE_P
18459 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18460
18461 #undef TARGET_CANNOT_FORCE_CONST_MEM
18462 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18463
18464 #undef TARGET_CASE_VALUES_THRESHOLD
18465 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18466
18467 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18468 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18469
18470 /* Only the least significant bit is used for initialization guard
18471    variables.  */
18472 #undef TARGET_CXX_GUARD_MASK_BIT
18473 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18474
18475 #undef TARGET_C_MODE_FOR_SUFFIX
18476 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18477
18478 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18479 #undef  TARGET_DEFAULT_TARGET_FLAGS
18480 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18481 #endif
18482
18483 #undef TARGET_CLASS_MAX_NREGS
18484 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18485
18486 #undef TARGET_BUILTIN_DECL
18487 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18488
18489 #undef TARGET_BUILTIN_RECIPROCAL
18490 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18491
18492 #undef TARGET_C_EXCESS_PRECISION
18493 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18494
18495 #undef  TARGET_EXPAND_BUILTIN
18496 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18497
18498 #undef TARGET_EXPAND_BUILTIN_VA_START
18499 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18500
18501 #undef TARGET_FOLD_BUILTIN
18502 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18503
18504 #undef TARGET_FUNCTION_ARG
18505 #define TARGET_FUNCTION_ARG aarch64_function_arg
18506
18507 #undef TARGET_FUNCTION_ARG_ADVANCE
18508 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18509
18510 #undef TARGET_FUNCTION_ARG_BOUNDARY
18511 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18512
18513 #undef TARGET_FUNCTION_ARG_PADDING
18514 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18515
18516 #undef TARGET_GET_RAW_RESULT_MODE
18517 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18518 #undef TARGET_GET_RAW_ARG_MODE
18519 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18520
18521 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18522 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18523
18524 #undef TARGET_FUNCTION_VALUE
18525 #define TARGET_FUNCTION_VALUE aarch64_function_value
18526
18527 #undef TARGET_FUNCTION_VALUE_REGNO_P
18528 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18529
18530 #undef TARGET_GIMPLE_FOLD_BUILTIN
18531 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18532
18533 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18534 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18535
18536 #undef  TARGET_INIT_BUILTINS
18537 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
18538
18539 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18540 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18541   aarch64_ira_change_pseudo_allocno_class
18542
18543 #undef TARGET_LEGITIMATE_ADDRESS_P
18544 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18545
18546 #undef TARGET_LEGITIMATE_CONSTANT_P
18547 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18548
18549 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18550 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18551   aarch64_legitimize_address_displacement
18552
18553 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18554 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18555
18556 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18557 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18558 aarch64_libgcc_floating_mode_supported_p
18559
18560 #undef TARGET_MANGLE_TYPE
18561 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18562
18563 #undef TARGET_MEMORY_MOVE_COST
18564 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18565
18566 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18567 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18568
18569 #undef TARGET_MUST_PASS_IN_STACK
18570 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18571
18572 /* This target hook should return true if accesses to volatile bitfields
18573    should use the narrowest mode possible.  It should return false if these
18574    accesses should use the bitfield container type.  */
18575 #undef TARGET_NARROW_VOLATILE_BITFIELD
18576 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18577
18578 #undef  TARGET_OPTION_OVERRIDE
18579 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18580
18581 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18582 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18583   aarch64_override_options_after_change
18584
18585 #undef TARGET_OPTION_SAVE
18586 #define TARGET_OPTION_SAVE aarch64_option_save
18587
18588 #undef TARGET_OPTION_RESTORE
18589 #define TARGET_OPTION_RESTORE aarch64_option_restore
18590
18591 #undef TARGET_OPTION_PRINT
18592 #define TARGET_OPTION_PRINT aarch64_option_print
18593
18594 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18595 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18596
18597 #undef TARGET_SET_CURRENT_FUNCTION
18598 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18599
18600 #undef TARGET_PASS_BY_REFERENCE
18601 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18602
18603 #undef TARGET_PREFERRED_RELOAD_CLASS
18604 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18605
18606 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18607 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18608
18609 #undef TARGET_PROMOTED_TYPE
18610 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18611
18612 #undef TARGET_SECONDARY_RELOAD
18613 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18614
18615 #undef TARGET_SHIFT_TRUNCATION_MASK
18616 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18617
18618 #undef TARGET_SETUP_INCOMING_VARARGS
18619 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18620
18621 #undef TARGET_STRUCT_VALUE_RTX
18622 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
18623
18624 #undef TARGET_REGISTER_MOVE_COST
18625 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18626
18627 #undef TARGET_RETURN_IN_MEMORY
18628 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18629
18630 #undef TARGET_RETURN_IN_MSB
18631 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18632
18633 #undef TARGET_RTX_COSTS
18634 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18635
18636 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18637 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18638
18639 #undef TARGET_SCHED_ISSUE_RATE
18640 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18641
18642 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18643 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18644   aarch64_sched_first_cycle_multipass_dfa_lookahead
18645
18646 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18647 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18648   aarch64_first_cycle_multipass_dfa_lookahead_guard
18649
18650 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18651 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18652   aarch64_get_separate_components
18653
18654 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18655 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18656   aarch64_components_for_bb
18657
18658 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18659 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18660   aarch64_disqualify_components
18661
18662 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18663 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18664   aarch64_emit_prologue_components
18665
18666 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18667 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18668   aarch64_emit_epilogue_components
18669
18670 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18671 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18672   aarch64_set_handled_components
18673
18674 #undef TARGET_TRAMPOLINE_INIT
18675 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18676
18677 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18678 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18679
18680 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18681 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18682
18683 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18684 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18685   aarch64_builtin_support_vector_misalignment
18686
18687 #undef TARGET_ARRAY_MODE
18688 #define TARGET_ARRAY_MODE aarch64_array_mode
18689
18690 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18691 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18692
18693 #undef TARGET_VECTORIZE_ADD_STMT_COST
18694 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18695
18696 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18697 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18698   aarch64_builtin_vectorization_cost
18699
18700 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18701 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18702
18703 #undef TARGET_VECTORIZE_BUILTINS
18704 #define TARGET_VECTORIZE_BUILTINS
18705
18706 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18707 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18708   aarch64_builtin_vectorized_function
18709
18710 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18711 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18712   aarch64_autovectorize_vector_sizes
18713
18714 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18715 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18716   aarch64_atomic_assign_expand_fenv
18717
18718 /* Section anchor support.  */
18719
18720 #undef TARGET_MIN_ANCHOR_OFFSET
18721 #define TARGET_MIN_ANCHOR_OFFSET -256
18722
18723 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18724    byte offset; we can do much more for larger data types, but have no way
18725    to determine the size of the access.  We assume accesses are aligned.  */
18726 #undef TARGET_MAX_ANCHOR_OFFSET
18727 #define TARGET_MAX_ANCHOR_OFFSET 4095
18728
18729 #undef TARGET_VECTOR_ALIGNMENT
18730 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18731
18732 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18733 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18734   aarch64_vectorize_preferred_vector_alignment
18735 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18736 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18737   aarch64_simd_vector_alignment_reachable
18738
18739 /* vec_perm support.  */
18740
18741 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18742 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18743   aarch64_vectorize_vec_perm_const
18744
18745 #undef TARGET_VECTORIZE_GET_MASK_MODE
18746 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18747 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18748 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18749   aarch64_empty_mask_is_expensive
18750 #undef TARGET_PREFERRED_ELSE_VALUE
18751 #define TARGET_PREFERRED_ELSE_VALUE \
18752   aarch64_preferred_else_value
18753
18754 #undef TARGET_INIT_LIBFUNCS
18755 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18756
18757 #undef TARGET_FIXED_CONDITION_CODE_REGS
18758 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18759
18760 #undef TARGET_FLAGS_REGNUM
18761 #define TARGET_FLAGS_REGNUM CC_REGNUM
18762
18763 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18764 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18765
18766 #undef TARGET_ASAN_SHADOW_OFFSET
18767 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18768
18769 #undef TARGET_LEGITIMIZE_ADDRESS
18770 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18771
18772 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18773 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18774
18775 #undef TARGET_CAN_USE_DOLOOP_P
18776 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18777
18778 #undef TARGET_SCHED_ADJUST_PRIORITY
18779 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18780
18781 #undef TARGET_SCHED_MACRO_FUSION_P
18782 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18783
18784 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18785 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18786
18787 #undef TARGET_SCHED_FUSION_PRIORITY
18788 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18789
18790 #undef TARGET_UNSPEC_MAY_TRAP_P
18791 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18792
18793 #undef TARGET_USE_PSEUDO_PIC_REG
18794 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18795
18796 #undef TARGET_PRINT_OPERAND
18797 #define TARGET_PRINT_OPERAND aarch64_print_operand
18798
18799 #undef TARGET_PRINT_OPERAND_ADDRESS
18800 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18801
18802 #undef TARGET_OPTAB_SUPPORTED_P
18803 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18804
18805 #undef TARGET_OMIT_STRUCT_RETURN_REG
18806 #define TARGET_OMIT_STRUCT_RETURN_REG true
18807
18808 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18809 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18810   aarch64_dwarf_poly_indeterminate_value
18811
18812 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
18813 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18814 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18815
18816 #undef TARGET_HARD_REGNO_NREGS
18817 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18818 #undef TARGET_HARD_REGNO_MODE_OK
18819 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18820
18821 #undef TARGET_MODES_TIEABLE_P
18822 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18823
18824 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18825 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18826   aarch64_hard_regno_call_part_clobbered
18827
18828 #undef TARGET_CONSTANT_ALIGNMENT
18829 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18830
18831 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
18832 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
18833   aarch64_stack_clash_protection_alloca_probe_range
18834
18835 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18836 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18837
18838 #undef TARGET_CAN_CHANGE_MODE_CLASS
18839 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18840
18841 #undef TARGET_SELECT_EARLY_REMAT_MODES
18842 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18843
18844 #undef TARGET_SPECULATION_SAFE_VALUE
18845 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18846
18847 #undef TARGET_ESTIMATED_POLY_VALUE
18848 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
18849
18850 #undef TARGET_ATTRIBUTE_TABLE
18851 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18852
18853 #if CHECKING_P
18854 #undef TARGET_RUN_TARGET_SELFTESTS
18855 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18856 #endif /* #if CHECKING_P */
18857
18858 struct gcc_target targetm = TARGET_INITIALIZER;
18859
18860 #include "gt-aarch64.h"