1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
83 {
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
86
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
93
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
96
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
100
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
103
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
106
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
111 };
112
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
119 {}
120
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
131 {}
132
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
139 {}
140
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
143
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
146
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
151
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
169
170 /* Major revision number of the ARM Architecture implemented by the target. */
171 unsigned aarch64_architecture_version;
172
173 /* The processor for which instructions should be scheduled. */
174 enum aarch64_processor aarch64_tune = cortexa53;
175
176 /* Mask to specify which instruction scheduling options should be used. */
177 unsigned long aarch64_tune_flags = 0;
178
179 /* Global flag for PC relative loads. */
180 bool aarch64_pcrelative_literal_loads;
181
182 /* Global flag for whether frame pointer is enabled. */
183 bool aarch64_use_frame_pointer;
184
185 /* Support for command line parsing of boolean flags in the tuning
186 structures. */
187 struct aarch64_flag_desc
188 {
189 const char* name;
190 unsigned int flag;
191 };
192
193 #define AARCH64_FUSION_PAIR(name, internal_name) \
194 { name, AARCH64_FUSE_##internal_name },
195 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
196 {
197 { "none", AARCH64_FUSE_NOTHING },
198 #include "aarch64-fusion-pairs.def"
199 { "all", AARCH64_FUSE_ALL },
200 { NULL, AARCH64_FUSE_NOTHING }
201 };
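/* The table above is built with the "X macro" pattern: each
   AARCH64_FUSION_PAIR entry in aarch64-fusion-pairs.def expands through
   the #define above into a name/flag pair.  As an illustration (assumed
   entry form), AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) would expand to
   { "mov+movk", AARCH64_FUSE_MOV_MOVK }, giving both the string a user can
   pass on the command line and the internal flag it maps to.  */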
202
203 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
204 { name, AARCH64_EXTRA_TUNE_##internal_name },
205 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
206 {
207 { "none", AARCH64_EXTRA_TUNE_NONE },
208 #include "aarch64-tuning-flags.def"
209 { "all", AARCH64_EXTRA_TUNE_ALL },
210 { NULL, AARCH64_EXTRA_TUNE_NONE }
211 };
212
213 /* Tuning parameters. */
214
215 static const struct cpu_addrcost_table generic_addrcost_table =
216 {
217 {
218 1, /* hi */
219 0, /* si */
220 0, /* di */
221 1, /* ti */
222 },
223 0, /* pre_modify */
224 0, /* post_modify */
225 0, /* register_offset */
226 0, /* register_sextend */
227 0, /* register_zextend */
228 0 /* imm_offset */
229 };
230
231 static const struct cpu_addrcost_table exynosm1_addrcost_table =
232 {
233 {
234 0, /* hi */
235 0, /* si */
236 0, /* di */
237 2, /* ti */
238 },
239 0, /* pre_modify */
240 0, /* post_modify */
241 1, /* register_offset */
242 1, /* register_sextend */
243 2, /* register_zextend */
244 0, /* imm_offset */
245 };
246
247 static const struct cpu_addrcost_table xgene1_addrcost_table =
248 {
249 {
250 1, /* hi */
251 0, /* si */
252 0, /* di */
253 1, /* ti */
254 },
255 1, /* pre_modify */
256 0, /* post_modify */
257 0, /* register_offset */
258 1, /* register_sextend */
259 1, /* register_zextend */
260 0, /* imm_offset */
261 };
262
263 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
264 {
265 {
266 1, /* hi */
267 1, /* si */
268 1, /* di */
269 2, /* ti */
270 },
271 0, /* pre_modify */
272 0, /* post_modify */
273 2, /* register_offset */
274 3, /* register_sextend */
275 3, /* register_zextend */
276 0, /* imm_offset */
277 };
278
279 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
280 {
281 {
282 1, /* hi */
283 1, /* si */
284 1, /* di */
285 2, /* ti */
286 },
287 1, /* pre_modify */
288 1, /* post_modify */
289 3, /* register_offset */
290 3, /* register_sextend */
291 3, /* register_zextend */
292 2, /* imm_offset */
293 };
294
295 static const struct cpu_regmove_cost generic_regmove_cost =
296 {
297 1, /* GP2GP */
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
300 5, /* GP2FP */
301 5, /* FP2GP */
302 2 /* FP2FP */
303 };
304
305 static const struct cpu_regmove_cost cortexa57_regmove_cost =
306 {
307 1, /* GP2GP */
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 5, /* GP2FP */
311 5, /* FP2GP */
312 2 /* FP2FP */
313 };
314
315 static const struct cpu_regmove_cost cortexa53_regmove_cost =
316 {
317 1, /* GP2GP */
318 /* Avoid the use of slow int<->fp moves for spilling by setting
319 their cost higher than memmov_cost. */
320 5, /* GP2FP */
321 5, /* FP2GP */
322 2 /* FP2FP */
323 };
324
325 static const struct cpu_regmove_cost exynosm1_regmove_cost =
326 {
327 1, /* GP2GP */
328 /* Avoid the use of slow int<->fp moves for spilling by setting
329      their cost higher than memmov_cost (actual values: 4 and 9). */
330 9, /* GP2FP */
331 9, /* FP2GP */
332 1 /* FP2FP */
333 };
334
335 static const struct cpu_regmove_cost thunderx_regmove_cost =
336 {
337 2, /* GP2GP */
338 2, /* GP2FP */
339 6, /* FP2GP */
340 4 /* FP2FP */
341 };
342
343 static const struct cpu_regmove_cost xgene1_regmove_cost =
344 {
345 1, /* GP2GP */
346 /* Avoid the use of slow int<->fp moves for spilling by setting
347 their cost higher than memmov_cost. */
348 8, /* GP2FP */
349 8, /* FP2GP */
350 2 /* FP2FP */
351 };
352
353 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
354 {
355 2, /* GP2GP */
356 /* Avoid the use of int<->fp moves for spilling. */
357 6, /* GP2FP */
358 6, /* FP2GP */
359 4 /* FP2FP */
360 };
361
362 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
363 {
364 1, /* GP2GP */
365 /* Avoid the use of int<->fp moves for spilling. */
366 8, /* GP2FP */
367 8, /* FP2GP */
368 4 /* FP2FP */
369 };
370
371 /* Generic costs for vector insn classes. */
372 static const struct cpu_vector_cost generic_vector_cost =
373 {
374 1, /* scalar_int_stmt_cost */
375 1, /* scalar_fp_stmt_cost */
376 1, /* scalar_load_cost */
377 1, /* scalar_store_cost */
378 1, /* vec_int_stmt_cost */
379 1, /* vec_fp_stmt_cost */
380 2, /* vec_permute_cost */
381 1, /* vec_to_scalar_cost */
382 1, /* scalar_to_vec_cost */
383 1, /* vec_align_load_cost */
384 1, /* vec_unalign_load_cost */
385 1, /* vec_unalign_store_cost */
386 1, /* vec_store_cost */
387 3, /* cond_taken_branch_cost */
388 1 /* cond_not_taken_branch_cost */
389 };
390
391 /* QDF24XX costs for vector insn classes. */
392 static const struct cpu_vector_cost qdf24xx_vector_cost =
393 {
394 1, /* scalar_int_stmt_cost */
395 1, /* scalar_fp_stmt_cost */
396 1, /* scalar_load_cost */
397 1, /* scalar_store_cost */
398 1, /* vec_int_stmt_cost */
399 3, /* vec_fp_stmt_cost */
400 2, /* vec_permute_cost */
401 1, /* vec_to_scalar_cost */
402 1, /* scalar_to_vec_cost */
403 1, /* vec_align_load_cost */
404 1, /* vec_unalign_load_cost */
405 1, /* vec_unalign_store_cost */
406 1, /* vec_store_cost */
407 3, /* cond_taken_branch_cost */
408 1 /* cond_not_taken_branch_cost */
409 };
410
411 /* ThunderX costs for vector insn classes. */
412 static const struct cpu_vector_cost thunderx_vector_cost =
413 {
414 1, /* scalar_int_stmt_cost */
415 1, /* scalar_fp_stmt_cost */
416 3, /* scalar_load_cost */
417 1, /* scalar_store_cost */
418 4, /* vec_int_stmt_cost */
419 1, /* vec_fp_stmt_cost */
420 4, /* vec_permute_cost */
421 2, /* vec_to_scalar_cost */
422 2, /* scalar_to_vec_cost */
423 3, /* vec_align_load_cost */
424 5, /* vec_unalign_load_cost */
425 5, /* vec_unalign_store_cost */
426 1, /* vec_store_cost */
427 3, /* cond_taken_branch_cost */
428 3 /* cond_not_taken_branch_cost */
429 };
430
431 /* Cortex-A57 costs for vector insn classes. */
432 static const struct cpu_vector_cost cortexa57_vector_cost =
433 {
434 1, /* scalar_int_stmt_cost */
435 1, /* scalar_fp_stmt_cost */
436 4, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_int_stmt_cost */
439 2, /* vec_fp_stmt_cost */
440 3, /* vec_permute_cost */
441 8, /* vec_to_scalar_cost */
442 8, /* scalar_to_vec_cost */
443 4, /* vec_align_load_cost */
444 4, /* vec_unalign_load_cost */
445 1, /* vec_unalign_store_cost */
446 1, /* vec_store_cost */
447 1, /* cond_taken_branch_cost */
448 1 /* cond_not_taken_branch_cost */
449 };
450
451 static const struct cpu_vector_cost exynosm1_vector_cost =
452 {
453 1, /* scalar_int_stmt_cost */
454 1, /* scalar_fp_stmt_cost */
455 5, /* scalar_load_cost */
456 1, /* scalar_store_cost */
457 3, /* vec_int_stmt_cost */
458 3, /* vec_fp_stmt_cost */
459 3, /* vec_permute_cost */
460 3, /* vec_to_scalar_cost */
461 3, /* scalar_to_vec_cost */
462 5, /* vec_align_load_cost */
463 5, /* vec_unalign_load_cost */
464 1, /* vec_unalign_store_cost */
465 1, /* vec_store_cost */
466 1, /* cond_taken_branch_cost */
467 1 /* cond_not_taken_branch_cost */
468 };
469
470 /* X-Gene 1 costs for vector insn classes. */
471 static const struct cpu_vector_cost xgene1_vector_cost =
472 {
473 1, /* scalar_int_stmt_cost */
474 1, /* scalar_fp_stmt_cost */
475 5, /* scalar_load_cost */
476 1, /* scalar_store_cost */
477 2, /* vec_int_stmt_cost */
478 2, /* vec_fp_stmt_cost */
479 2, /* vec_permute_cost */
480 4, /* vec_to_scalar_cost */
481 4, /* scalar_to_vec_cost */
482 10, /* vec_align_load_cost */
483 10, /* vec_unalign_load_cost */
484 2, /* vec_unalign_store_cost */
485 2, /* vec_store_cost */
486 2, /* cond_taken_branch_cost */
487 1 /* cond_not_taken_branch_cost */
488 };
489
490 /* Costs for vector insn classes for Vulcan. */
491 static const struct cpu_vector_cost thunderx2t99_vector_cost =
492 {
493 1, /* scalar_int_stmt_cost */
494 6, /* scalar_fp_stmt_cost */
495 4, /* scalar_load_cost */
496 1, /* scalar_store_cost */
497 5, /* vec_int_stmt_cost */
498 6, /* vec_fp_stmt_cost */
499 3, /* vec_permute_cost */
500 6, /* vec_to_scalar_cost */
501 5, /* scalar_to_vec_cost */
502 8, /* vec_align_load_cost */
503 8, /* vec_unalign_load_cost */
504 4, /* vec_unalign_store_cost */
505 4, /* vec_store_cost */
506 2, /* cond_taken_branch_cost */
507 1 /* cond_not_taken_branch_cost */
508 };
509
510 /* Generic costs for branch instructions. */
511 static const struct cpu_branch_cost generic_branch_cost =
512 {
513 1, /* Predictable. */
514 3 /* Unpredictable. */
515 };
516
517 /* Generic approximation modes. */
518 static const cpu_approx_modes generic_approx_modes =
519 {
520 AARCH64_APPROX_NONE, /* division */
521 AARCH64_APPROX_NONE, /* sqrt */
522 AARCH64_APPROX_NONE /* recip_sqrt */
523 };
524
525 /* Approximation modes for Exynos M1. */
526 static const cpu_approx_modes exynosm1_approx_modes =
527 {
528 AARCH64_APPROX_NONE, /* division */
529 AARCH64_APPROX_ALL, /* sqrt */
530 AARCH64_APPROX_ALL /* recip_sqrt */
531 };
532
533 /* Approximation modes for X-Gene 1. */
534 static const cpu_approx_modes xgene1_approx_modes =
535 {
536 AARCH64_APPROX_NONE, /* division */
537 AARCH64_APPROX_NONE, /* sqrt */
538 AARCH64_APPROX_ALL /* recip_sqrt */
539 };
540
541 /* Generic prefetch settings (which disable prefetch). */
542 static const cpu_prefetch_tune generic_prefetch_tune =
543 {
544 0, /* num_slots */
545 -1, /* l1_cache_size */
546 -1, /* l1_cache_line_size */
547 -1, /* l2_cache_size */
548 true, /* prefetch_dynamic_strides */
549 -1, /* minimum_stride */
550 -1 /* default_opt_level */
551 };
552
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
554 {
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 true, /* prefetch_dynamic_strides */
560 -1, /* minimum_stride */
561 -1 /* default_opt_level */
562 };
563
564 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
565 {
566 4, /* num_slots */
567 32, /* l1_cache_size */
568 64, /* l1_cache_line_size */
569 512, /* l2_cache_size */
570 false, /* prefetch_dynamic_strides */
571 2048, /* minimum_stride */
572 3 /* default_opt_level */
573 };
574
575 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
576 {
577 8, /* num_slots */
578 32, /* l1_cache_size */
579 128, /* l1_cache_line_size */
580 16*1024, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 3 /* default_opt_level */
584 };
585
586 static const cpu_prefetch_tune thunderx_prefetch_tune =
587 {
588 8, /* num_slots */
589 32, /* l1_cache_size */
590 128, /* l1_cache_line_size */
591 -1, /* l2_cache_size */
592 true, /* prefetch_dynamic_strides */
593 -1, /* minimum_stride */
594 -1 /* default_opt_level */
595 };
596
597 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
598 {
599 8, /* num_slots */
600 32, /* l1_cache_size */
601 64, /* l1_cache_line_size */
602 256, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 -1 /* default_opt_level */
606 };
607
608 static const struct tune_params generic_tunings =
609 {
610 &cortexa57_extra_costs,
611 &generic_addrcost_table,
612 &generic_regmove_cost,
613 &generic_vector_cost,
614 &generic_branch_cost,
615 &generic_approx_modes,
616 4, /* memmov_cost */
617 2, /* issue_rate */
618 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
619 "8", /* function_align. */
620 "4", /* jump_align. */
621 "8", /* loop_align. */
622 2, /* int_reassoc_width. */
623 4, /* fp_reassoc_width. */
624 1, /* vec_reassoc_width. */
625 2, /* min_div_recip_mul_sf. */
626 2, /* min_div_recip_mul_df. */
627 0, /* max_case_values. */
628 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
629 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
630 &generic_prefetch_tune
631 };
632
633 static const struct tune_params cortexa35_tunings =
634 {
635 &cortexa53_extra_costs,
636 &generic_addrcost_table,
637 &cortexa53_regmove_cost,
638 &generic_vector_cost,
639 &generic_branch_cost,
640 &generic_approx_modes,
641 4, /* memmov_cost */
642 1, /* issue_rate */
643 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
644 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
645 "16", /* function_align. */
646 "4", /* jump_align. */
647 "8", /* loop_align. */
648 2, /* int_reassoc_width. */
649 4, /* fp_reassoc_width. */
650 1, /* vec_reassoc_width. */
651 2, /* min_div_recip_mul_sf. */
652 2, /* min_div_recip_mul_df. */
653 0, /* max_case_values. */
654 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
655 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
656 &generic_prefetch_tune
657 };
658
659 static const struct tune_params cortexa53_tunings =
660 {
661 &cortexa53_extra_costs,
662 &generic_addrcost_table,
663 &cortexa53_regmove_cost,
664 &generic_vector_cost,
665 &generic_branch_cost,
666 &generic_approx_modes,
667 4, /* memmov_cost */
668 2, /* issue_rate */
669 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
670 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
671 "16", /* function_align. */
672 "4", /* jump_align. */
673 "8", /* loop_align. */
674 2, /* int_reassoc_width. */
675 4, /* fp_reassoc_width. */
676 1, /* vec_reassoc_width. */
677 2, /* min_div_recip_mul_sf. */
678 2, /* min_div_recip_mul_df. */
679 0, /* max_case_values. */
680 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
681 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
682 &generic_prefetch_tune
683 };
684
685 static const struct tune_params cortexa57_tunings =
686 {
687 &cortexa57_extra_costs,
688 &generic_addrcost_table,
689 &cortexa57_regmove_cost,
690 &cortexa57_vector_cost,
691 &generic_branch_cost,
692 &generic_approx_modes,
693 4, /* memmov_cost */
694 3, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
696 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
697 "16", /* function_align. */
698 "4", /* jump_align. */
699 "8", /* loop_align. */
700 2, /* int_reassoc_width. */
701 4, /* fp_reassoc_width. */
702 1, /* vec_reassoc_width. */
703 2, /* min_div_recip_mul_sf. */
704 2, /* min_div_recip_mul_df. */
705 0, /* max_case_values. */
706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
707 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
708 &generic_prefetch_tune
709 };
710
711 static const struct tune_params cortexa72_tunings =
712 {
713 &cortexa57_extra_costs,
714 &generic_addrcost_table,
715 &cortexa57_regmove_cost,
716 &cortexa57_vector_cost,
717 &generic_branch_cost,
718 &generic_approx_modes,
719 4, /* memmov_cost */
720 3, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa73_tunings =
738 {
739 &cortexa57_extra_costs,
740 &generic_addrcost_table,
741 &cortexa57_regmove_cost,
742 &cortexa57_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 4, /* memmov_cost. */
746 2, /* issue_rate. */
747 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
748 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
749 "16", /* function_align. */
750 "4", /* jump_align. */
751 "8", /* loop_align. */
752 2, /* int_reassoc_width. */
753 4, /* fp_reassoc_width. */
754 1, /* vec_reassoc_width. */
755 2, /* min_div_recip_mul_sf. */
756 2, /* min_div_recip_mul_df. */
757 0, /* max_case_values. */
758 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
759 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
760 &generic_prefetch_tune
761 };
762
763
764
765 static const struct tune_params exynosm1_tunings =
766 {
767 &exynosm1_extra_costs,
768 &exynosm1_addrcost_table,
769 &exynosm1_regmove_cost,
770 &exynosm1_vector_cost,
771 &generic_branch_cost,
772 &exynosm1_approx_modes,
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
776 "4", /* function_align. */
777 "4", /* jump_align. */
778 "4", /* loop_align. */
779 2, /* int_reassoc_width. */
780 4, /* fp_reassoc_width. */
781 1, /* vec_reassoc_width. */
782 2, /* min_div_recip_mul_sf. */
783 2, /* min_div_recip_mul_df. */
784 48, /* max_case_values. */
785 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
786 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
787 &exynosm1_prefetch_tune
788 };
789
790 static const struct tune_params thunderxt88_tunings =
791 {
792 &thunderx_extra_costs,
793 &generic_addrcost_table,
794 &thunderx_regmove_cost,
795 &thunderx_vector_cost,
796 &generic_branch_cost,
797 &generic_approx_modes,
798 6, /* memmov_cost */
799 2, /* issue_rate */
800 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
801 "8", /* function_align. */
802 "8", /* jump_align. */
803 "8", /* loop_align. */
804 2, /* int_reassoc_width. */
805 4, /* fp_reassoc_width. */
806 1, /* vec_reassoc_width. */
807 2, /* min_div_recip_mul_sf. */
808 2, /* min_div_recip_mul_df. */
809 0, /* max_case_values. */
810 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
811 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
812 &thunderxt88_prefetch_tune
813 };
814
815 static const struct tune_params thunderx_tunings =
816 {
817 &thunderx_extra_costs,
818 &generic_addrcost_table,
819 &thunderx_regmove_cost,
820 &thunderx_vector_cost,
821 &generic_branch_cost,
822 &generic_approx_modes,
823 6, /* memmov_cost */
824 2, /* issue_rate */
825 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
826 "8", /* function_align. */
827 "8", /* jump_align. */
828 "8", /* loop_align. */
829 2, /* int_reassoc_width. */
830 4, /* fp_reassoc_width. */
831 1, /* vec_reassoc_width. */
832 2, /* min_div_recip_mul_sf. */
833 2, /* min_div_recip_mul_df. */
834 0, /* max_case_values. */
835 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
836 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
837 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
838 &thunderx_prefetch_tune
839 };
840
841 static const struct tune_params xgene1_tunings =
842 {
843 &xgene1_extra_costs,
844 &xgene1_addrcost_table,
845 &xgene1_regmove_cost,
846 &xgene1_vector_cost,
847 &generic_branch_cost,
848 &xgene1_approx_modes,
849 6, /* memmov_cost */
850 4, /* issue_rate */
851 AARCH64_FUSE_NOTHING, /* fusible_ops */
852 "16", /* function_align. */
853 "8", /* jump_align. */
854 "16", /* loop_align. */
855 2, /* int_reassoc_width. */
856 4, /* fp_reassoc_width. */
857 1, /* vec_reassoc_width. */
858 2, /* min_div_recip_mul_sf. */
859 2, /* min_div_recip_mul_df. */
860 0, /* max_case_values. */
861 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
862 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
863 &generic_prefetch_tune
864 };
865
866 static const struct tune_params qdf24xx_tunings =
867 {
868 &qdf24xx_extra_costs,
869 &qdf24xx_addrcost_table,
870 &qdf24xx_regmove_cost,
871 &qdf24xx_vector_cost,
872 &generic_branch_cost,
873 &generic_approx_modes,
874 4, /* memmov_cost */
875 4, /* issue_rate */
876 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
877    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
878 "16", /* function_align. */
879 "8", /* jump_align. */
880 "16", /* loop_align. */
881 2, /* int_reassoc_width. */
882 4, /* fp_reassoc_width. */
883 1, /* vec_reassoc_width. */
884 2, /* min_div_recip_mul_sf. */
885 2, /* min_div_recip_mul_df. */
886 0, /* max_case_values. */
887 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
888 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
889 &qdf24xx_prefetch_tune
890 };
891
892 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
893 for now. */
894 static const struct tune_params saphira_tunings =
895 {
896 &generic_extra_costs,
897 &generic_addrcost_table,
898 &generic_regmove_cost,
899 &generic_vector_cost,
900 &generic_branch_cost,
901 &generic_approx_modes,
902 4, /* memmov_cost */
903 4, /* issue_rate */
904 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
905    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
906 "16", /* function_align. */
907 "8", /* jump_align. */
908 "16", /* loop_align. */
909 2, /* int_reassoc_width. */
910 4, /* fp_reassoc_width. */
911 1, /* vec_reassoc_width. */
912 2, /* min_div_recip_mul_sf. */
913 2, /* min_div_recip_mul_df. */
914 0, /* max_case_values. */
915 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
916 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
917 &generic_prefetch_tune
918 };
919
920 static const struct tune_params thunderx2t99_tunings =
921 {
922 &thunderx2t99_extra_costs,
923 &thunderx2t99_addrcost_table,
924 &thunderx2t99_regmove_cost,
925 &thunderx2t99_vector_cost,
926 &generic_branch_cost,
927 &generic_approx_modes,
928 4, /* memmov_cost. */
929 4, /* issue_rate. */
930 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
931 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
932 "16", /* function_align. */
933 "8", /* jump_align. */
934 "16", /* loop_align. */
935 3, /* int_reassoc_width. */
936 2, /* fp_reassoc_width. */
937 2, /* vec_reassoc_width. */
938 2, /* min_div_recip_mul_sf. */
939 2, /* min_div_recip_mul_df. */
940 0, /* max_case_values. */
941 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
942 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
943 &thunderx2t99_prefetch_tune
944 };
945
946 /* Support for fine-grained override of the tuning structures. */
947 struct aarch64_tuning_override_function
948 {
949 const char* name;
950 void (*parse_override)(const char*, struct tune_params*);
951 };
952
953 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
954 static void aarch64_parse_tune_string (const char*, struct tune_params*);
955
956 static const struct aarch64_tuning_override_function
957 aarch64_tuning_override_functions[] =
958 {
959 { "fuse", aarch64_parse_fuse_string },
960 { "tune", aarch64_parse_tune_string },
961 { NULL, NULL }
962 };
963
964 /* A processor implementing AArch64. */
965 struct processor
966 {
967 const char *const name;
968 enum aarch64_processor ident;
969 enum aarch64_processor sched_core;
970 enum aarch64_arch arch;
971 unsigned architecture_version;
972 const unsigned long flags;
973 const struct tune_params *const tune;
974 };
975
976 /* Architectures implementing AArch64. */
977 static const struct processor all_architectures[] =
978 {
979 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
980 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
981 #include "aarch64-arches.def"
982 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
983 };
984
985 /* Processor cores implementing AArch64. */
986 static const struct processor all_cores[] =
987 {
988 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
989 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
990 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
991 FLAGS, &COSTS##_tunings},
992 #include "aarch64-cores.def"
993 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
994 AARCH64_FL_FOR_ARCH8, &generic_tunings},
995 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
996 };
997
998
999 /* Target specification. These are populated by the -march, -mtune, -mcpu
1000 handling code or by target attributes. */
1001 static const struct processor *selected_arch;
1002 static const struct processor *selected_cpu;
1003 static const struct processor *selected_tune;
1004
1005 /* The current tuning set. */
1006 struct tune_params aarch64_tune_params = generic_tunings;
1007
1008 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1009
1010 /* An ISA extension in the co-processor and main instruction set space. */
1011 struct aarch64_option_extension
1012 {
1013 const char *const name;
1014 const unsigned long flags_on;
1015 const unsigned long flags_off;
1016 };
1017
1018 typedef enum aarch64_cond_code
1019 {
1020 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1021 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1022 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1023 }
1024 aarch64_cc;
1025
1026 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1027
1028 /* The condition codes of the processor, and the inverse function. */
1029 static const char * const aarch64_condition_codes[] =
1030 {
1031 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1032 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1033 };
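/* Example of the inverse-condition trick used above: the codes are laid
   out in complementary pairs that differ only in bit 0, so flipping the
   low bit with AARCH64_INVERSE_CONDITION_CODE maps AARCH64_EQ (0) to
   AARCH64_NE (1), AARCH64_GE (10) to AARCH64_LT (11), and so on.  */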
1034
1035 /* Generate code to enable conditional branches in functions over 1 MiB. */
1036 const char *
1037 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1038 const char * branch_format)
1039 {
1040 rtx_code_label * tmp_label = gen_label_rtx ();
1041 char label_buf[256];
1042 char buffer[128];
1043 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1044 CODE_LABEL_NUMBER (tmp_label));
1045 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1046 rtx dest_label = operands[pos_label];
1047 operands[pos_label] = tmp_label;
1048
1049 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1050 output_asm_insn (buffer, operands);
1051
1052 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1053 operands[pos_label] = dest_label;
1054 output_asm_insn (buffer, operands);
1055 return "";
1056 }
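/* A rough sketch of the output produced above, assuming BRANCH_FORMAT is
   the already-inverted short-range conditional branch supplied by the
   caller (the label name below is illustrative):

       <inverted cond branch>  .Lfar_N      // skip the hop if the branch
                                            // should not be taken
       b       <original destination>       // unconditional B, +/-128 MiB
   .Lfar_N:

   so only the unconditional B needs to reach the far target.  */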
1057
1058 void
1059 aarch64_err_no_fpadvsimd (machine_mode mode)
1060 {
1061 if (TARGET_GENERAL_REGS_ONLY)
1062 if (FLOAT_MODE_P (mode))
1063 error ("%qs is incompatible with the use of floating-point types",
1064 "-mgeneral-regs-only");
1065 else
1066 error ("%qs is incompatible with the use of vector types",
1067 "-mgeneral-regs-only");
1068 else
1069 if (FLOAT_MODE_P (mode))
1070 error ("%qs feature modifier is incompatible with the use of"
1071 " floating-point types", "+nofp");
1072 else
1073 error ("%qs feature modifier is incompatible with the use of"
1074 " vector types", "+nofp");
1075 }
1076
1077 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1078 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1079 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1080 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1081 and GENERAL_REGS is lower than the memory cost (in this case the best class
1082    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1083 cost results in bad allocations with many redundant int<->FP moves which
1084 are expensive on various cores.
1085 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1086 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1087 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1088 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1089 The result of this is that it is no longer inefficient to have a higher
1090 memory move cost than the register move cost.
1091 */
1092
1093 static reg_class_t
1094 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1095 reg_class_t best_class)
1096 {
1097 machine_mode mode;
1098
1099 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1100 || !reg_class_subset_p (FP_REGS, allocno_class))
1101 return allocno_class;
1102
1103 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1104 || !reg_class_subset_p (FP_REGS, best_class))
1105 return best_class;
1106
1107 mode = PSEUDO_REGNO_MODE (regno);
1108 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1109 }
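/* For example (illustrative): a DFmode pseudo whose allocno class and best
   class are both POINTER_AND_FP_REGS is forced into FP_REGS by the mode
   check above, while a DImode pseudo in the same situation is forced into
   GENERAL_REGS.  */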
1110
1111 static unsigned int
1112 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1113 {
1114 if (GET_MODE_UNIT_SIZE (mode) == 4)
1115 return aarch64_tune_params.min_div_recip_mul_sf;
1116 return aarch64_tune_params.min_div_recip_mul_df;
1117 }
1118
1119 /* Return the reassociation width of treeop OPC with mode MODE. */
1120 static int
1121 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1122 {
1123 if (VECTOR_MODE_P (mode))
1124 return aarch64_tune_params.vec_reassoc_width;
1125 if (INTEGRAL_MODE_P (mode))
1126 return aarch64_tune_params.int_reassoc_width;
1127 /* Avoid reassociating floating point addition so we emit more FMAs. */
1128 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1129 return aarch64_tune_params.fp_reassoc_width;
1130 return 1;
1131 }
1132
1133 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1134 unsigned
1135 aarch64_dbx_register_number (unsigned regno)
1136 {
1137 if (GP_REGNUM_P (regno))
1138 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1139 else if (regno == SP_REGNUM)
1140 return AARCH64_DWARF_SP;
1141 else if (FP_REGNUM_P (regno))
1142 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1143 else if (PR_REGNUM_P (regno))
1144 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1145 else if (regno == VG_REGNUM)
1146 return AARCH64_DWARF_VG;
1147
1148 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1149 equivalent DWARF register. */
1150 return DWARF_FRAME_REGISTERS;
1151 }
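/* For example, under the numbering used here x0...x30 map to DWARF
   registers AARCH64_DWARF_R0 + 0...30, the stack pointer maps to
   AARCH64_DWARF_SP, and v0...v31 map to AARCH64_DWARF_V0 + 0...31;
   anything else (e.g. the condition flags) gets a value >=
   DWARF_FRAME_REGISTERS, meaning "no DWARF equivalent".  */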
1152
1153 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1154 static bool
1155 aarch64_advsimd_struct_mode_p (machine_mode mode)
1156 {
1157 return (TARGET_SIMD
1158 && (mode == OImode || mode == CImode || mode == XImode));
1159 }
1160
1161 /* Return true if MODE is an SVE predicate mode. */
1162 static bool
1163 aarch64_sve_pred_mode_p (machine_mode mode)
1164 {
1165 return (TARGET_SVE
1166 && (mode == VNx16BImode
1167 || mode == VNx8BImode
1168 || mode == VNx4BImode
1169 || mode == VNx2BImode));
1170 }
1171
1172 /* Three mutually-exclusive flags describing a vector or predicate type. */
1173 const unsigned int VEC_ADVSIMD = 1;
1174 const unsigned int VEC_SVE_DATA = 2;
1175 const unsigned int VEC_SVE_PRED = 4;
1176 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1177 a structure of 2, 3 or 4 vectors. */
1178 const unsigned int VEC_STRUCT = 8;
1179 /* Useful combinations of the above. */
1180 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1181 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1182
1183 /* Return a set of flags describing the vector properties of mode MODE.
1184 Ignore modes that are not supported by the current target. */
1185 static unsigned int
1186 aarch64_classify_vector_mode (machine_mode mode)
1187 {
1188 if (aarch64_advsimd_struct_mode_p (mode))
1189 return VEC_ADVSIMD | VEC_STRUCT;
1190
1191 if (aarch64_sve_pred_mode_p (mode))
1192 return VEC_SVE_PRED;
1193
1194 scalar_mode inner = GET_MODE_INNER (mode);
1195 if (VECTOR_MODE_P (mode)
1196 && (inner == QImode
1197 || inner == HImode
1198 || inner == HFmode
1199 || inner == SImode
1200 || inner == SFmode
1201 || inner == DImode
1202 || inner == DFmode))
1203 {
1204 if (TARGET_SVE)
1205 {
1206 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1207 return VEC_SVE_DATA;
1208 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1209 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1210 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1211 return VEC_SVE_DATA | VEC_STRUCT;
1212 }
1213
1214 /* This includes V1DF but not V1DI (which doesn't exist). */
1215 if (TARGET_SIMD
1216 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1217 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1218 return VEC_ADVSIMD;
1219 }
1220
1221 return 0;
1222 }
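/* Some illustrative classifications (assuming the relevant target features
   are enabled): V4SImode is a 128-bit Advanced SIMD vector and yields
   VEC_ADVSIMD; OImode (a pair of Q registers) yields
   VEC_ADVSIMD | VEC_STRUCT; VNx4SImode yields VEC_SVE_DATA; and
   VNx4BImode yields VEC_SVE_PRED.  Unsupported or scalar modes return 0.  */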
1223
1224 /* Return true if MODE is any of the data vector modes, including
1225 structure modes. */
1226 static bool
1227 aarch64_vector_data_mode_p (machine_mode mode)
1228 {
1229 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1230 }
1231
1232 /* Return true if MODE is an SVE data vector mode; either a single vector
1233 or a structure of vectors. */
1234 static bool
1235 aarch64_sve_data_mode_p (machine_mode mode)
1236 {
1237 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1238 }
1239
1240 /* Implement target hook TARGET_ARRAY_MODE. */
1241 static opt_machine_mode
1242 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1243 {
1244 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1245 && IN_RANGE (nelems, 2, 4))
1246 return mode_for_vector (GET_MODE_INNER (mode),
1247 GET_MODE_NUNITS (mode) * nelems);
1248
1249 return opt_machine_mode ();
1250 }
1251
1252 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1253 static bool
1254 aarch64_array_mode_supported_p (machine_mode mode,
1255 unsigned HOST_WIDE_INT nelems)
1256 {
1257 if (TARGET_SIMD
1258 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1259 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1260 && (nelems >= 2 && nelems <= 4))
1261 return true;
1262
1263 return false;
1264 }
1265
1266 /* Return the SVE predicate mode to use for elements that have
1267 ELEM_NBYTES bytes, if such a mode exists. */
1268
1269 opt_machine_mode
1270 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1271 {
1272 if (TARGET_SVE)
1273 {
1274 if (elem_nbytes == 1)
1275 return VNx16BImode;
1276 if (elem_nbytes == 2)
1277 return VNx8BImode;
1278 if (elem_nbytes == 4)
1279 return VNx4BImode;
1280 if (elem_nbytes == 8)
1281 return VNx2BImode;
1282 }
1283 return opt_machine_mode ();
1284 }
1285
1286 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1287
1288 static opt_machine_mode
1289 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1290 {
1291 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1292 {
1293 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1294 machine_mode pred_mode;
1295 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1296 return pred_mode;
1297 }
1298
1299 return default_get_mask_mode (nunits, nbytes);
1300 }
1301
1302 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1303 prefer to use the first arithmetic operand as the else value if
1304 the else value doesn't matter, since that exactly matches the SVE
1305 destructive merging form. For ternary operations we could either
1306 pick the first operand and use FMAD-like instructions or the last
1307 operand and use FMLA-like instructions; the latter seems more
1308 natural. */
1309
1310 static tree
1311 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1312 {
1313 return nops == 3 ? ops[2] : ops[0];
1314 }
1315
1316 /* Implement TARGET_HARD_REGNO_NREGS. */
1317
1318 static unsigned int
1319 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1320 {
1321 /* ??? Logically we should only need to provide a value when
1322 HARD_REGNO_MODE_OK says that the combination is valid,
1323 but at the moment we need to handle all modes. Just ignore
1324 any runtime parts for registers that can't store them. */
1325 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1326 switch (aarch64_regno_regclass (regno))
1327 {
1328 case FP_REGS:
1329 case FP_LO_REGS:
1330 if (aarch64_sve_data_mode_p (mode))
1331 return exact_div (GET_MODE_SIZE (mode),
1332 BYTES_PER_SVE_VECTOR).to_constant ();
1333 return CEIL (lowest_size, UNITS_PER_VREG);
1334 case PR_REGS:
1335 case PR_LO_REGS:
1336 case PR_HI_REGS:
1337 return 1;
1338 default:
1339 return CEIL (lowest_size, UNITS_PER_WORD);
1340 }
1341 gcc_unreachable ();
1342 }
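/* For example, a TImode value (16 bytes) occupies CEIL (16, UNITS_PER_WORD)
   = 2 general registers but only CEIL (16, UNITS_PER_VREG) = 1 FP/vector
   register, while any predicate mode always occupies a single P register.  */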
1343
1344 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1345
1346 static bool
1347 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1348 {
1349 if (GET_MODE_CLASS (mode) == MODE_CC)
1350 return regno == CC_REGNUM;
1351
1352 if (regno == VG_REGNUM)
1353 /* This must have the same size as _Unwind_Word. */
1354 return mode == DImode;
1355
1356 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1357 if (vec_flags & VEC_SVE_PRED)
1358 return PR_REGNUM_P (regno);
1359
1360 if (PR_REGNUM_P (regno))
1361 return 0;
1362
1363 if (regno == SP_REGNUM)
1364 /* The purpose of comparing with ptr_mode is to support the
1365 global register variable associated with the stack pointer
1366 register via the syntax of asm ("wsp") in ILP32. */
1367 return mode == Pmode || mode == ptr_mode;
1368
1369 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1370 return mode == Pmode;
1371
1372 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1373 return true;
1374
1375 if (FP_REGNUM_P (regno))
1376 {
1377 if (vec_flags & VEC_STRUCT)
1378 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1379 else
1380 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381 }
1382
1383 return false;
1384 }
1385
1386 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1387 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1388 clobbers the top 64 bits when restoring the bottom 64 bits. */
1389
1390 static bool
1391 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1392 {
1393 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1394 }
1395
1396 /* Implement REGMODE_NATURAL_SIZE. */
1397 poly_uint64
1398 aarch64_regmode_natural_size (machine_mode mode)
1399 {
1400 /* The natural size for SVE data modes is one SVE data vector,
1401 and similarly for predicates. We can't independently modify
1402 anything smaller than that. */
1403 /* ??? For now, only do this for variable-width SVE registers.
1404 Doing it for constant-sized registers breaks lower-subreg.c. */
1405 /* ??? And once that's fixed, we should probably have similar
1406 code for Advanced SIMD. */
1407 if (!aarch64_sve_vg.is_constant ())
1408 {
1409 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1410 if (vec_flags & VEC_SVE_PRED)
1411 return BYTES_PER_SVE_PRED;
1412 if (vec_flags & VEC_SVE_DATA)
1413 return BYTES_PER_SVE_VECTOR;
1414 }
1415 return UNITS_PER_WORD;
1416 }
1417
1418 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1419 machine_mode
1420 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1421 machine_mode mode)
1422 {
1423 /* The predicate mode determines which bits are significant and
1424 which are "don't care". Decreasing the number of lanes would
1425 lose data while increasing the number of lanes would make bits
1426 unnecessarily significant. */
1427 if (PR_REGNUM_P (regno))
1428 return mode;
1429 if (known_ge (GET_MODE_SIZE (mode), 4))
1430 return mode;
1431 else
1432 return SImode;
1433 }
1434
1435 /* Return true if I's bits are consecutive ones from the MSB. */
1436 bool
1437 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1438 {
1439 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1440 }
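/* Worked example: if I is 0xffffffffffff0000 then -I is 0x10000, whose
   exact_log2 is 16, so the function returns true.  For I == 0, or for a
   value with a hole in its high bits, -I is not a power of two, exact_log2
   returns -1 and the function returns false.  */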
1441
1442 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1443 that strcpy from constants will be faster. */
1444
1445 static HOST_WIDE_INT
1446 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1447 {
1448 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1449 return MAX (align, BITS_PER_WORD);
1450 return align;
1451 }
1452
1453 /* Return true if calls to DECL should be treated as
1454    long-calls (i.e. called via a register). */
1455 static bool
1456 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1457 {
1458 return false;
1459 }
1460
1461 /* Return true if calls to symbol-ref SYM should be treated as
1462    long-calls (i.e. called via a register). */
1463 bool
1464 aarch64_is_long_call_p (rtx sym)
1465 {
1466 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1467 }
1468
1469 /* Return true if calls to symbol-ref SYM should not go through
1470 plt stubs. */
1471
1472 bool
1473 aarch64_is_noplt_call_p (rtx sym)
1474 {
1475 const_tree decl = SYMBOL_REF_DECL (sym);
1476
1477 if (flag_pic
1478 && decl
1479 && (!flag_plt
1480 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1481 && !targetm.binds_local_p (decl))
1482 return true;
1483
1484 return false;
1485 }
1486
1487 /* Return true if the offsets to a zero/sign-extract operation
1488 represent an expression that matches an extend operation. The
1489    operands represent the parameters from
1490
1491 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1492 bool
1493 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1494 rtx extract_imm)
1495 {
1496 HOST_WIDE_INT mult_val, extract_val;
1497
1498 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1499 return false;
1500
1501 mult_val = INTVAL (mult_imm);
1502 extract_val = INTVAL (extract_imm);
1503
1504 if (extract_val > 8
1505 && extract_val < GET_MODE_BITSIZE (mode)
1506 && exact_log2 (extract_val & ~7) > 0
1507 && (extract_val & 7) <= 4
1508 && mult_val == (1 << (extract_val & 7)))
1509 return true;
1510
1511 return false;
1512 }
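/* Worked example (illustrative values): with MODE == DImode,
   EXTRACT_IMM == 36 and MULT_IMM == 16 we have extract_val & ~7 == 32
   (a power-of-two number of bits), a shift of extract_val & 7 == 4, and
   mult_val == 1 << 4, so the function returns true: for a zero_extract,
   taking the low 36 bits of (reg * 16) is equivalent to
   (zero_extend:DI (reg:SI)) << 4.  */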
1513
1514 /* Emit an insn that's a simple single-set. Both the operands must be
1515 known to be valid. */
1516 inline static rtx_insn *
1517 emit_set_insn (rtx x, rtx y)
1518 {
1519 return emit_insn (gen_rtx_SET (x, y));
1520 }
1521
1522 /* X and Y are two things to compare using CODE. Emit the compare insn and
1523 return the rtx for register 0 in the proper mode. */
1524 rtx
1525 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1526 {
1527 machine_mode mode = SELECT_CC_MODE (code, x, y);
1528 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1529
1530 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1531 return cc_reg;
1532 }
1533
1534 /* Build the SYMBOL_REF for __tls_get_addr. */
1535
1536 static GTY(()) rtx tls_get_addr_libfunc;
1537
1538 rtx
1539 aarch64_tls_get_addr (void)
1540 {
1541 if (!tls_get_addr_libfunc)
1542 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1543 return tls_get_addr_libfunc;
1544 }
1545
1546 /* Return the TLS model to use for ADDR. */
1547
1548 static enum tls_model
1549 tls_symbolic_operand_type (rtx addr)
1550 {
1551 enum tls_model tls_kind = TLS_MODEL_NONE;
1552 if (GET_CODE (addr) == CONST)
1553 {
1554 poly_int64 addend;
1555 rtx sym = strip_offset (addr, &addend);
1556 if (GET_CODE (sym) == SYMBOL_REF)
1557 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1558 }
1559 else if (GET_CODE (addr) == SYMBOL_REF)
1560 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1561
1562 return tls_kind;
1563 }
1564
1565 /* We'll allow lo_sum's in our legitimate addresses so that combine
1566    can take care of combining addresses where necessary, but for
1567    generation purposes, we'll generate the address
1568    as:
1569 RTL Absolute
1570 tmp = hi (symbol_ref); adrp x1, foo
1571 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1572 nop
1573
1574 PIC TLS
1575 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1576 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1577 bl __tls_get_addr
1578 nop
1579
1580 Load TLS symbol, depending on TLS mechanism and TLS access model.
1581
1582 Global Dynamic - Traditional TLS:
1583 adrp tmp, :tlsgd:imm
1584 add dest, tmp, #:tlsgd_lo12:imm
1585 bl __tls_get_addr
1586
1587 Global Dynamic - TLS Descriptors:
1588 adrp dest, :tlsdesc:imm
1589 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1590 add dest, dest, #:tlsdesc_lo12:imm
1591 blr tmp
1592 mrs tp, tpidr_el0
1593 add dest, dest, tp
1594
1595 Initial Exec:
1596 mrs tp, tpidr_el0
1597 adrp tmp, :gottprel:imm
1598 ldr dest, [tmp, #:gottprel_lo12:imm]
1599 add dest, dest, tp
1600
1601 Local Exec:
1602 mrs tp, tpidr_el0
1603 add t0, tp, #:tprel_hi12:imm, lsl #12
1604 add t0, t0, #:tprel_lo12_nc:imm
1605 */
1606
1607 static void
1608 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1609 enum aarch64_symbol_type type)
1610 {
1611 switch (type)
1612 {
1613 case SYMBOL_SMALL_ABSOLUTE:
1614 {
1615 /* In ILP32, the mode of dest can be either SImode or DImode. */
1616 rtx tmp_reg = dest;
1617 machine_mode mode = GET_MODE (dest);
1618
1619 gcc_assert (mode == Pmode || mode == ptr_mode);
1620
1621 if (can_create_pseudo_p ())
1622 tmp_reg = gen_reg_rtx (mode);
1623
1624 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1625 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1626 return;
1627 }
1628
1629 case SYMBOL_TINY_ABSOLUTE:
1630 emit_insn (gen_rtx_SET (dest, imm));
1631 return;
1632
1633 case SYMBOL_SMALL_GOT_28K:
1634 {
1635 machine_mode mode = GET_MODE (dest);
1636 rtx gp_rtx = pic_offset_table_rtx;
1637 rtx insn;
1638 rtx mem;
1639
1640 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1641    here before RTL expansion.  Tree IVOPTs will generate RTL patterns to
1642    decide rtx costs, in which case pic_offset_table_rtx is not
1643    initialized.  In that case there is no need to generate the first adrp
1644    instruction, as the final cost of a global variable access is
1645    one instruction. */
1646 if (gp_rtx != NULL)
1647 {
1648 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1649    use the page base as the GOT base, the first page may be wasted;
1650    in the worst case only 28K of GOT space remains).
1651
1652    The generated instruction sequence for accessing a global variable
1653    is:
1654
1655 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1656
1657    Only one instruction is needed.  But we must initialize
1658    pic_offset_table_rtx properly.  We generate an initialization insn for
1659    every global access, and let CSE remove the redundant ones.
1660
1661    The final instruction sequence for multiple global variable
1662    accesses will look like the following.
1663
1664 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1665
1666 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1667 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1668 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1669 ... */
1670
1671 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1672 crtl->uses_pic_offset_table = 1;
1673 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1674
1675 if (mode != GET_MODE (gp_rtx))
1676 gp_rtx = gen_lowpart (mode, gp_rtx);
1677
1678 }
1679
1680 if (mode == ptr_mode)
1681 {
1682 if (mode == DImode)
1683 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1684 else
1685 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1686
1687 mem = XVECEXP (SET_SRC (insn), 0, 0);
1688 }
1689 else
1690 {
1691 gcc_assert (mode == Pmode);
1692
1693 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1694 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1695 }
1696
1697 /* The operand is expected to be a MEM.  Whenever the related insn
1698    pattern changes, the code above that computes MEM should be
1699    updated. */
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1705 }
1706
1707 case SYMBOL_SMALL_GOT_4G:
1708 {
1709 /* In ILP32, the mode of dest can be either SImode or DImode,
1710 while the got entry is always of SImode size. The mode of
1711 dest depends on how dest is used: if dest is assigned to a
1712 pointer (e.g. in the memory), it has SImode; it may have
1713    DImode if dest is dereferenced to access the memory.
1714 This is why we have to handle three different ldr_got_small
1715 patterns here (two patterns for ILP32). */
1716
1717 rtx insn;
1718 rtx mem;
1719 rtx tmp_reg = dest;
1720 machine_mode mode = GET_MODE (dest);
1721
1722 if (can_create_pseudo_p ())
1723 tmp_reg = gen_reg_rtx (mode);
1724
1725 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1726 if (mode == ptr_mode)
1727 {
1728 if (mode == DImode)
1729 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1730 else
1731 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1732
1733 mem = XVECEXP (SET_SRC (insn), 0, 0);
1734 }
1735 else
1736 {
1737 gcc_assert (mode == Pmode);
1738
1739 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1740 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1741 }
1742
1743 gcc_assert (GET_CODE (mem) == MEM);
1744 MEM_READONLY_P (mem) = 1;
1745 MEM_NOTRAP_P (mem) = 1;
1746 emit_insn (insn);
1747 return;
1748 }
1749
1750 case SYMBOL_SMALL_TLSGD:
1751 {
1752 rtx_insn *insns;
1753 machine_mode mode = GET_MODE (dest);
1754 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1755
1756 start_sequence ();
1757 if (TARGET_ILP32)
1758 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1759 else
1760 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1761 insns = get_insns ();
1762 end_sequence ();
1763
1764 RTL_CONST_CALL_P (insns) = 1;
1765 emit_libcall_block (insns, dest, result, imm);
1766 return;
1767 }
1768
1769 case SYMBOL_SMALL_TLSDESC:
1770 {
1771 machine_mode mode = GET_MODE (dest);
1772 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1773 rtx tp;
1774
1775 gcc_assert (mode == Pmode || mode == ptr_mode);
1776
1777 /* In ILP32, the got entry is always of SImode size. Unlike
1778 small GOT, the dest is fixed at reg 0. */
1779 if (TARGET_ILP32)
1780 emit_insn (gen_tlsdesc_small_si (imm));
1781 else
1782 emit_insn (gen_tlsdesc_small_di (imm));
1783 tp = aarch64_load_tp (NULL);
1784
1785 if (mode != Pmode)
1786 tp = gen_lowpart (mode, tp);
1787
1788 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1789 if (REG_P (dest))
1790 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1791 return;
1792 }
1793
1794 case SYMBOL_SMALL_TLSIE:
1795 {
1796 /* In ILP32, the mode of dest can be either SImode or DImode,
1797 while the got entry is always of SImode size. The mode of
1798 dest depends on how dest is used: if dest is assigned to a
1799 pointer (e.g. in the memory), it has SImode; it may have
1800    DImode if dest is dereferenced to access the memory.
1801 This is why we have to handle three different tlsie_small
1802 patterns here (two patterns for ILP32). */
1803 machine_mode mode = GET_MODE (dest);
1804 rtx tmp_reg = gen_reg_rtx (mode);
1805 rtx tp = aarch64_load_tp (NULL);
1806
1807 if (mode == ptr_mode)
1808 {
1809 if (mode == DImode)
1810 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1811 else
1812 {
1813 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1814 tp = gen_lowpart (mode, tp);
1815 }
1816 }
1817 else
1818 {
1819 gcc_assert (mode == Pmode);
1820 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1821 }
1822
1823 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1824 if (REG_P (dest))
1825 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1826 return;
1827 }
1828
1829 case SYMBOL_TLSLE12:
1830 case SYMBOL_TLSLE24:
1831 case SYMBOL_TLSLE32:
1832 case SYMBOL_TLSLE48:
1833 {
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1836
1837 if (mode != Pmode)
1838 tp = gen_lowpart (mode, tp);
1839
1840 switch (type)
1841 {
1842 case SYMBOL_TLSLE12:
1843 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1844 (dest, tp, imm));
1845 break;
1846 case SYMBOL_TLSLE24:
1847 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1848 (dest, tp, imm));
1849 break;
1850 case SYMBOL_TLSLE32:
1851 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1852 (dest, imm));
1853 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1854 (dest, dest, tp));
1855 break;
1856 case SYMBOL_TLSLE48:
1857 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1858 (dest, imm));
1859 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1860 (dest, dest, tp));
1861 break;
1862 default:
1863 gcc_unreachable ();
1864 }
1865
1866 if (REG_P (dest))
1867 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1868 return;
1869 }
1870
1871 case SYMBOL_TINY_GOT:
1872 emit_insn (gen_ldr_got_tiny (dest, imm));
1873 return;
1874
1875 case SYMBOL_TINY_TLSIE:
1876 {
1877 machine_mode mode = GET_MODE (dest);
1878 rtx tp = aarch64_load_tp (NULL);
1879
1880 if (mode == ptr_mode)
1881 {
1882 if (mode == DImode)
1883 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1884 else
1885 {
1886 tp = gen_lowpart (mode, tp);
1887 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1888 }
1889 }
1890 else
1891 {
1892 gcc_assert (mode == Pmode);
1893 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1894 }
1895
1896 if (REG_P (dest))
1897 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1898 return;
1899 }
1900
1901 default:
1902 gcc_unreachable ();
1903 }
1904 }
1905
1906 /* Emit a move from SRC to DEST. Assume that the move expanders can
1907 handle all moves if !can_create_pseudo_p (). The distinction is
1908 important because, unlike emit_move_insn, the move expanders know
1909 how to force Pmode objects into the constant pool even when the
1910 constant pool address is not itself legitimate. */
1911 static rtx
1912 aarch64_emit_move (rtx dest, rtx src)
1913 {
1914 return (can_create_pseudo_p ()
1915 ? emit_move_insn (dest, src)
1916 : emit_move_insn_1 (dest, src));
1917 }
1918
1919 /* Apply UNOPTAB to OP and store the result in DEST. */
1920
1921 static void
1922 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1923 {
1924 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1925 if (dest != tmp)
1926 emit_move_insn (dest, tmp);
1927 }
1928
1929 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1930
1931 static void
1932 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1933 {
1934 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1935 OPTAB_DIRECT);
1936 if (dest != tmp)
1937 emit_move_insn (dest, tmp);
1938 }
1939
1940 /* Split a 128-bit move operation into two 64-bit move operations,
1941 taking care to handle partial overlap of register to register
1942 copies. Special cases are needed when moving between GP regs and
1943 FP regs. SRC can be a register, constant or memory; DST a register
1944 or memory. If either operand is memory it must not have any side
1945 effects. */
1946 void
1947 aarch64_split_128bit_move (rtx dst, rtx src)
1948 {
1949 rtx dst_lo, dst_hi;
1950 rtx src_lo, src_hi;
1951
1952 machine_mode mode = GET_MODE (dst);
1953
1954 gcc_assert (mode == TImode || mode == TFmode);
1955 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1956 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1957
1958 if (REG_P (dst) && REG_P (src))
1959 {
1960 int src_regno = REGNO (src);
1961 int dst_regno = REGNO (dst);
1962
1963 /* Handle FP <-> GP regs. */
1964 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1965 {
1966 src_lo = gen_lowpart (word_mode, src);
1967 src_hi = gen_highpart (word_mode, src);
1968
1969 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
1970 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
1971 return;
1972 }
1973 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1974 {
1975 dst_lo = gen_lowpart (word_mode, dst);
1976 dst_hi = gen_highpart (word_mode, dst);
1977
1978 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
1979 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
1980 return;
1981 }
1982 }
1983
1984 dst_lo = gen_lowpart (word_mode, dst);
1985 dst_hi = gen_highpart (word_mode, dst);
1986 src_lo = gen_lowpart (word_mode, src);
1987 src_hi = gen_highpart_mode (word_mode, mode, src);
1988
1989 /* At most one pairing may overlap. */
1990 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1991 {
1992 aarch64_emit_move (dst_hi, src_hi);
1993 aarch64_emit_move (dst_lo, src_lo);
1994 }
1995 else
1996 {
1997 aarch64_emit_move (dst_lo, src_lo);
1998 aarch64_emit_move (dst_hi, src_hi);
1999 }
2000 }
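
/* Illustrative example (register numbers are arbitrary, not taken from
   real compiler output): for a TImode copy whose destination pair is
   x1:x2 and whose source pair is x0:x1, dst_lo (x1) overlaps src_hi (x1),
   so the high halves are moved first:

       mov  x2, x1    (dst_hi <- src_hi)
       mov  x1, x0    (dst_lo <- src_lo)

   Moving the low halves first would clobber x1 before its old value had
   been copied into x2.  */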
2001
2002 bool
2003 aarch64_split_128bit_move_p (rtx dst, rtx src)
2004 {
2005 return (! REG_P (src)
2006 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2007 }
2008
2009 /* Split a complex SIMD combine. */
2010
2011 void
2012 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2013 {
2014 machine_mode src_mode = GET_MODE (src1);
2015 machine_mode dst_mode = GET_MODE (dst);
2016
2017 gcc_assert (VECTOR_MODE_P (dst_mode));
2018 gcc_assert (register_operand (dst, dst_mode)
2019 && register_operand (src1, src_mode)
2020 && register_operand (src2, src_mode));
2021
2022 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2023 return;
2024 }
2025
2026 /* Split a complex SIMD move. */
2027
2028 void
2029 aarch64_split_simd_move (rtx dst, rtx src)
2030 {
2031 machine_mode src_mode = GET_MODE (src);
2032 machine_mode dst_mode = GET_MODE (dst);
2033
2034 gcc_assert (VECTOR_MODE_P (dst_mode));
2035
2036 if (REG_P (dst) && REG_P (src))
2037 {
2038 gcc_assert (VECTOR_MODE_P (src_mode));
2039 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2040 }
2041 }
2042
2043 bool
2044 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2045 machine_mode ymode, rtx y)
2046 {
2047 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2048 gcc_assert (r != NULL);
2049 return rtx_equal_p (x, r);
2050 }
2051
2052
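/* Return a register that holds VALUE.  Create a new pseudo where that is
   possible; otherwise move VALUE into the caller-supplied register X and
   return X.  */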
2053 static rtx
2054 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2055 {
2056 if (can_create_pseudo_p ())
2057 return force_reg (mode, value);
2058 else
2059 {
2060 gcc_assert (x);
2061 aarch64_emit_move (x, value);
2062 return x;
2063 }
2064 }
2065
2066 /* Return true if we can move VALUE into a register using a single
2067 CNT[BHWD] instruction. */
2068
2069 static bool
2070 aarch64_sve_cnt_immediate_p (poly_int64 value)
2071 {
2072 HOST_WIDE_INT factor = value.coeffs[0];
2073 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2074 return (value.coeffs[1] == factor
2075 && IN_RANGE (factor, 2, 16 * 16)
2076 && (factor & 1) == 0
2077 && factor <= 16 * (factor & -factor));
2078 }
2079
2080 /* Likewise for rtx X. */
2081
2082 bool
2083 aarch64_sve_cnt_immediate_p (rtx x)
2084 {
2085 poly_int64 value;
2086 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2087 }
2088
2089 /* Return the asm string for an instruction with a CNT-like vector size
2090 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2091 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2092 first part of the operands template (the part that comes before the
2093 vector size itself). FACTOR is the count per 128-bit quadword.
2094 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2095 If it is zero, we can use any element size. */
2096
2097 static char *
2098 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2099 unsigned int factor,
2100 unsigned int nelts_per_vq)
2101 {
2102 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2103
2104 if (nelts_per_vq == 0)
2105 /* There is some overlap in the ranges of the four CNT instructions.
2106 Here we always use the smallest possible element size, so that the
2107 multiplier is 1 wherever possible. */
2108 nelts_per_vq = factor & -factor;
2109 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2110 gcc_assert (IN_RANGE (shift, 1, 4));
2111 char suffix = "dwhb"[shift - 1];
2112
2113 factor >>= shift;
2114 unsigned int written;
2115 if (factor == 1)
2116 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2117 prefix, suffix, operands);
2118 else
2119 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2120 prefix, suffix, operands, factor);
2121 gcc_assert (written < sizeof (buffer));
2122 return buffer;
2123 }
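
/* Worked example (illustrative, assuming PREFIX is "cnt" and OPERANDS is
   "%x0"): for FACTOR == 32 and NELTS_PER_VQ == 0, the smallest element
   size is chosen, so NELTS_PER_VQ becomes 32 & -32 == 32, SHIFT is
   min (exact_log2 (32), 4) == 4, the suffix is 'b' and the residual
   multiplier is 32 >> 4 == 2, giving:

       cntb\t%x0, all, mul #2

   i.e. a count of 32 per 128-bit quadword, matching the original FACTOR.  */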
2124
2125 /* Return the asm string for an instruction with a CNT-like vector size
2126 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2127 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2128 first part of the operands template (the part that comes before the
2129 vector size itself). X is the value of the vector size operand,
2130 as a polynomial integer rtx. */
2131
2132 char *
2133 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2134 rtx x)
2135 {
2136 poly_int64 value = rtx_to_poly_int64 (x);
2137 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2138 return aarch64_output_sve_cnt_immediate (prefix, operands,
2139 value.coeffs[1], 0);
2140 }
2141
2142 /* Return true if we can add VALUE to a register using a single ADDVL
2143 or ADDPL instruction. */
2144
2145 static bool
2146 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2147 {
2148 HOST_WIDE_INT factor = value.coeffs[0];
2149 if (factor == 0 || value.coeffs[1] != factor)
2150 return false;
2151 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2152 and a value of 16 is one vector width. */
2153 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2154 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2155 }
2156
2157 /* Likewise for rtx X. */
2158
2159 bool
2160 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2161 {
2162 poly_int64 value;
2163 return (poly_int_rtx_p (x, &value)
2164 && aarch64_sve_addvl_addpl_immediate_p (value));
2165 }
2166
2167 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2168 and storing the result in operand 0. */
2169
2170 char *
2171 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2172 {
2173 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2174 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2175 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2176
2177 /* Use INC or DEC if possible. */
2178 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2179 {
2180 if (aarch64_sve_cnt_immediate_p (offset_value))
2181 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2182 offset_value.coeffs[1], 0);
2183 if (aarch64_sve_cnt_immediate_p (-offset_value))
2184 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2185 -offset_value.coeffs[1], 0);
2186 }
2187
2188 int factor = offset_value.coeffs[1];
2189 if ((factor & 15) == 0)
2190 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2191 else
2192 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2193 return buffer;
2194 }
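
/* Worked example (illustrative, assuming DEST and BASE are different
   registers, so that the INC/DEC shortcut above does not apply): an
   OFFSET of poly_int64 (48, 48) has FACTOR == 48, a multiple of 16,
   and so produces:

       addvl\t%x0, %x1, #3

   (three vector lengths), whereas poly_int64 (6, 6) has FACTOR == 6,
   which is even but not a multiple of 16, and produces:

       addpl\t%x0, %x1, #3

   (three predicate lengths).  */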
2195
2196 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2197 instruction. If it is, store the number of elements in each vector
2198 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2199 factor in *FACTOR_OUT (if nonnull). */
2200
2201 bool
2202 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2203 unsigned int *nelts_per_vq_out)
2204 {
2205 rtx elt;
2206 poly_int64 value;
2207
2208 if (!const_vec_duplicate_p (x, &elt)
2209 || !poly_int_rtx_p (elt, &value))
2210 return false;
2211
2212 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2213 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2214 /* There's no vector INCB. */
2215 return false;
2216
2217 HOST_WIDE_INT factor = value.coeffs[0];
2218 if (value.coeffs[1] != factor)
2219 return false;
2220
2221 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2222 if ((factor % nelts_per_vq) != 0
2223 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2224 return false;
2225
2226 if (factor_out)
2227 *factor_out = factor;
2228 if (nelts_per_vq_out)
2229 *nelts_per_vq_out = nelts_per_vq;
2230 return true;
2231 }
2232
2233 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2234 instruction. */
2235
2236 bool
2237 aarch64_sve_inc_dec_immediate_p (rtx x)
2238 {
2239 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2240 }
2241
2242 /* Return the asm template for an SVE vector INC or DEC instruction.
2243 OPERANDS gives the operands before the vector count and X is the
2244 value of the vector count operand itself. */
2245
2246 char *
2247 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2248 {
2249 int factor;
2250 unsigned int nelts_per_vq;
2251 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2252 gcc_unreachable ();
2253 if (factor < 0)
2254 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2255 nelts_per_vq);
2256 else
2257 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2258 nelts_per_vq);
2259 }
2260
2261 static int
2262 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2263 scalar_int_mode mode)
2264 {
2265 int i;
2266 unsigned HOST_WIDE_INT val, val2, mask;
2267 int one_match, zero_match;
2268 int num_insns;
2269
2270 val = INTVAL (imm);
2271
2272 if (aarch64_move_imm (val, mode))
2273 {
2274 if (generate)
2275 emit_insn (gen_rtx_SET (dest, imm));
2276 return 1;
2277 }
2278
2279 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2280 (with XXXX non-zero). In that case check to see if the move can be done in
2281 a smaller mode. */
2282 val2 = val & 0xffffffff;
2283 if (mode == DImode
2284 && aarch64_move_imm (val2, SImode)
2285 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2286 {
2287 if (generate)
2288 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2289
2290 /* Check whether we have to emit a second instruction by seeing if
2291 any of the upper 32 bits of the original DImode value are set. */
2292 if (val == val2)
2293 return 1;
2294
2295 i = (val >> 48) ? 48 : 32;
2296
2297 if (generate)
2298 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2299 GEN_INT ((val >> i) & 0xffff)));
2300
2301 return 2;
2302 }
2303
2304 if ((val >> 32) == 0 || mode == SImode)
2305 {
2306 if (generate)
2307 {
2308 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2309 if (mode == SImode)
2310 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2311 GEN_INT ((val >> 16) & 0xffff)));
2312 else
2313 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2314 GEN_INT ((val >> 16) & 0xffff)));
2315 }
2316 return 2;
2317 }
2318
2319 /* Remaining cases are all for DImode. */
2320
2321 mask = 0xffff;
2322 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2323 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2324 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2325 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2326
2327 if (zero_match != 2 && one_match != 2)
2328 {
2329 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2330 For a 64-bit bitmask try whether changing 16 bits to all ones or
2331 zeroes creates a valid bitmask. To check any repeated bitmask,
2332 try using 16 bits from the other 32-bit half of val. */
2333
2334 for (i = 0; i < 64; i += 16, mask <<= 16)
2335 {
2336 val2 = val & ~mask;
2337 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2338 break;
2339 val2 = val | mask;
2340 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2341 break;
2342 val2 = val2 & ~mask;
2343 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2344 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2345 break;
2346 }
2347 if (i != 64)
2348 {
2349 if (generate)
2350 {
2351 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2352 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2353 GEN_INT ((val >> i) & 0xffff)));
2354 }
2355 return 2;
2356 }
2357 }
2358
2359 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2360 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2361 otherwise skip zero bits. */
2362
2363 num_insns = 1;
2364 mask = 0xffff;
2365 val2 = one_match > zero_match ? ~val : val;
2366 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2367
2368 if (generate)
2369 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2370 ? (val | ~(mask << i))
2371 : (val & (mask << i)))));
2372 for (i += 16; i < 64; i += 16)
2373 {
2374 if ((val2 & (mask << i)) == 0)
2375 continue;
2376 if (generate)
2377 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2378 GEN_INT ((val >> i) & 0xffff)));
2379 num_insns ++;
2380 }
2381
2382 return num_insns;
2383 }
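
/* Worked examples (illustrative; the choice of x0 is arbitrary):

     0x0000000012345678: the upper 32 bits are zero, so two instructions
     suffice:
         mov  x0, #0x5678
         movk x0, #0x1234, lsl #16

     0x1234567890abcdef: no 16-bit chunk is all-zeros or all-ones and
     patching a single chunk cannot produce a valid bitmask immediate,
     so the generic four-instruction sequence is used:
         mov  x0, #0xcdef
         movk x0, #0x90ab, lsl #16
         movk x0, #0x5678, lsl #32
         movk x0, #0x1234, lsl #48  */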
2384
2385 /* Return whether imm is a 128-bit immediate which is simple enough to
2386 expand inline. */
2387 bool
2388 aarch64_mov128_immediate (rtx imm)
2389 {
2390 if (GET_CODE (imm) == CONST_INT)
2391 return true;
2392
2393 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2394
2395 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2396 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2397
2398 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2399 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2400 }
2401
2402
2403 /* Return the number of temporary registers that aarch64_add_offset_1
2404 would need to add OFFSET to a register. */
2405
2406 static unsigned int
2407 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2408 {
2409 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2410 }
2411
2412 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2413 a non-polynomial OFFSET. MODE is the mode of the addition.
2414 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2415 be set and CFA adjustments added to the generated instructions.
2416
2417 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2418 temporary if register allocation is already complete. This temporary
2419 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2420 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2421 the immediate again.
2422
2423 Since this function may be used to adjust the stack pointer, we must
2424 ensure that it cannot cause transient stack deallocation (for example
2425 by first incrementing SP and then decrementing when adjusting by a
2426 large immediate). */
2427
2428 static void
2429 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2430 rtx src, HOST_WIDE_INT offset, rtx temp1,
2431 bool frame_related_p, bool emit_move_imm)
2432 {
2433 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2434 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2435
2436 HOST_WIDE_INT moffset = abs_hwi (offset);
2437 rtx_insn *insn;
2438
2439 if (!moffset)
2440 {
2441 if (!rtx_equal_p (dest, src))
2442 {
2443 insn = emit_insn (gen_rtx_SET (dest, src));
2444 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2445 }
2446 return;
2447 }
2448
2449 /* Single instruction adjustment. */
2450 if (aarch64_uimm12_shift (moffset))
2451 {
2452 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2453 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2454 return;
2455 }
2456
2457 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2458 and either:
2459
2460 a) the offset cannot be loaded by a 16-bit move or
2461 b) there is no spare register into which we can move it. */
2462 if (moffset < 0x1000000
2463 && ((!temp1 && !can_create_pseudo_p ())
2464 || !aarch64_move_imm (moffset, mode)))
2465 {
2466 HOST_WIDE_INT low_off = moffset & 0xfff;
2467
2468 low_off = offset < 0 ? -low_off : low_off;
2469 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2470 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2471 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2472 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2473 return;
2474 }
2475
2476 /* Emit a move immediate if required and an addition/subtraction. */
2477 if (emit_move_imm)
2478 {
2479 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2480 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2481 }
2482 insn = emit_insn (offset < 0
2483 ? gen_sub3_insn (dest, src, temp1)
2484 : gen_add3_insn (dest, src, temp1));
2485 if (frame_related_p)
2486 {
2487 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2488 rtx adj = plus_constant (mode, src, offset);
2489 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2490 }
2491 }
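
/* Worked example (illustrative): adding OFFSET == 0x123456 to a register.
   The value is below 2^24 and cannot be loaded with a single MOV, so it
   is split into two immediate additions:

       add  dest, src, #0x456
       add  dest, dest, #0x123000

   Both steps adjust the register in the same direction, so when DEST is
   the stack pointer it never temporarily overshoots its final value.  */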
2492
2493 /* Return the number of temporary registers that aarch64_add_offset
2494 would need to move OFFSET into a register or add OFFSET to a register;
2495 ADD_P is true if we want the latter rather than the former. */
2496
2497 static unsigned int
2498 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2499 {
2500 /* This follows the same structure as aarch64_add_offset. */
2501 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2502 return 0;
2503
2504 unsigned int count = 0;
2505 HOST_WIDE_INT factor = offset.coeffs[1];
2506 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2507 poly_int64 poly_offset (factor, factor);
2508 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2509 /* Need one register for the ADDVL/ADDPL result. */
2510 count += 1;
2511 else if (factor != 0)
2512 {
2513 factor = abs (factor);
2514 if (factor > 16 * (factor & -factor))
2515 /* Need one register for the CNT result and one for the multiplication
2516 factor. If necessary, the second temporary can be reused for the
2517 constant part of the offset. */
2518 return 2;
2519 /* Need one register for the CNT result (which might then
2520 be shifted). */
2521 count += 1;
2522 }
2523 return count + aarch64_add_offset_1_temporaries (constant);
2524 }
2525
2526 /* If X can be represented as a poly_int64, return the number
2527 of temporaries that are required to add it to a register.
2528 Return -1 otherwise. */
2529
2530 int
2531 aarch64_add_offset_temporaries (rtx x)
2532 {
2533 poly_int64 offset;
2534 if (!poly_int_rtx_p (x, &offset))
2535 return -1;
2536 return aarch64_offset_temporaries (true, offset);
2537 }
2538
2539 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2540 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2541 be set and CFA adjustments added to the generated instructions.
2542
2543 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2544 temporary if register allocation is already complete. This temporary
2545 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2546 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2547 false to avoid emitting the immediate again.
2548
2549 TEMP2, if nonnull, is a second temporary register that doesn't
2550 overlap either DEST or SRC.
2551
2552 Since this function may be used to adjust the stack pointer, we must
2553 ensure that it cannot cause transient stack deallocation (for example
2554 by first incrementing SP and then decrementing when adjusting by a
2555 large immediate). */
2556
2557 static void
2558 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2559 poly_int64 offset, rtx temp1, rtx temp2,
2560 bool frame_related_p, bool emit_move_imm = true)
2561 {
2562 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2563 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2564 gcc_assert (temp1 == NULL_RTX
2565 || !frame_related_p
2566 || !reg_overlap_mentioned_p (temp1, dest));
2567 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2568
2569 /* Try using ADDVL or ADDPL to add the whole value. */
2570 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2571 {
2572 rtx offset_rtx = gen_int_mode (offset, mode);
2573 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2574 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2575 return;
2576 }
2577
2578 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2579 SVE vector register, over and above the minimum size of 128 bits.
2580 This is equivalent to half the value returned by CNTD with a
2581 vector shape of ALL. */
2582 HOST_WIDE_INT factor = offset.coeffs[1];
2583 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2584
2585 /* Try using ADDVL or ADDPL to add the VG-based part. */
2586 poly_int64 poly_offset (factor, factor);
2587 if (src != const0_rtx
2588 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2589 {
2590 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2591 if (frame_related_p)
2592 {
2593 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2594 RTX_FRAME_RELATED_P (insn) = true;
2595 src = dest;
2596 }
2597 else
2598 {
2599 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2600 src = aarch64_force_temporary (mode, temp1, addr);
2601 temp1 = temp2;
2602 temp2 = NULL_RTX;
2603 }
2604 }
2605 /* Otherwise use a CNT-based sequence. */
2606 else if (factor != 0)
2607 {
2608 /* Use a subtraction if we have a negative factor. */
2609 rtx_code code = PLUS;
2610 if (factor < 0)
2611 {
2612 factor = -factor;
2613 code = MINUS;
2614 }
2615
2616 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2617 into the multiplication. */
2618 rtx val;
2619 int shift = 0;
2620 if (factor & 1)
2621 /* Use a right shift by 1. */
2622 shift = -1;
2623 else
2624 factor /= 2;
2625 HOST_WIDE_INT low_bit = factor & -factor;
2626 if (factor <= 16 * low_bit)
2627 {
2628 if (factor > 16 * 8)
2629 {
2630 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2631 the value with the minimum multiplier and shift it into
2632 position. */
2633 int extra_shift = exact_log2 (low_bit);
2634 shift += extra_shift;
2635 factor >>= extra_shift;
2636 }
2637 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2638 }
2639 else
2640 {
2641 /* Use CNTD, then multiply it by FACTOR. */
2642 val = gen_int_mode (poly_int64 (2, 2), mode);
2643 val = aarch64_force_temporary (mode, temp1, val);
2644
2645 /* Go back to using a negative multiplication factor if we have
2646 no register from which to subtract. */
2647 if (code == MINUS && src == const0_rtx)
2648 {
2649 factor = -factor;
2650 code = PLUS;
2651 }
2652 rtx coeff1 = gen_int_mode (factor, mode);
2653 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2654 val = gen_rtx_MULT (mode, val, coeff1);
2655 }
2656
2657 if (shift > 0)
2658 {
2659 /* Multiply by 1 << SHIFT. */
2660 val = aarch64_force_temporary (mode, temp1, val);
2661 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2662 }
2663 else if (shift == -1)
2664 {
2665 /* Divide by 2. */
2666 val = aarch64_force_temporary (mode, temp1, val);
2667 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2668 }
2669
2670 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2671 if (src != const0_rtx)
2672 {
2673 val = aarch64_force_temporary (mode, temp1, val);
2674 val = gen_rtx_fmt_ee (code, mode, src, val);
2675 }
2676 else if (code == MINUS)
2677 {
2678 val = aarch64_force_temporary (mode, temp1, val);
2679 val = gen_rtx_NEG (mode, val);
2680 }
2681
2682 if (constant == 0 || frame_related_p)
2683 {
2684 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2685 if (frame_related_p)
2686 {
2687 RTX_FRAME_RELATED_P (insn) = true;
2688 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2689 gen_rtx_SET (dest, plus_constant (Pmode, src,
2690 poly_offset)));
2691 }
2692 src = dest;
2693 if (constant == 0)
2694 return;
2695 }
2696 else
2697 {
2698 src = aarch64_force_temporary (mode, temp1, val);
2699 temp1 = temp2;
2700 temp2 = NULL_RTX;
2701 }
2702
2703 emit_move_imm = true;
2704 }
2705
2706 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2707 frame_related_p, emit_move_imm);
2708 }
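
/* Worked example (illustrative, assuming DEST is x0 and SRC is x1): an
   OFFSET of poly_int64 (24, 16), i.e. one SVE vector length in bytes
   plus 8, has FACTOR == 16 and CONSTANT == 24 - 16 == 8, so the sequence
   is roughly:

       addvl   x0, x1, #1
       add     x0, x0, #8

   (the exact temporaries used depend on FRAME_RELATED_P and the scratch
   registers available).  Offsets whose VG-based part is out of
   ADDVL/ADDPL range fall back to the CNT-based multiply sequence
   instead.  */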
2709
2710 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2711 than a poly_int64. */
2712
2713 void
2714 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2715 rtx offset_rtx, rtx temp1, rtx temp2)
2716 {
2717 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2718 temp1, temp2, false);
2719 }
2720
2721 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2722 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2723 if TEMP1 already contains abs (DELTA). */
2724
2725 static inline void
2726 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2727 {
2728 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2729 temp1, temp2, true, emit_move_imm);
2730 }
2731
2732 /* Subtract DELTA from the stack pointer, marking the instructions
2733 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2734 if nonnull. */
2735
2736 static inline void
2737 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2738 {
2739 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2740 temp1, temp2, frame_related_p);
2741 }
2742
2743 /* Set DEST to (vec_series BASE STEP). */
2744
2745 static void
2746 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2747 {
2748 machine_mode mode = GET_MODE (dest);
2749 scalar_mode inner = GET_MODE_INNER (mode);
2750
2751 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2752 if (!aarch64_sve_index_immediate_p (base))
2753 base = force_reg (inner, base);
2754 if (!aarch64_sve_index_immediate_p (step))
2755 step = force_reg (inner, step);
2756
2757 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2758 }
2759
2760 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2761 integer of mode SRC_MODE. Return true on success. */
2762
2763 static bool
2764 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2765 rtx src)
2766 {
2767 /* If the constant is smaller than 128 bits, we can do the move
2768 using a vector of SRC_MODEs. */
2769 if (src_mode != TImode)
2770 {
2771 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2772 GET_MODE_SIZE (src_mode));
2773 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2774 emit_move_insn (gen_lowpart (dup_mode, dest),
2775 gen_const_vec_duplicate (dup_mode, src));
2776 return true;
2777 }
2778
2779 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2780 src = force_const_mem (src_mode, src);
2781 if (!src)
2782 return false;
2783
2784 /* Make sure that the address is legitimate. */
2785 if (!aarch64_sve_ld1r_operand_p (src))
2786 {
2787 rtx addr = force_reg (Pmode, XEXP (src, 0));
2788 src = replace_equiv_address (src, addr);
2789 }
2790
2791 machine_mode mode = GET_MODE (dest);
2792 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2793 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2794 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2795 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2796 emit_insn (gen_rtx_SET (dest, src));
2797 return true;
2798 }
2799
2800 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2801 isn't a simple duplicate or series. */
2802
2803 static void
2804 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2805 {
2806 machine_mode mode = GET_MODE (src);
2807 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2808 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2809 gcc_assert (npatterns > 1);
2810
2811 if (nelts_per_pattern == 1)
2812 {
2813 /* The constant is a repeating sequence of at least two elements,
2814 where the repeating elements occupy no more than 128 bits.
2815 Get an integer representation of the replicated value. */
2816 scalar_int_mode int_mode;
2817 if (BYTES_BIG_ENDIAN)
2818 /* For now, always use LD1RQ to load the value on big-endian
2819 targets, since the handling of smaller integers includes a
2820 subreg that is semantically an element reverse. */
2821 int_mode = TImode;
2822 else
2823 {
2824 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2825 gcc_assert (int_bits <= 128);
2826 int_mode = int_mode_for_size (int_bits, 0).require ();
2827 }
2828 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2829 if (int_value
2830 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2831 return;
2832 }
2833
2834 /* Expand each pattern individually. */
2835 rtx_vector_builder builder;
2836 auto_vec<rtx, 16> vectors (npatterns);
2837 for (unsigned int i = 0; i < npatterns; ++i)
2838 {
2839 builder.new_vector (mode, 1, nelts_per_pattern);
2840 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2841 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2842 vectors.quick_push (force_reg (mode, builder.build ()));
2843 }
2844
2845 /* Use permutes to interleave the separate vectors. */
2846 while (npatterns > 1)
2847 {
2848 npatterns /= 2;
2849 for (unsigned int i = 0; i < npatterns; ++i)
2850 {
2851 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2852 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2853 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2854 vectors[i] = tmp;
2855 }
2856 }
2857 gcc_assert (vectors[0] == dest);
2858 }
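
/* Illustrative example (register numbers are arbitrary): a VNx4SI
   constant with NPATTERNS == 2 and NELTS_PER_PATTERN == 3, such as
   { 1, 10, 2, 11, 3, 12, 4, 13, ... }, is built by expanding the two
   interleaved patterns { 1, 2, 3, 4, ... } and { 10, 11, 12, 13, ... }
   into separate registers and zipping them together:

       zip1    z0.s, z1.s, z2.s  */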
2859
2860 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2861 is a pattern that can be used to set DEST to a replicated scalar
2862 element. */
2863
2864 void
2865 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2866 rtx (*gen_vec_duplicate) (rtx, rtx))
2867 {
2868 machine_mode mode = GET_MODE (dest);
2869
2870 /* Check on what type of symbol it is. */
2871 scalar_int_mode int_mode;
2872 if ((GET_CODE (imm) == SYMBOL_REF
2873 || GET_CODE (imm) == LABEL_REF
2874 || GET_CODE (imm) == CONST
2875 || GET_CODE (imm) == CONST_POLY_INT)
2876 && is_a <scalar_int_mode> (mode, &int_mode))
2877 {
2878 rtx mem;
2879 poly_int64 offset;
2880 HOST_WIDE_INT const_offset;
2881 enum aarch64_symbol_type sty;
2882
2883 /* If we have (const (plus symbol offset)), separate out the offset
2884 before we start classifying the symbol. */
2885 rtx base = strip_offset (imm, &offset);
2886
2887 /* We must always add an offset involving VL separately, rather than
2888 folding it into the relocation. */
2889 if (!offset.is_constant (&const_offset))
2890 {
2891 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2892 emit_insn (gen_rtx_SET (dest, imm));
2893 else
2894 {
2895 /* Do arithmetic on 32-bit values if the result is smaller
2896 than that. */
2897 if (partial_subreg_p (int_mode, SImode))
2898 {
2899 /* It is invalid to do symbol calculations in modes
2900 narrower than SImode. */
2901 gcc_assert (base == const0_rtx);
2902 dest = gen_lowpart (SImode, dest);
2903 int_mode = SImode;
2904 }
2905 if (base != const0_rtx)
2906 {
2907 base = aarch64_force_temporary (int_mode, dest, base);
2908 aarch64_add_offset (int_mode, dest, base, offset,
2909 NULL_RTX, NULL_RTX, false);
2910 }
2911 else
2912 aarch64_add_offset (int_mode, dest, base, offset,
2913 dest, NULL_RTX, false);
2914 }
2915 return;
2916 }
2917
2918 sty = aarch64_classify_symbol (base, const_offset);
2919 switch (sty)
2920 {
2921 case SYMBOL_FORCE_TO_MEM:
2922 if (const_offset != 0
2923 && targetm.cannot_force_const_mem (int_mode, imm))
2924 {
2925 gcc_assert (can_create_pseudo_p ());
2926 base = aarch64_force_temporary (int_mode, dest, base);
2927 aarch64_add_offset (int_mode, dest, base, const_offset,
2928 NULL_RTX, NULL_RTX, false);
2929 return;
2930 }
2931
2932 mem = force_const_mem (ptr_mode, imm);
2933 gcc_assert (mem);
2934
2935 /* If we aren't generating PC relative literals, then
2936 we need to expand the literal pool access carefully.
2937 This is something that needs to be done in a number
2938 of places, so could well live as a separate function. */
2939 if (!aarch64_pcrelative_literal_loads)
2940 {
2941 gcc_assert (can_create_pseudo_p ());
2942 base = gen_reg_rtx (ptr_mode);
2943 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2944 if (ptr_mode != Pmode)
2945 base = convert_memory_address (Pmode, base);
2946 mem = gen_rtx_MEM (ptr_mode, base);
2947 }
2948
2949 if (int_mode != ptr_mode)
2950 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2951
2952 emit_insn (gen_rtx_SET (dest, mem));
2953
2954 return;
2955
2956 case SYMBOL_SMALL_TLSGD:
2957 case SYMBOL_SMALL_TLSDESC:
2958 case SYMBOL_SMALL_TLSIE:
2959 case SYMBOL_SMALL_GOT_28K:
2960 case SYMBOL_SMALL_GOT_4G:
2961 case SYMBOL_TINY_GOT:
2962 case SYMBOL_TINY_TLSIE:
2963 if (const_offset != 0)
2964 {
2965 gcc_assert (can_create_pseudo_p ());
2966 base = aarch64_force_temporary (int_mode, dest, base);
2967 aarch64_add_offset (int_mode, dest, base, const_offset,
2968 NULL_RTX, NULL_RTX, false);
2969 return;
2970 }
2971 /* FALLTHRU */
2972
2973 case SYMBOL_SMALL_ABSOLUTE:
2974 case SYMBOL_TINY_ABSOLUTE:
2975 case SYMBOL_TLSLE12:
2976 case SYMBOL_TLSLE24:
2977 case SYMBOL_TLSLE32:
2978 case SYMBOL_TLSLE48:
2979 aarch64_load_symref_appropriately (dest, imm, sty);
2980 return;
2981
2982 default:
2983 gcc_unreachable ();
2984 }
2985 }
2986
2987 if (!CONST_INT_P (imm))
2988 {
2989 rtx base, step, value;
2990 if (GET_CODE (imm) == HIGH
2991 || aarch64_simd_valid_immediate (imm, NULL))
2992 emit_insn (gen_rtx_SET (dest, imm));
2993 else if (const_vec_series_p (imm, &base, &step))
2994 aarch64_expand_vec_series (dest, base, step);
2995 else if (const_vec_duplicate_p (imm, &value))
2996 {
2997 /* If the constant is out of range of an SVE vector move,
2998 load it from memory if we can, otherwise move it into
2999 a register and use a DUP. */
3000 scalar_mode inner_mode = GET_MODE_INNER (mode);
3001 rtx op = force_const_mem (inner_mode, value);
3002 if (!op)
3003 op = force_reg (inner_mode, value);
3004 else if (!aarch64_sve_ld1r_operand_p (op))
3005 {
3006 rtx addr = force_reg (Pmode, XEXP (op, 0));
3007 op = replace_equiv_address (op, addr);
3008 }
3009 emit_insn (gen_vec_duplicate (dest, op));
3010 }
3011 else if (GET_CODE (imm) == CONST_VECTOR
3012 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3013 aarch64_expand_sve_const_vector (dest, imm);
3014 else
3015 {
3016 rtx mem = force_const_mem (mode, imm);
3017 gcc_assert (mem);
3018 emit_move_insn (dest, mem);
3019 }
3020
3021 return;
3022 }
3023
3024 aarch64_internal_mov_immediate (dest, imm, true,
3025 as_a <scalar_int_mode> (mode));
3026 }
3027
3028 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3029 that is known to contain PTRUE. */
3030
3031 void
3032 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3033 {
3034 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3035 gen_rtvec (2, pred, src),
3036 UNSPEC_MERGE_PTRUE)));
3037 }
3038
3039 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3040 operand is in memory. In this case we need to use the predicated LD1
3041 and ST1 instead of LDR and STR, both for correctness on big-endian
3042 targets and because LD1 and ST1 support a wider range of addressing modes.
3043 PRED_MODE is the mode of the predicate.
3044
3045 See the comment at the head of aarch64-sve.md for details about the
3046 big-endian handling. */
3047
3048 void
3049 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3050 {
3051 machine_mode mode = GET_MODE (dest);
3052 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3053 if (!register_operand (src, mode)
3054 && !register_operand (dest, mode))
3055 {
3056 rtx tmp = gen_reg_rtx (mode);
3057 if (MEM_P (src))
3058 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3059 else
3060 emit_move_insn (tmp, src);
3061 src = tmp;
3062 }
3063 aarch64_emit_sve_pred_move (dest, ptrue, src);
3064 }
3065
3066 /* Called only on big-endian targets. See whether an SVE vector move
3067 from SRC to DEST is effectively a REV[BHW] instruction, because at
3068 least one operand is a subreg of an SVE vector that has wider or
3069 narrower elements. Return true and emit the instruction if so.
3070
3071 For example:
3072
3073 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3074
3075 represents a VIEW_CONVERT between the following vectors, viewed
3076 in memory order:
3077
3078 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3079 R1: { [0], [1], [2], [3], ... }
3080
3081 The high part of lane X in R2 should therefore correspond to lane X*2
3082 of R1, but the register representations are:
3083
3084 msb lsb
3085 R2: ...... [1].high [1].low [0].high [0].low
3086 R1: ...... [3] [2] [1] [0]
3087
3088 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3089 We therefore need a reverse operation to swap the high and low values
3090 around.
3091
3092 This is purely an optimization. Without it we would spill the
3093 subreg operand to the stack in one mode and reload it in the
3094 other mode, which has the same effect as the REV. */
3095
3096 bool
3097 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3098 {
3099 gcc_assert (BYTES_BIG_ENDIAN);
3100 if (GET_CODE (dest) == SUBREG)
3101 dest = SUBREG_REG (dest);
3102 if (GET_CODE (src) == SUBREG)
3103 src = SUBREG_REG (src);
3104
3105 /* The optimization handles two single SVE REGs with different element
3106 sizes. */
3107 if (!REG_P (dest)
3108 || !REG_P (src)
3109 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3110 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3111 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3112 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3113 return false;
3114
3115 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3116 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3117 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3118 UNSPEC_REV_SUBREG);
3119 emit_insn (gen_rtx_SET (dest, unspec));
3120 return true;
3121 }
3122
3123 /* Return a copy of X with mode MODE, without changing its other
3124 attributes. Unlike gen_lowpart, this doesn't care whether the
3125 mode change is valid. */
3126
3127 static rtx
3128 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3129 {
3130 if (GET_MODE (x) == mode)
3131 return x;
3132
3133 x = shallow_copy_rtx (x);
3134 set_mode_and_regno (x, mode, REGNO (x));
3135 return x;
3136 }
3137
3138 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3139 operands. */
3140
3141 void
3142 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3143 {
3144 /* Decide which REV operation we need. The mode with narrower elements
3145 determines the mode of the operands and the mode with the wider
3146 elements determines the reverse width. */
3147 machine_mode mode_with_wider_elts = GET_MODE (dest);
3148 machine_mode mode_with_narrower_elts = GET_MODE (src);
3149 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3150 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3151 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3152
3153 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3154 unsigned int unspec;
3155 if (wider_bytes == 8)
3156 unspec = UNSPEC_REV64;
3157 else if (wider_bytes == 4)
3158 unspec = UNSPEC_REV32;
3159 else if (wider_bytes == 2)
3160 unspec = UNSPEC_REV16;
3161 else
3162 gcc_unreachable ();
3163 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3164
3165 /* Emit:
3166
3167 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3168 UNSPEC_MERGE_PTRUE))
3169
3170 with the appropriate modes. */
3171 ptrue = gen_lowpart (pred_mode, ptrue);
3172 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3173 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3174 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3175 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3176 UNSPEC_MERGE_PTRUE);
3177 emit_insn (gen_rtx_SET (dest, src));
3178 }
3179
3180 static bool
3181 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3182 tree exp ATTRIBUTE_UNUSED)
3183 {
3184 /* Currently, always true. */
3185 return true;
3186 }
3187
3188 /* Implement TARGET_PASS_BY_REFERENCE. */
3189
3190 static bool
3191 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3192 machine_mode mode,
3193 const_tree type,
3194 bool named ATTRIBUTE_UNUSED)
3195 {
3196 HOST_WIDE_INT size;
3197 machine_mode dummymode;
3198 int nregs;
3199
3200 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3201 if (mode == BLKmode && type)
3202 size = int_size_in_bytes (type);
3203 else
3204 /* No frontends can create types with variable-sized modes, so we
3205 shouldn't be asked to pass or return them. */
3206 size = GET_MODE_SIZE (mode).to_constant ();
3207
3208 /* Aggregates are passed by reference based on their size. */
3209 if (type && AGGREGATE_TYPE_P (type))
3210 {
3211 size = int_size_in_bytes (type);
3212 }
3213
3214 /* Variable-sized arguments are always passed by reference. */
3215 if (size < 0)
3216 return true;
3217
3218 /* Can this be a candidate to be passed in fp/simd register(s)? */
3219 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3220 &dummymode, &nregs,
3221 NULL))
3222 return false;
3223
3224 /* Arguments that are variable-sized or larger than 2 registers are
3225 passed by reference unless they are a homogeneous floating-point
3226 aggregate. */
3227 return size > 2 * UNITS_PER_WORD;
3228 }
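
/* Illustrative examples of the rules above: a struct of four doubles
   (32 bytes) is a homogeneous floating-point aggregate and is therefore
   passed in SIMD/FP registers rather than by reference, whereas a struct
   of four long longs (also 32 bytes) is not an fp/simd candidate and,
   being larger than two GP registers, is passed by reference.  */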
3229
3230 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3231 static bool
3232 aarch64_return_in_msb (const_tree valtype)
3233 {
3234 machine_mode dummy_mode;
3235 int dummy_int;
3236
3237 /* Never happens in little-endian mode. */
3238 if (!BYTES_BIG_ENDIAN)
3239 return false;
3240
3241 /* Only composite types smaller than or equal to 16 bytes can
3242 be potentially returned in registers. */
3243 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3244 || int_size_in_bytes (valtype) <= 0
3245 || int_size_in_bytes (valtype) > 16)
3246 return false;
3247
3248 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3249 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3250 is always passed/returned in the least significant bits of fp/simd
3251 register(s). */
3252 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3253 &dummy_mode, &dummy_int, NULL))
3254 return false;
3255
3256 return true;
3257 }
3258
3259 /* Implement TARGET_FUNCTION_VALUE.
3260 Define how to find the value returned by a function. */
3261
3262 static rtx
3263 aarch64_function_value (const_tree type, const_tree func,
3264 bool outgoing ATTRIBUTE_UNUSED)
3265 {
3266 machine_mode mode;
3267 int unsignedp;
3268 int count;
3269 machine_mode ag_mode;
3270
3271 mode = TYPE_MODE (type);
3272 if (INTEGRAL_TYPE_P (type))
3273 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3274
3275 if (aarch64_return_in_msb (type))
3276 {
3277 HOST_WIDE_INT size = int_size_in_bytes (type);
3278
3279 if (size % UNITS_PER_WORD != 0)
3280 {
3281 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3282 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3283 }
3284 }
3285
3286 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3287 &ag_mode, &count, NULL))
3288 {
3289 if (!aarch64_composite_type_p (type, mode))
3290 {
3291 gcc_assert (count == 1 && mode == ag_mode);
3292 return gen_rtx_REG (mode, V0_REGNUM);
3293 }
3294 else
3295 {
3296 int i;
3297 rtx par;
3298
3299 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3300 for (i = 0; i < count; i++)
3301 {
3302 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3303 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3304 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3305 XVECEXP (par, 0, i) = tmp;
3306 }
3307 return par;
3308 }
3309 }
3310 else
3311 return gen_rtx_REG (mode, R0_REGNUM);
3312 }
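
/* Illustrative example: for an HFA such as struct { float x, y, z, w; },
   COUNT is 4 and AG_MODE is SFmode, so the result is a PARALLEL of
   (reg:SF s0) ... (reg:SF s3) with byte offsets 0, 4, 8 and 12.  A plain
   64-bit integer is simply returned in x0.  */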
3313
3314 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3315 Return true if REGNO is the number of a hard register in which the values
3316 of called function may come back. */
3317
3318 static bool
3319 aarch64_function_value_regno_p (const unsigned int regno)
3320 {
3321 /* A maximum of 16 bytes can be returned in the general registers. Examples
3322 of 16-byte return values are: 128-bit integers and 16-byte small
3323 structures (excluding homogeneous floating-point aggregates). */
3324 if (regno == R0_REGNUM || regno == R1_REGNUM)
3325 return true;
3326
3327 /* Up to four fp/simd registers can return a function value, e.g. a
3328 homogeneous floating-point aggregate having four members. */
3329 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3330 return TARGET_FLOAT;
3331
3332 return false;
3333 }
3334
3335 /* Implement TARGET_RETURN_IN_MEMORY.
3336
3337 If the type T of the result of a function is such that
3338 void func (T arg)
3339 would require that arg be passed as a value in a register (or set of
3340 registers) according to the parameter passing rules, then the result
3341 is returned in the same registers as would be used for such an
3342 argument. */
3343
3344 static bool
3345 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3346 {
3347 HOST_WIDE_INT size;
3348 machine_mode ag_mode;
3349 int count;
3350
3351 if (!AGGREGATE_TYPE_P (type)
3352 && TREE_CODE (type) != COMPLEX_TYPE
3353 && TREE_CODE (type) != VECTOR_TYPE)
3354 /* Simple scalar types are always returned in registers. */
3355 return false;
3356
3357 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3358 type,
3359 &ag_mode,
3360 &count,
3361 NULL))
3362 return false;
3363
3364 /* Types larger than 2 registers are returned in memory. */
3365 size = int_size_in_bytes (type);
3366 return (size < 0 || size > 2 * UNITS_PER_WORD);
3367 }
3368
3369 static bool
3370 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3371 const_tree type, int *nregs)
3372 {
3373 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3374 return aarch64_vfp_is_call_or_return_candidate (mode,
3375 type,
3376 &pcum->aapcs_vfp_rmode,
3377 nregs,
3378 NULL);
3379 }
3380
3381 /* Given MODE and TYPE of a function argument, return the alignment in
3382 bits. The idea is to suppress any stronger alignment requested by
3383 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3384 This is a helper function for local use only. */
3385
3386 static unsigned int
3387 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3388 {
3389 if (!type)
3390 return GET_MODE_ALIGNMENT (mode);
3391
3392 if (integer_zerop (TYPE_SIZE (type)))
3393 return 0;
3394
3395 gcc_assert (TYPE_MODE (type) == mode);
3396
3397 if (!AGGREGATE_TYPE_P (type))
3398 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3399
3400 if (TREE_CODE (type) == ARRAY_TYPE)
3401 return TYPE_ALIGN (TREE_TYPE (type));
3402
3403 unsigned int alignment = 0;
3404 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3405 if (TREE_CODE (field) == FIELD_DECL)
3406 alignment = std::max (alignment, DECL_ALIGN (field));
3407
3408 return alignment;
3409 }
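
/* Illustrative example: for

       struct S { long long x; char y; } __attribute__ ((aligned (16)));

   the 16-byte alignment requested on the struct itself is ignored and
   the result is the largest field alignment, 64 bits, which is what the
   AAPCS64 layout rules are based on.  */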
3410
3411 /* Layout a function argument according to the AAPCS64 rules. The rule
3412 numbers refer to the rule numbers in the AAPCS64. */
3413
3414 static void
3415 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3416 const_tree type,
3417 bool named ATTRIBUTE_UNUSED)
3418 {
3419 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3420 int ncrn, nvrn, nregs;
3421 bool allocate_ncrn, allocate_nvrn;
3422 HOST_WIDE_INT size;
3423
3424 /* We need to do this once per argument. */
3425 if (pcum->aapcs_arg_processed)
3426 return;
3427
3428 pcum->aapcs_arg_processed = true;
3429
3430 /* Size in bytes, rounded up to a multiple of 8 bytes. */
3431 if (type)
3432 size = int_size_in_bytes (type);
3433 else
3434 /* No frontends can create types with variable-sized modes, so we
3435 shouldn't be asked to pass or return them. */
3436 size = GET_MODE_SIZE (mode).to_constant ();
3437 size = ROUND_UP (size, UNITS_PER_WORD);
3438
3439 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3440 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3441 mode,
3442 type,
3443 &nregs);
3444
3445 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3446 The following code thus handles passing by SIMD/FP registers first. */
3447
3448 nvrn = pcum->aapcs_nvrn;
3449
3450 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3451 and homogeneous short-vector aggregates (HVA). */
3452 if (allocate_nvrn)
3453 {
3454 if (!TARGET_FLOAT)
3455 aarch64_err_no_fpadvsimd (mode);
3456
3457 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3458 {
3459 pcum->aapcs_nextnvrn = nvrn + nregs;
3460 if (!aarch64_composite_type_p (type, mode))
3461 {
3462 gcc_assert (nregs == 1);
3463 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3464 }
3465 else
3466 {
3467 rtx par;
3468 int i;
3469 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3470 for (i = 0; i < nregs; i++)
3471 {
3472 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3473 V0_REGNUM + nvrn + i);
3474 rtx offset = gen_int_mode
3475 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3476 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3477 XVECEXP (par, 0, i) = tmp;
3478 }
3479 pcum->aapcs_reg = par;
3480 }
3481 return;
3482 }
3483 else
3484 {
3485 /* C.3 NSRN is set to 8. */
3486 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3487 goto on_stack;
3488 }
3489 }
3490
3491 ncrn = pcum->aapcs_ncrn;
3492 nregs = size / UNITS_PER_WORD;
3493
3494 /* C6 - C9, though the sign and zero extension semantics are
3495 handled elsewhere. This is the case where the argument fits
3496 entirely in general registers. */
3497 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3498 {
3499
3500 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3501
3502 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3503 rounded up to the next even number. */
3504 if (nregs == 2
3505 && ncrn % 2
3506 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3507 comparison is there because for > 16 * BITS_PER_UNIT
3508 alignment nregs should be > 2 and therefore it should be
3509 passed by reference rather than value. */
3510 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3511 {
3512 ++ncrn;
3513 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3514 }
3515
3516 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3517 A reg is still generated for it, but the caller should be smart
3518 enough not to use it. */
3519 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3520 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3521 else
3522 {
3523 rtx par;
3524 int i;
3525
3526 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3527 for (i = 0; i < nregs; i++)
3528 {
3529 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3530 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3531 GEN_INT (i * UNITS_PER_WORD));
3532 XVECEXP (par, 0, i) = tmp;
3533 }
3534 pcum->aapcs_reg = par;
3535 }
3536
3537 pcum->aapcs_nextncrn = ncrn + nregs;
3538 return;
3539 }
3540
3541 /* C.11 */
3542 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3543
3544 /* The argument is passed on the stack; record the needed number of words for
3545 this argument and align the total size if necessary. */
3546 on_stack:
3547 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3548
3549 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3550 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3551 16 / UNITS_PER_WORD);
3552 return;
3553 }
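
/* Illustrative example of rule C.8 above: if x0 is already in use
   (NCRN == 1) and the next argument is an __int128, which needs two GP
   registers and has 16-byte alignment, NCRN is first rounded up to 2 so
   that the argument is passed in the aligned pair x2/x3 rather than
   straddling x1/x2.  */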
3554
3555 /* Implement TARGET_FUNCTION_ARG. */
3556
3557 static rtx
3558 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3559 const_tree type, bool named)
3560 {
3561 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3562 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3563
3564 if (mode == VOIDmode)
3565 return NULL_RTX;
3566
3567 aarch64_layout_arg (pcum_v, mode, type, named);
3568 return pcum->aapcs_reg;
3569 }
3570
3571 void
3572 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3573 const_tree fntype ATTRIBUTE_UNUSED,
3574 rtx libname ATTRIBUTE_UNUSED,
3575 const_tree fndecl ATTRIBUTE_UNUSED,
3576 unsigned n_named ATTRIBUTE_UNUSED)
3577 {
3578 pcum->aapcs_ncrn = 0;
3579 pcum->aapcs_nvrn = 0;
3580 pcum->aapcs_nextncrn = 0;
3581 pcum->aapcs_nextnvrn = 0;
3582 pcum->pcs_variant = ARM_PCS_AAPCS64;
3583 pcum->aapcs_reg = NULL_RTX;
3584 pcum->aapcs_arg_processed = false;
3585 pcum->aapcs_stack_words = 0;
3586 pcum->aapcs_stack_size = 0;
3587
3588 if (!TARGET_FLOAT
3589 && fndecl && TREE_PUBLIC (fndecl)
3590 && fntype && fntype != error_mark_node)
3591 {
3592 const_tree type = TREE_TYPE (fntype);
3593 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3594 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3595 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3596 &mode, &nregs, NULL))
3597 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3598 }
3599 return;
3600 }
3601
3602 static void
3603 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3604 machine_mode mode,
3605 const_tree type,
3606 bool named)
3607 {
3608 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3609 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3610 {
3611 aarch64_layout_arg (pcum_v, mode, type, named);
3612 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3613 != (pcum->aapcs_stack_words != 0));
3614 pcum->aapcs_arg_processed = false;
3615 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3616 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3617 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3618 pcum->aapcs_stack_words = 0;
3619 pcum->aapcs_reg = NULL_RTX;
3620 }
3621 }
3622
3623 bool
3624 aarch64_function_arg_regno_p (unsigned regno)
3625 {
3626 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3627 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3628 }
3629
3630 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3631 PARM_BOUNDARY bits of alignment, but will be given anything up
3632 to STACK_BOUNDARY bits if the type requires it. This makes sure
3633 that both before and after the layout of each argument, the Next
3634 Stacked Argument Address (NSAA) will have a minimum alignment of
3635 8 bytes. */
3636
3637 static unsigned int
3638 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3639 {
3640 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3641 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3642 }
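/* For example (illustrative, with the usual AArch64 values PARM_BOUNDARY == 64
and STACK_BOUNDARY == 128): a plain 'char' argument is given 64 bits of
alignment, a 16-byte-aligned structure is given 128 bits, and an over-aligned
type asking for 256 bits is clamped back down to 128. */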
3643
3644 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3645
3646 static fixed_size_mode
3647 aarch64_get_reg_raw_mode (int regno)
3648 {
3649 if (TARGET_SVE && FP_REGNUM_P (regno))
3650 /* Don't use the SVE part of the register for __builtin_apply and
3651 __builtin_return. The SVE registers aren't used by the normal PCS,
3652 so using them there would be a waste of time. The PCS extensions
3653 for SVE types are fundamentally incompatible with the
3654 __builtin_return/__builtin_apply interface. */
3655 return as_a <fixed_size_mode> (V16QImode);
3656 return default_get_reg_raw_mode (regno);
3657 }
3658
3659 /* Implement TARGET_FUNCTION_ARG_PADDING.
3660
3661 Small aggregate types are placed at the lowest memory address.
3662
3663 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3664
3665 static pad_direction
3666 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3667 {
3668 /* On little-endian targets, the least significant byte of every stack
3669 argument is passed at the lowest byte address of the stack slot. */
3670 if (!BYTES_BIG_ENDIAN)
3671 return PAD_UPWARD;
3672
3673 /* Otherwise, integral, floating-point and pointer types are padded downward:
3674 the least significant byte of a stack argument is passed at the highest
3675 byte address of the stack slot. */
3676 if (type
3677 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3678 || POINTER_TYPE_P (type))
3679 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3680 return PAD_DOWNWARD;
3681
3682 /* Everything else is padded upward, i.e. data in first byte of stack slot. */
3683 return PAD_UPWARD;
3684 }
3685
3686 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3687
3688 It specifies padding for the last (possibly the only)
3689 element of a block move between registers and memory.
3690 Assuming the block is in memory, padding upward means that
3691 the last element is padded after its most significant byte,
3692 while with downward padding the last element is padded at
3693 its least significant byte side.
3694
3695 Small aggregates and small complex types are always padded
3696 upwards.
3697
3698 We don't need to worry about homogeneous floating-point or
3699 short-vector aggregates; their move is not affected by the
3700 padding direction determined here. Regardless of endianness,
3701 each element of such an aggregate is put in the least
3702 significant bits of a fp/simd register.
3703
3704 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3705 register has useful data, and return the opposite if the most
3706 significant byte does. */
3707
3708 bool
3709 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3710 bool first ATTRIBUTE_UNUSED)
3711 {
3712
3713 /* Small composite types are always padded upward. */
3714 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3715 {
3716 HOST_WIDE_INT size;
3717 if (type)
3718 size = int_size_in_bytes (type);
3719 else
3720 /* No frontends can create types with variable-sized modes, so we
3721 shouldn't be asked to pass or return them. */
3722 size = GET_MODE_SIZE (mode).to_constant ();
3723 if (size < 2 * UNITS_PER_WORD)
3724 return true;
3725 }
3726
3727 /* Otherwise, use the default padding. */
3728 return !BYTES_BIG_ENDIAN;
3729 }
3730
3731 static scalar_int_mode
3732 aarch64_libgcc_cmp_return_mode (void)
3733 {
3734 return SImode;
3735 }
3736
3737 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3738
3739 /* We use the 12-bit shifted immediate arithmetic instructions so values
3740 must be multiple of (1 << 12), i.e. 4096. */
3741 #define ARITH_FACTOR 4096
3742
3743 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3744 #error Cannot use simple address calculation for stack probing
3745 #endif
3746
3747 /* The pair of scratch registers used for stack probing. */
3748 #define PROBE_STACK_FIRST_REG 9
3749 #define PROBE_STACK_SECOND_REG 10
3750
3751 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3752 inclusive. These are offsets from the current stack pointer. */
3753
3754 static void
3755 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3756 {
3757 HOST_WIDE_INT size;
3758 if (!poly_size.is_constant (&size))
3759 {
3760 sorry ("stack probes for SVE frames");
3761 return;
3762 }
3763
3764 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3765
3766 /* See the same assertion on PROBE_INTERVAL above. */
3767 gcc_assert ((first % ARITH_FACTOR) == 0);
3768
3769 /* See if we have a constant small number of probes to generate. If so,
3770 that's the easy case. */
3771 if (size <= PROBE_INTERVAL)
3772 {
3773 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3774
3775 emit_set_insn (reg1,
3776 plus_constant (Pmode,
3777 stack_pointer_rtx, -(first + base)));
3778 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3779 }
3780
3781 /* The run-time loop is made up of 8 insns in the generic case while the
3782 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3783 else if (size <= 4 * PROBE_INTERVAL)
3784 {
3785 HOST_WIDE_INT i, rem;
3786
3787 emit_set_insn (reg1,
3788 plus_constant (Pmode,
3789 stack_pointer_rtx,
3790 -(first + PROBE_INTERVAL)));
3791 emit_stack_probe (reg1);
3792
3793 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3794 it exceeds SIZE. If only two probes are needed, this will not
3795 generate any code. Then probe at FIRST + SIZE. */
3796 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3797 {
3798 emit_set_insn (reg1,
3799 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3800 emit_stack_probe (reg1);
3801 }
3802
3803 rem = size - (i - PROBE_INTERVAL);
3804 if (rem > 256)
3805 {
3806 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3807
3808 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3809 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3810 }
3811 else
3812 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3813 }
3814
3815 /* Otherwise, do the same as above, but in a loop. Note that we must be
3816 extra careful with variables wrapping around because we might be at
3817 the very top (or the very bottom) of the address space and we have
3818 to be able to handle this case properly; in particular, we use an
3819 equality test for the loop condition. */
3820 else
3821 {
3822 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3823
3824 /* Step 1: round SIZE to the previous multiple of the interval. */
3825
3826 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3827
3828
3829 /* Step 2: compute initial and final value of the loop counter. */
3830
3831 /* TEST_ADDR = SP + FIRST. */
3832 emit_set_insn (reg1,
3833 plus_constant (Pmode, stack_pointer_rtx, -first));
3834
3835 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3836 HOST_WIDE_INT adjustment = - (first + rounded_size);
3837 if (! aarch64_uimm12_shift (adjustment))
3838 {
3839 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3840 true, Pmode);
3841 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3842 }
3843 else
3844 emit_set_insn (reg2,
3845 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3846
3847 /* Step 3: the loop
3848
3849 do
3850 {
3851 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3852 probe at TEST_ADDR
3853 }
3854 while (TEST_ADDR != LAST_ADDR)
3855
3856 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3857 until it is equal to ROUNDED_SIZE. */
3858
3859 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3860
3861
3862 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3863 that SIZE is equal to ROUNDED_SIZE. */
3864
3865 if (size != rounded_size)
3866 {
3867 HOST_WIDE_INT rem = size - rounded_size;
3868
3869 if (rem > 256)
3870 {
3871 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3872
3873 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3874 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3875 }
3876 else
3877 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3878 }
3879 }
3880
3881 /* Make sure nothing is scheduled before we are done. */
3882 emit_insn (gen_blockage ());
3883 }
3884
3885 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3886 absolute addresses. */
3887
3888 const char *
3889 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3890 {
3891 static int labelno = 0;
3892 char loop_lab[32];
3893 rtx xops[2];
3894
3895 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3896
3897 /* Loop. */
3898 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3899
3900 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3901 xops[0] = reg1;
3902 xops[1] = GEN_INT (PROBE_INTERVAL);
3903 output_asm_insn ("sub\t%0, %0, %1", xops);
3904
3905 /* Probe at TEST_ADDR. */
3906 output_asm_insn ("str\txzr, [%0]", xops);
3907
3908 /* Test if TEST_ADDR == LAST_ADDR. */
3909 xops[1] = reg2;
3910 output_asm_insn ("cmp\t%0, %1", xops);
3911
3912 /* Branch. */
3913 fputs ("\tb.ne\t", asm_out_file);
3914 assemble_name_raw (asm_out_file, loop_lab);
3915 fputc ('\n', asm_out_file);
3916
3917 return "";
3918 }
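/* Illustrative output of the loop emitted above, assuming the scratch
registers x9/x10 (PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG) and a
4096-byte probe interval:

.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */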
3919
3920 /* Determine whether a frame chain needs to be generated. */
3921 static bool
3922 aarch64_needs_frame_chain (void)
3923 {
3924 /* Force a frame chain for EH returns so the return address is at FP+8. */
3925 if (frame_pointer_needed || crtl->calls_eh_return)
3926 return true;
3927
3928 /* A leaf function cannot have calls or write LR. */
3929 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3930
3931 /* Don't use a frame chain in leaf functions if leaf frame pointers
3932 are disabled. */
3933 if (flag_omit_leaf_frame_pointer && is_leaf)
3934 return false;
3935
3936 return aarch64_use_frame_pointer;
3937 }
3938
3939 /* Mark the registers that need to be saved by the callee and calculate
3940 the size of the callee-saved registers area and frame record (both FP
3941 and LR may be omitted). */
3942 static void
3943 aarch64_layout_frame (void)
3944 {
3945 HOST_WIDE_INT offset = 0;
3946 int regno, last_fp_reg = INVALID_REGNUM;
3947
3948 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
3949
3950 #define SLOT_NOT_REQUIRED (-2)
3951 #define SLOT_REQUIRED (-1)
3952
3953 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3954 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3955
3956 /* First mark all the registers that really need to be saved... */
3957 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3958 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3959
3960 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3961 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3962
3963 /* ... that includes the eh data registers (if needed)... */
3964 if (crtl->calls_eh_return)
3965 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3966 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3967 = SLOT_REQUIRED;
3968
3969 /* ... and any callee saved register that dataflow says is live. */
3970 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3971 if (df_regs_ever_live_p (regno)
3972 && (regno == R30_REGNUM
3973 || !call_used_regs[regno]))
3974 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3975
3976 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3977 if (df_regs_ever_live_p (regno)
3978 && !call_used_regs[regno])
3979 {
3980 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3981 last_fp_reg = regno;
3982 }
3983
3984 if (cfun->machine->frame.emit_frame_chain)
3985 {
3986 /* FP and LR are placed in the linkage record. */
3987 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3988 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3989 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3990 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3991 offset = 2 * UNITS_PER_WORD;
3992 }
3993
3994 /* Now assign stack slots for them. */
3995 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3996 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3997 {
3998 cfun->machine->frame.reg_offset[regno] = offset;
3999 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4000 cfun->machine->frame.wb_candidate1 = regno;
4001 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4002 cfun->machine->frame.wb_candidate2 = regno;
4003 offset += UNITS_PER_WORD;
4004 }
4005
4006 HOST_WIDE_INT max_int_offset = offset;
4007 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4008 bool has_align_gap = offset != max_int_offset;
4009
4010 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4011 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4012 {
4013 /* If there is an alignment gap between integer and fp callee-saves,
4014 allocate the last fp register to it if possible. */
4015 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4016 {
4017 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4018 break;
4019 }
4020
4021 cfun->machine->frame.reg_offset[regno] = offset;
4022 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4023 cfun->machine->frame.wb_candidate1 = regno;
4024 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4025 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4026 cfun->machine->frame.wb_candidate2 = regno;
4027 offset += UNITS_PER_WORD;
4028 }
4029
4030 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4031
4032 cfun->machine->frame.saved_regs_size = offset;
4033
4034 HOST_WIDE_INT varargs_and_saved_regs_size
4035 = offset + cfun->machine->frame.saved_varargs_size;
4036
4037 cfun->machine->frame.hard_fp_offset
4038 = aligned_upper_bound (varargs_and_saved_regs_size
4039 + get_frame_size (),
4040 STACK_BOUNDARY / BITS_PER_UNIT);
4041
4042 /* Both these values are already aligned. */
4043 gcc_assert (multiple_p (crtl->outgoing_args_size,
4044 STACK_BOUNDARY / BITS_PER_UNIT));
4045 cfun->machine->frame.frame_size
4046 = (cfun->machine->frame.hard_fp_offset
4047 + crtl->outgoing_args_size);
4048
4049 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4050
4051 cfun->machine->frame.initial_adjust = 0;
4052 cfun->machine->frame.final_adjust = 0;
4053 cfun->machine->frame.callee_adjust = 0;
4054 cfun->machine->frame.callee_offset = 0;
4055
4056 HOST_WIDE_INT max_push_offset = 0;
4057 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4058 max_push_offset = 512;
4059 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4060 max_push_offset = 256;
4061
4062 HOST_WIDE_INT const_size, const_fp_offset;
4063 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4064 && const_size < max_push_offset
4065 && known_eq (crtl->outgoing_args_size, 0))
4066 {
4067 /* Simple, small frame with no outgoing arguments:
4068 stp reg1, reg2, [sp, -frame_size]!
4069 stp reg3, reg4, [sp, 16] */
4070 cfun->machine->frame.callee_adjust = const_size;
4071 }
4072 else if (known_lt (crtl->outgoing_args_size
4073 + cfun->machine->frame.saved_regs_size, 512)
4074 && !(cfun->calls_alloca
4075 && known_lt (cfun->machine->frame.hard_fp_offset,
4076 max_push_offset)))
4077 {
4078 /* Frame with small outgoing arguments:
4079 sub sp, sp, frame_size
4080 stp reg1, reg2, [sp, outgoing_args_size]
4081 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4082 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4083 cfun->machine->frame.callee_offset
4084 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4085 }
4086 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4087 && const_fp_offset < max_push_offset)
4088 {
4089 /* Frame with large outgoing arguments but a small local area:
4090 stp reg1, reg2, [sp, -hard_fp_offset]!
4091 stp reg3, reg4, [sp, 16]
4092 sub sp, sp, outgoing_args_size */
4093 cfun->machine->frame.callee_adjust = const_fp_offset;
4094 cfun->machine->frame.final_adjust
4095 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4096 }
4097 else
4098 {
4099 /* Frame with large local area and outgoing arguments using frame pointer:
4100 sub sp, sp, hard_fp_offset
4101 stp x29, x30, [sp, 0]
4102 add x29, sp, 0
4103 stp reg3, reg4, [sp, 16]
4104 sub sp, sp, outgoing_args_size */
4105 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4106 cfun->machine->frame.final_adjust
4107 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4108 }
4109
4110 cfun->machine->frame.laid_out = true;
4111 }
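/* Illustrative example of the first case above: a function whose only saved
registers are x29 and x30, with 16 bytes of local variables and no outgoing
arguments, ends up with frame_size == 32 and callee_adjust == 32, so the
whole allocation is done by a single "stp x29, x30, [sp, -32]!" with the
frame record at the bottom of the allocation and the locals above it. */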
4112
4113 /* Return true if the register REGNO is saved on entry to
4114 the current function. */
4115
4116 static bool
4117 aarch64_register_saved_on_entry (int regno)
4118 {
4119 return cfun->machine->frame.reg_offset[regno] >= 0;
4120 }
4121
4122 /* Return the next register, from REGNO up to LIMIT, that the callee
4123 needs to save. */
4124
4125 static unsigned
4126 aarch64_next_callee_save (unsigned regno, unsigned limit)
4127 {
4128 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4129 regno ++;
4130 return regno;
4131 }
4132
4133 /* Push the register number REGNO of mode MODE to the stack with write-back
4134 adjusting the stack by ADJUSTMENT. */
4135
4136 static void
4137 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4138 HOST_WIDE_INT adjustment)
4139 {
4140 rtx base_rtx = stack_pointer_rtx;
4141 rtx insn, reg, mem;
4142
4143 reg = gen_rtx_REG (mode, regno);
4144 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4145 plus_constant (Pmode, base_rtx, -adjustment));
4146 mem = gen_frame_mem (mode, mem);
4147
4148 insn = emit_move_insn (mem, reg);
4149 RTX_FRAME_RELATED_P (insn) = 1;
4150 }
4151
4152 /* Generate and return an instruction to store the pair of registers
4153 REG and REG2 of mode MODE to location BASE with write-back adjusting
4154 the stack location BASE by ADJUSTMENT. */
4155
4156 static rtx
4157 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4158 HOST_WIDE_INT adjustment)
4159 {
4160 switch (mode)
4161 {
4162 case E_DImode:
4163 return gen_storewb_pairdi_di (base, base, reg, reg2,
4164 GEN_INT (-adjustment),
4165 GEN_INT (UNITS_PER_WORD - adjustment));
4166 case E_DFmode:
4167 return gen_storewb_pairdf_di (base, base, reg, reg2,
4168 GEN_INT (-adjustment),
4169 GEN_INT (UNITS_PER_WORD - adjustment));
4170 default:
4171 gcc_unreachable ();
4172 }
4173 }
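/* Illustrative example: with MODE == E_DImode, BASE == the stack pointer,
REG == x19, REG2 == x20 and ADJUSTMENT == 32, the insn generated above
assembles to "stp x19, x20, [sp, -32]!", i.e. REG is stored at the updated
SP and REG2 at the updated SP plus UNITS_PER_WORD. */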
4174
4175 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4176 stack pointer by ADJUSTMENT. */
4177
4178 static void
4179 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4180 {
4181 rtx_insn *insn;
4182 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4183
4184 if (regno2 == INVALID_REGNUM)
4185 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4186
4187 rtx reg1 = gen_rtx_REG (mode, regno1);
4188 rtx reg2 = gen_rtx_REG (mode, regno2);
4189
4190 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4191 reg2, adjustment));
4192 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4193 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4194 RTX_FRAME_RELATED_P (insn) = 1;
4195 }
4196
4197 /* Load the pair of registers REG, REG2 of mode MODE from stack location
4198 BASE, adjusting it by ADJUSTMENT afterwards. */
4199
4200 static rtx
4201 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4202 HOST_WIDE_INT adjustment)
4203 {
4204 switch (mode)
4205 {
4206 case E_DImode:
4207 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4208 GEN_INT (UNITS_PER_WORD));
4209 case E_DFmode:
4210 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4211 GEN_INT (UNITS_PER_WORD));
4212 default:
4213 gcc_unreachable ();
4214 }
4215 }
4216
4217 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4218 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4219 into CFI_OPS. */
4220
4221 static void
4222 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4223 rtx *cfi_ops)
4224 {
4225 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4226 rtx reg1 = gen_rtx_REG (mode, regno1);
4227
4228 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4229
4230 if (regno2 == INVALID_REGNUM)
4231 {
4232 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4233 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4234 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4235 }
4236 else
4237 {
4238 rtx reg2 = gen_rtx_REG (mode, regno2);
4239 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4240 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4241 reg2, adjustment));
4242 }
4243 }
4244
4245 /* Generate and return a store pair instruction of mode MODE to store
4246 register REG1 to MEM1 and register REG2 to MEM2. */
4247
4248 static rtx
4249 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4250 rtx reg2)
4251 {
4252 switch (mode)
4253 {
4254 case E_DImode:
4255 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4256
4257 case E_DFmode:
4258 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4259
4260 default:
4261 gcc_unreachable ();
4262 }
4263 }
4264
4265 /* Generate and return a load pair instruction of mode MODE to load register
4266 REG1 from MEM1 and register REG2 from MEM2. */
4267
4268 static rtx
4269 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4270 rtx mem2)
4271 {
4272 switch (mode)
4273 {
4274 case E_DImode:
4275 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4276
4277 case E_DFmode:
4278 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4279
4280 default:
4281 gcc_unreachable ();
4282 }
4283 }
4284
4285 /* Return TRUE if return address signing should be enabled for the current
4286 function, otherwise return FALSE. */
4287
4288 bool
4289 aarch64_return_address_signing_enabled (void)
4290 {
4291 /* This function should only be called after the frame has been laid out. */
4292 gcc_assert (cfun->machine->frame.laid_out);
4293
4294 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4295 if its LR is pushed onto the stack. */
4296 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4297 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4298 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4299 }
4300
4301 /* Emit code to save the callee-saved registers from register number START
4302 to LIMIT to the stack at the location starting at offset START_OFFSET,
4303 skipping any write-back candidates if SKIP_WB is true. */
4304
4305 static void
4306 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4307 unsigned start, unsigned limit, bool skip_wb)
4308 {
4309 rtx_insn *insn;
4310 unsigned regno;
4311 unsigned regno2;
4312
4313 for (regno = aarch64_next_callee_save (start, limit);
4314 regno <= limit;
4315 regno = aarch64_next_callee_save (regno + 1, limit))
4316 {
4317 rtx reg, mem;
4318 poly_int64 offset;
4319
4320 if (skip_wb
4321 && (regno == cfun->machine->frame.wb_candidate1
4322 || regno == cfun->machine->frame.wb_candidate2))
4323 continue;
4324
4325 if (cfun->machine->reg_is_wrapped_separately[regno])
4326 continue;
4327
4328 reg = gen_rtx_REG (mode, regno);
4329 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4330 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4331 offset));
4332
4333 regno2 = aarch64_next_callee_save (regno + 1, limit);
4334
4335 if (regno2 <= limit
4336 && !cfun->machine->reg_is_wrapped_separately[regno2]
4337 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4338 == cfun->machine->frame.reg_offset[regno2]))
4339
4340 {
4341 rtx reg2 = gen_rtx_REG (mode, regno2);
4342 rtx mem2;
4343
4344 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4345 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4346 offset));
4347 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4348 reg2));
4349
4350 /* The first part of a frame-related parallel insn is
4351 always assumed to be relevant to the frame
4352 calculations; subsequent parts are only
4353 frame-related if explicitly marked. */
4354 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4355 regno = regno2;
4356 }
4357 else
4358 insn = emit_move_insn (mem, reg);
4359
4360 RTX_FRAME_RELATED_P (insn) = 1;
4361 }
4362 }
4363
4364 /* Emit code to restore the callee registers of mode MODE from register
4365 number START up to and including LIMIT. Restore from the stack offset
4366 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4367 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4368
4369 static void
4370 aarch64_restore_callee_saves (machine_mode mode,
4371 poly_int64 start_offset, unsigned start,
4372 unsigned limit, bool skip_wb, rtx *cfi_ops)
4373 {
4374 rtx base_rtx = stack_pointer_rtx;
4375 unsigned regno;
4376 unsigned regno2;
4377 poly_int64 offset;
4378
4379 for (regno = aarch64_next_callee_save (start, limit);
4380 regno <= limit;
4381 regno = aarch64_next_callee_save (regno + 1, limit))
4382 {
4383 if (cfun->machine->reg_is_wrapped_separately[regno])
4384 continue;
4385
4386 rtx reg, mem;
4387
4388 if (skip_wb
4389 && (regno == cfun->machine->frame.wb_candidate1
4390 || regno == cfun->machine->frame.wb_candidate2))
4391 continue;
4392
4393 reg = gen_rtx_REG (mode, regno);
4394 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4395 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4396
4397 regno2 = aarch64_next_callee_save (regno + 1, limit);
4398
4399 if (regno2 <= limit
4400 && !cfun->machine->reg_is_wrapped_separately[regno2]
4401 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4402 == cfun->machine->frame.reg_offset[regno2]))
4403 {
4404 rtx reg2 = gen_rtx_REG (mode, regno2);
4405 rtx mem2;
4406
4407 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4408 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4409 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4410
4411 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4412 regno = regno2;
4413 }
4414 else
4415 emit_move_insn (reg, mem);
4416 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4417 }
4418 }
4419
4420 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4421 of MODE. */
4422
4423 static inline bool
4424 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4425 {
4426 HOST_WIDE_INT multiple;
4427 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4428 && IN_RANGE (multiple, -8, 7));
4429 }
4430
4431 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4432 of MODE. */
4433
4434 static inline bool
4435 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4436 {
4437 HOST_WIDE_INT multiple;
4438 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4439 && IN_RANGE (multiple, 0, 63));
4440 }
4441
4442 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4443 of MODE. */
4444
4445 bool
4446 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4447 {
4448 HOST_WIDE_INT multiple;
4449 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4450 && IN_RANGE (multiple, -64, 63));
4451 }
4452
4453 /* Return true if OFFSET is a signed 9-bit value. */
4454
4455 bool
4456 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4457 poly_int64 offset)
4458 {
4459 HOST_WIDE_INT const_offset;
4460 return (offset.is_constant (&const_offset)
4461 && IN_RANGE (const_offset, -256, 255));
4462 }
4463
4464 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4465 of MODE. */
4466
4467 static inline bool
4468 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4469 {
4470 HOST_WIDE_INT multiple;
4471 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4472 && IN_RANGE (multiple, -256, 255));
4473 }
4474
4475 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4476 of MODE. */
4477
4478 static inline bool
4479 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4480 {
4481 HOST_WIDE_INT multiple;
4482 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4483 && IN_RANGE (multiple, 0, 4095));
4484 }
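/* Taken together, and using DImode (8-byte) accesses as an illustrative
example: the signed 4-bit scaled range is [-64, 56], the unsigned 6-bit
scaled range is [0, 504], the signed 7-bit scaled range is [-512, 504],
the signed 9-bit unscaled range is [-256, 255] bytes, the signed 9-bit
scaled range is [-2048, 2040], and the unsigned 12-bit scaled range is
[0, 32760]. */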
4485
4486 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4487
4488 static sbitmap
4489 aarch64_get_separate_components (void)
4490 {
4491 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4492 bitmap_clear (components);
4493
4494 /* The registers that need to be saved to the frame. */
4495 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4496 if (aarch64_register_saved_on_entry (regno))
4497 {
4498 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4499 if (!frame_pointer_needed)
4500 offset += cfun->machine->frame.frame_size
4501 - cfun->machine->frame.hard_fp_offset;
4502 /* Check that we can access the stack slot of the register with one
4503 direct load with no adjustments needed. */
4504 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4505 bitmap_set_bit (components, regno);
4506 }
4507
4508 /* Don't mess with the hard frame pointer. */
4509 if (frame_pointer_needed)
4510 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4511
4512 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4513 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4514 /* If registers have been chosen to be stored/restored with
4515 writeback don't interfere with them to avoid having to output explicit
4516 stack adjustment instructions. */
4517 if (reg2 != INVALID_REGNUM)
4518 bitmap_clear_bit (components, reg2);
4519 if (reg1 != INVALID_REGNUM)
4520 bitmap_clear_bit (components, reg1);
4521
4522 bitmap_clear_bit (components, LR_REGNUM);
4523 bitmap_clear_bit (components, SP_REGNUM);
4524
4525 return components;
4526 }
4527
4528 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4529
4530 static sbitmap
4531 aarch64_components_for_bb (basic_block bb)
4532 {
4533 bitmap in = DF_LIVE_IN (bb);
4534 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4535 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4536
4537 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4538 bitmap_clear (components);
4539
4540 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4541 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4542 if ((!call_used_regs[regno])
4543 && (bitmap_bit_p (in, regno)
4544 || bitmap_bit_p (gen, regno)
4545 || bitmap_bit_p (kill, regno)))
4546 {
4547 unsigned regno2, offset, offset2;
4548 bitmap_set_bit (components, regno);
4549
4550 /* If there is a callee-save register at an adjacent offset, add it
4551 as well, to increase the use of LDP/STP. */
4552 offset = cfun->machine->frame.reg_offset[regno];
4553 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4554
4555 if (regno2 <= LAST_SAVED_REGNUM)
4556 {
4557 offset2 = cfun->machine->frame.reg_offset[regno2];
4558 if ((offset & ~8) == (offset2 & ~8))
4559 bitmap_set_bit (components, regno2);
4560 }
4561 }
4562
4563 return components;
4564 }
4565
4566 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4567 Nothing to do for aarch64. */
4568
4569 static void
4570 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4571 {
4572 }
4573
4574 /* Return the next set bit in BMP from START onwards. Return the total number
4575 of bits in BMP if no set bit is found at or after START. */
4576
4577 static unsigned int
4578 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4579 {
4580 unsigned int nbits = SBITMAP_SIZE (bmp);
4581 if (start == nbits)
4582 return start;
4583
4584 gcc_assert (start < nbits);
4585 for (unsigned int i = start; i < nbits; i++)
4586 if (bitmap_bit_p (bmp, i))
4587 return i;
4588
4589 return nbits;
4590 }
4591
4592 /* Do the work for aarch64_emit_prologue_components and
4593 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4594 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4595 for these components or the epilogue sequence. That is, it determines
4596 whether we should emit stores or loads and what kind of CFA notes to attach
4597 to the insns. Otherwise the logic for the two sequences is very
4598 similar. */
4599
4600 static void
4601 aarch64_process_components (sbitmap components, bool prologue_p)
4602 {
4603 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4604 ? HARD_FRAME_POINTER_REGNUM
4605 : STACK_POINTER_REGNUM);
4606
4607 unsigned last_regno = SBITMAP_SIZE (components);
4608 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4609 rtx_insn *insn = NULL;
4610
4611 while (regno != last_regno)
4612 {
4613 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4614 so DFmode is enough for the vector registers. */
4615 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4616 rtx reg = gen_rtx_REG (mode, regno);
4617 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4618 if (!frame_pointer_needed)
4619 offset += cfun->machine->frame.frame_size
4620 - cfun->machine->frame.hard_fp_offset;
4621 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4622 rtx mem = gen_frame_mem (mode, addr);
4623
4624 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4625 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4626 /* No more registers to handle after REGNO.
4627 Emit a single save/restore and exit. */
4628 if (regno2 == last_regno)
4629 {
4630 insn = emit_insn (set);
4631 RTX_FRAME_RELATED_P (insn) = 1;
4632 if (prologue_p)
4633 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4634 else
4635 add_reg_note (insn, REG_CFA_RESTORE, reg);
4636 break;
4637 }
4638
4639 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4640 /* The next register is not of the same class or its offset is not
4641 mergeable with the current one into a pair. */
4642 if (!satisfies_constraint_Ump (mem)
4643 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4644 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4645 GET_MODE_SIZE (mode)))
4646 {
4647 insn = emit_insn (set);
4648 RTX_FRAME_RELATED_P (insn) = 1;
4649 if (prologue_p)
4650 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4651 else
4652 add_reg_note (insn, REG_CFA_RESTORE, reg);
4653
4654 regno = regno2;
4655 continue;
4656 }
4657
4658 /* REGNO2 can be saved/restored in a pair with REGNO. */
4659 rtx reg2 = gen_rtx_REG (mode, regno2);
4660 if (!frame_pointer_needed)
4661 offset2 += cfun->machine->frame.frame_size
4662 - cfun->machine->frame.hard_fp_offset;
4663 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4664 rtx mem2 = gen_frame_mem (mode, addr2);
4665 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4666 : gen_rtx_SET (reg2, mem2);
4667
4668 if (prologue_p)
4669 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4670 else
4671 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4672
4673 RTX_FRAME_RELATED_P (insn) = 1;
4674 if (prologue_p)
4675 {
4676 add_reg_note (insn, REG_CFA_OFFSET, set);
4677 add_reg_note (insn, REG_CFA_OFFSET, set2);
4678 }
4679 else
4680 {
4681 add_reg_note (insn, REG_CFA_RESTORE, reg);
4682 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4683 }
4684
4685 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4686 }
4687 }
4688
4689 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4690
4691 static void
4692 aarch64_emit_prologue_components (sbitmap components)
4693 {
4694 aarch64_process_components (components, true);
4695 }
4696
4697 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4698
4699 static void
4700 aarch64_emit_epilogue_components (sbitmap components)
4701 {
4702 aarch64_process_components (components, false);
4703 }
4704
4705 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4706
4707 static void
4708 aarch64_set_handled_components (sbitmap components)
4709 {
4710 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4711 if (bitmap_bit_p (components, regno))
4712 cfun->machine->reg_is_wrapped_separately[regno] = true;
4713 }
4714
4715 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4716 is saved at BASE + OFFSET. */
4717
4718 static void
4719 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4720 rtx base, poly_int64 offset)
4721 {
4722 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4723 add_reg_note (insn, REG_CFA_EXPRESSION,
4724 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4725 }
4726
4727 /* AArch64 stack frames generated by this compiler look like:
4728
4729 +-------------------------------+
4730 | |
4731 | incoming stack arguments |
4732 | |
4733 +-------------------------------+
4734 | | <-- incoming stack pointer (aligned)
4735 | callee-allocated save area |
4736 | for register varargs |
4737 | |
4738 +-------------------------------+
4739 | local variables | <-- frame_pointer_rtx
4740 | |
4741 +-------------------------------+
4742 | padding0 | \
4743 +-------------------------------+ |
4744 | callee-saved registers | | frame.saved_regs_size
4745 +-------------------------------+ |
4746 | LR' | |
4747 +-------------------------------+ |
4748 | FP' | / <- hard_frame_pointer_rtx (aligned)
4749 +-------------------------------+
4750 | dynamic allocation |
4751 +-------------------------------+
4752 | padding |
4753 +-------------------------------+
4754 | outgoing stack arguments | <-- arg_pointer
4755 | |
4756 +-------------------------------+
4757 | | <-- stack_pointer_rtx (aligned)
4758
4759 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4760 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4761 unchanged. */
4762
4763 /* Generate the prologue instructions for entry into a function.
4764 Establish the stack frame by decreasing the stack pointer with a
4765 properly calculated size and, if necessary, create a frame record
4766 filled with the values of LR and previous frame pointer. The
4767 current FP is also set up if it is in use. */
4768
4769 void
4770 aarch64_expand_prologue (void)
4771 {
4772 poly_int64 frame_size = cfun->machine->frame.frame_size;
4773 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4774 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4775 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4776 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4777 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4778 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4779 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4780 rtx_insn *insn;
4781
4782 /* Sign return address for functions. */
4783 if (aarch64_return_address_signing_enabled ())
4784 {
4785 insn = emit_insn (gen_pacisp ());
4786 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4787 RTX_FRAME_RELATED_P (insn) = 1;
4788 }
4789
4790 if (flag_stack_usage_info)
4791 current_function_static_stack_size = constant_lower_bound (frame_size);
4792
4793 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4794 {
4795 if (crtl->is_leaf && !cfun->calls_alloca)
4796 {
4797 if (maybe_gt (frame_size, PROBE_INTERVAL)
4798 && maybe_gt (frame_size, get_stack_check_protect ()))
4799 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4800 (frame_size
4801 - get_stack_check_protect ()));
4802 }
4803 else if (maybe_gt (frame_size, 0))
4804 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4805 }
4806
4807 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4808 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4809
4810 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4811
4812 if (callee_adjust != 0)
4813 aarch64_push_regs (reg1, reg2, callee_adjust);
4814
4815 if (emit_frame_chain)
4816 {
4817 poly_int64 reg_offset = callee_adjust;
4818 if (callee_adjust == 0)
4819 {
4820 reg1 = R29_REGNUM;
4821 reg2 = R30_REGNUM;
4822 reg_offset = callee_offset;
4823 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4824 }
4825 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4826 stack_pointer_rtx, callee_offset,
4827 ip1_rtx, ip0_rtx, frame_pointer_needed);
4828 if (frame_pointer_needed && !frame_size.is_constant ())
4829 {
4830 /* Variable-sized frames need to describe the save slot
4831 address using DW_CFA_expression rather than DW_CFA_offset.
4832 This means that, without taking further action, the
4833 locations of the registers that we've already saved would
4834 remain based on the stack pointer even after we redefine
4835 the CFA based on the frame pointer. We therefore need new
4836 DW_CFA_expressions to re-express the save slots with addresses
4837 based on the frame pointer. */
4838 rtx_insn *insn = get_last_insn ();
4839 gcc_assert (RTX_FRAME_RELATED_P (insn));
4840
4841 /* Add an explicit CFA definition if this was previously
4842 implicit. */
4843 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4844 {
4845 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4846 callee_offset);
4847 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4848 gen_rtx_SET (hard_frame_pointer_rtx, src));
4849 }
4850
4851 /* Change the save slot expressions for the registers that
4852 we've already saved. */
4853 reg_offset -= callee_offset;
4854 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4855 reg_offset + UNITS_PER_WORD);
4856 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4857 reg_offset);
4858 }
4859 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4860 }
4861
4862 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4863 callee_adjust != 0 || emit_frame_chain);
4864 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4865 callee_adjust != 0 || emit_frame_chain);
4866 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4867 }
4868
4869 /* Return TRUE if we can use a simple_return insn.
4870
4871 This function checks whether the callee-saved stack is empty, which
4872 means no restore actions are needed. The pro_and_epilogue pass uses
4873 this to check whether the shrink-wrapping optimization is feasible. */
4874
4875 bool
4876 aarch64_use_return_insn_p (void)
4877 {
4878 if (!reload_completed)
4879 return false;
4880
4881 if (crtl->profile)
4882 return false;
4883
4884 return known_eq (cfun->machine->frame.frame_size, 0);
4885 }
4886
4887 /* Generate the epilogue instructions for returning from a function.
4888 This is almost exactly the reverse of the prologue sequence, except
4889 that we need to insert barriers to avoid scheduling loads that read
4890 from a deallocated stack, and we optimize the unwind records by
4891 emitting them all together if possible. */
4892 void
4893 aarch64_expand_epilogue (bool for_sibcall)
4894 {
4895 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4896 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4897 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4898 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4899 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4900 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4901 rtx cfi_ops = NULL;
4902 rtx_insn *insn;
4903 /* A stack clash protection prologue may not have left IP0_REGNUM or
4904 IP1_REGNUM in a usable state. The same is true for allocations
4905 with an SVE component, since we then need both temporary registers
4906 for each allocation. */
4907 bool can_inherit_p = (initial_adjust.is_constant ()
4908 && final_adjust.is_constant ()
4909 && !flag_stack_clash_protection);
4910
4911 /* We need to add a memory barrier to prevent reads from deallocated stack. */
4912 bool need_barrier_p
4913 = maybe_ne (get_frame_size ()
4914 + cfun->machine->frame.saved_varargs_size, 0);
4915
4916 /* Emit a barrier to prevent loads from a deallocated stack. */
4917 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4918 || cfun->calls_alloca
4919 || crtl->calls_eh_return)
4920 {
4921 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4922 need_barrier_p = false;
4923 }
4924
4925 /* Restore the stack pointer from the frame pointer if it may not
4926 be the same as the stack pointer. */
4927 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4928 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4929 if (frame_pointer_needed
4930 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4931 /* If writeback is used when restoring callee-saves, the CFA
4932 is restored on the instruction doing the writeback. */
4933 aarch64_add_offset (Pmode, stack_pointer_rtx,
4934 hard_frame_pointer_rtx, -callee_offset,
4935 ip1_rtx, ip0_rtx, callee_adjust == 0);
4936 else
4937 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4938 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4939
4940 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4941 callee_adjust != 0, &cfi_ops);
4942 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4943 callee_adjust != 0, &cfi_ops);
4944
4945 if (need_barrier_p)
4946 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4947
4948 if (callee_adjust != 0)
4949 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4950
4951 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4952 {
4953 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4954 insn = get_last_insn ();
4955 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4956 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4957 RTX_FRAME_RELATED_P (insn) = 1;
4958 cfi_ops = NULL;
4959 }
4960
4961 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4962 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4963
4964 if (cfi_ops)
4965 {
4966 /* Emit delayed restores and reset the CFA to be SP. */
4967 insn = get_last_insn ();
4968 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4969 REG_NOTES (insn) = cfi_ops;
4970 RTX_FRAME_RELATED_P (insn) = 1;
4971 }
4972
4973 /* We prefer to emit the combined return/authenticate instruction RETAA;
4974 however, there are three cases in which we must instead emit an explicit
4975 authentication instruction.
4976
4977 1) Sibcalls don't return in a normal way, so if we're about to call one
4978 we must authenticate.
4979
4980 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4981 generating code for !TARGET_ARMV8_3 we can't use it and must
4982 explicitly authenticate.
4983
4984 3) On an eh_return path we make extra stack adjustments to update the
4985 canonical frame address to be the exception handler's CFA. We want
4986 to authenticate using the CFA of the function which calls eh_return.
4987 */
4988 if (aarch64_return_address_signing_enabled ()
4989 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
4990 {
4991 insn = emit_insn (gen_autisp ());
4992 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4993 RTX_FRAME_RELATED_P (insn) = 1;
4994 }
4995
4996 /* Stack adjustment for exception handler. */
4997 if (crtl->calls_eh_return)
4998 {
4999 /* We need to unwind the stack by the offset computed by
5000 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5001 to be SP; letting the CFA move during this adjustment
5002 is just as correct as retaining the CFA from the body
5003 of the function. Therefore, do nothing special. */
5004 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5005 }
5006
5007 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5008 if (!for_sibcall)
5009 emit_jump_insn (ret_rtx);
5010 }
5011
5012 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5013 normally or return to a previous frame after unwinding.
5014
5015 An EH return uses a single shared return sequence. The epilogue is
5016 exactly like a normal epilogue except that it has an extra input
5017 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5018 that must be applied after the frame has been destroyed. An extra label
5019 is inserted before the epilogue which initializes this register to zero,
5020 and this is the entry point for a normal return.
5021
5022 An actual EH return updates the return address, initializes the stack
5023 adjustment and jumps directly into the epilogue (bypassing the zeroing
5024 of the adjustment). Since the return address is typically saved on the
5025 stack when a function makes a call, the saved LR must be updated outside
5026 the epilogue.
5027
5028 This poses problems as the store is generated well before the epilogue,
5029 so the offset of LR is not known yet. Also optimizations will remove the
5030 store as it appears dead, even after the epilogue is generated (as the
5031 base or offset for loading LR is different in many cases).
5032
5033 To avoid these problems this implementation forces the frame pointer
5034 in eh_return functions so that the location of LR is fixed and known early.
5035 It also marks the store volatile, so no optimization is permitted to
5036 remove the store. */
5037 rtx
5038 aarch64_eh_return_handler_rtx (void)
5039 {
5040 rtx tmp = gen_frame_mem (Pmode,
5041 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5042
5043 /* Mark the store volatile, so no optimization is permitted to remove it. */
5044 MEM_VOLATILE_P (tmp) = true;
5045 return tmp;
5046 }
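/* In other words, __builtin_eh_return stores the handler's address into the
LR slot of the frame record, i.e. the word at the hard frame pointer plus
UNITS_PER_WORD ([x29, 8] at run time), which the shared epilogue then reloads
in the normal way so that the final return transfers to the handler. */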
5047
5048 /* Output code to add DELTA to the first argument, and then jump
5049 to FUNCTION. Used for C++ multiple inheritance. */
5050 static void
5051 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5052 HOST_WIDE_INT delta,
5053 HOST_WIDE_INT vcall_offset,
5054 tree function)
5055 {
5056 /* The this pointer is always in x0. Note that this differs from
5057 Arm where the this pointer may be bumped to r1 if r0 is required
5058 to return a pointer to an aggregate. On AArch64 a result value
5059 pointer will be in x8. */
5060 int this_regno = R0_REGNUM;
5061 rtx this_rtx, temp0, temp1, addr, funexp;
5062 rtx_insn *insn;
5063
5064 reload_completed = 1;
5065 emit_note (NOTE_INSN_PROLOGUE_END);
5066
5067 this_rtx = gen_rtx_REG (Pmode, this_regno);
5068 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5069 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5070
5071 if (vcall_offset == 0)
5072 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5073 else
5074 {
5075 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5076
5077 addr = this_rtx;
5078 if (delta != 0)
5079 {
5080 if (delta >= -256 && delta < 256)
5081 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5082 plus_constant (Pmode, this_rtx, delta));
5083 else
5084 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5085 temp1, temp0, false);
5086 }
5087
5088 if (Pmode == ptr_mode)
5089 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5090 else
5091 aarch64_emit_move (temp0,
5092 gen_rtx_ZERO_EXTEND (Pmode,
5093 gen_rtx_MEM (ptr_mode, addr)));
5094
5095 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5096 addr = plus_constant (Pmode, temp0, vcall_offset);
5097 else
5098 {
5099 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5100 Pmode);
5101 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5102 }
5103
5104 if (Pmode == ptr_mode)
5105 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5106 else
5107 aarch64_emit_move (temp1,
5108 gen_rtx_SIGN_EXTEND (Pmode,
5109 gen_rtx_MEM (ptr_mode, addr)));
5110
5111 emit_insn (gen_add2_insn (this_rtx, temp1));
5112 }
5113
5114 /* Generate a tail call to the target function. */
5115 if (!TREE_USED (function))
5116 {
5117 assemble_external (function);
5118 TREE_USED (function) = 1;
5119 }
5120 funexp = XEXP (DECL_RTL (function), 0);
5121 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5122 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5123 SIBLING_CALL_P (insn) = 1;
5124
5125 insn = get_insns ();
5126 shorten_branches (insn);
5127 final_start_function (insn, file, 1);
5128 final (insn, file, 1);
5129 final_end_function ();
5130
5131 /* Stop pretending to be a post-reload pass. */
5132 reload_completed = 0;
5133 }
5134
5135 static bool
5136 aarch64_tls_referenced_p (rtx x)
5137 {
5138 if (!TARGET_HAVE_TLS)
5139 return false;
5140 subrtx_iterator::array_type array;
5141 FOR_EACH_SUBRTX (iter, array, x, ALL)
5142 {
5143 const_rtx x = *iter;
5144 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5145 return true;
5146 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5147 TLS offsets, not real symbol references. */
5148 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5149 iter.skip_subrtxes ();
5150 }
5151 return false;
5152 }
5153
5154
5155 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5156 a left shift of 0 or 12 bits. */
5157 bool
5158 aarch64_uimm12_shift (HOST_WIDE_INT val)
5159 {
5160 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5161 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5162 );
5163 }
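/* For example, 0xabc (shift of 0) and 0xabc000 (shift of 12) both satisfy
this test, while 0xabc001 does not, because its set bits span both 12-bit
fields. */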
5164
5165
5166 /* Return true if val is an immediate that can be loaded into a
5167 register by a MOVZ instruction. */
5168 static bool
5169 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5170 {
5171 if (GET_MODE_SIZE (mode) > 4)
5172 {
5173 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5174 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5175 return 1;
5176 }
5177 else
5178 {
5179 /* Ignore sign extension. */
5180 val &= (HOST_WIDE_INT) 0xffffffff;
5181 }
5182 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5183 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5184 }
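/* For example, in DImode 0x0000abcd00000000 is accepted (a single 16-bit
chunk which MOVZ can place with LSL #32), whereas 0x12345 is rejected because
its set bits straddle two 16-bit chunks. The caller, aarch64_move_imm,
additionally tries the complemented value to cover MOVN. */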
5185
5186 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5187 64-bit (DImode) integer. */
5188
5189 static unsigned HOST_WIDE_INT
5190 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5191 {
5192 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5193 while (size < 64)
5194 {
5195 val &= (HOST_WIDE_INT_1U << size) - 1;
5196 val |= val << size;
5197 size *= 2;
5198 }
5199 return val;
5200 }
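/* For example, replicating the HImode value 0x00f0 yields
0x00f000f000f000f0. */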
5201
5202 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5203
5204 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5205 {
5206 0x0000000100000001ull,
5207 0x0001000100010001ull,
5208 0x0101010101010101ull,
5209 0x1111111111111111ull,
5210 0x5555555555555555ull,
5211 };
5212
5213
5214 /* Return true if val is a valid bitmask immediate. */
5215
5216 bool
5217 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5218 {
5219 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5220 int bits;
5221
5222 /* Check for a single sequence of one bits and return quickly if so.
5223 The special cases of all ones and all zeroes return false. */
5224 val = aarch64_replicate_bitmask_imm (val_in, mode);
5225 tmp = val + (val & -val);
5226
5227 if (tmp == (tmp & -tmp))
5228 return (val + 1) > 1;
5229
5230 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5231 if (mode == SImode)
5232 val = (val << 32) | (val & 0xffffffff);
5233
5234 /* Invert if the immediate doesn't start with a zero bit - this means we
5235 only need to search for sequences of one bits. */
5236 if (val & 1)
5237 val = ~val;
5238
5239 /* Find the first set bit and set tmp to val with the first sequence of one
5240 bits removed. Return success if there is a single sequence of ones. */
5241 first_one = val & -val;
5242 tmp = val & (val + first_one);
5243
5244 if (tmp == 0)
5245 return true;
5246
5247 /* Find the next set bit and compute the difference in bit position. */
5248 next_one = tmp & -tmp;
5249 bits = clz_hwi (first_one) - clz_hwi (next_one);
5250 mask = val ^ tmp;
5251
5252 /* Check the bit position difference is a power of 2, and that the first
5253 sequence of one bits fits within 'bits' bits. */
5254 if ((mask >> bits) != 0 || bits != (bits & -bits))
5255 return false;
5256
5257 /* Check the sequence of one bits is repeated 64/bits times. */
5258 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5259 }
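/* For illustration: 0x00ff00ff00ff00ff is a valid bitmask immediate (an
   8-bit run of ones repeated every 16 bits), whereas 0x00ff00000000ff00
   is not, because its runs of ones are not spaced at a power-of-2 repeat
   width.  The expression __builtin_clz (bits) - 26 maps the repeat
   widths 32, 16, 8, 4 and 2 to entries 0..4 of bitmask_imm_mul
   (assuming a 32-bit int).  */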
5260
5261 /* Create a mask of ones covering the range from the lowest to the highest
5262 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5263
5264 unsigned HOST_WIDE_INT
5265 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5266 {
5267 int lowest_bit_set = ctz_hwi (val_in);
5268 int highest_bit_set = floor_log2 (val_in);
5269 gcc_assert (val_in != 0);
5270
5271 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5272 (HOST_WIDE_INT_1U << lowest_bit_set));
5273 }
5274
5275 /* Create a constant in which all bits outside the range from the lowest to
5276 the highest set bit of VAL_IN are set to 1. */
5277
5278 unsigned HOST_WIDE_INT
5279 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5280 {
5281 return val_in | ~aarch64_and_split_imm1 (val_in);
5282 }
5283
5284 /* Return true if VAL_IN is not a single-instruction immediate but can be split into two AND bitmask immediates. */
5285
5286 bool
5287 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5288 {
5289 scalar_int_mode int_mode;
5290 if (!is_a <scalar_int_mode> (mode, &int_mode))
5291 return false;
5292
5293 if (aarch64_bitmask_imm (val_in, int_mode))
5294 return false;
5295
5296 if (aarch64_move_imm (val_in, int_mode))
5297 return false;
5298
5299 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5300
5301 return aarch64_bitmask_imm (imm2, int_mode);
5302 }
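/* For illustration: VAL_IN == 0x00ff00000000ff00 is neither a bitmask
   nor a MOV immediate, but aarch64_and_split_imm1 gives
   0x00ffffffffffff00 (ones from bit 8 to bit 55) and
   aarch64_and_split_imm2 gives 0xffff00000000ffff; both are valid
   bitmask immediates and their intersection is VAL_IN, so the AND can
   be performed as two AND-immediate instructions.  */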
5303
5304 /* Return true if val is an immediate that can be loaded into a
5305 register in a single instruction. */
5306 bool
5307 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5308 {
5309 scalar_int_mode int_mode;
5310 if (!is_a <scalar_int_mode> (mode, &int_mode))
5311 return false;
5312
5313 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5314 return true;
5315 return aarch64_bitmask_imm (val, int_mode);
5316 }
5317
5318 static bool
5319 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5320 {
5321 rtx base, offset;
5322
5323 if (GET_CODE (x) == HIGH)
5324 return true;
5325
5326 /* There's no way to calculate VL-based values using relocations. */
5327 subrtx_iterator::array_type array;
5328 FOR_EACH_SUBRTX (iter, array, x, ALL)
5329 if (GET_CODE (*iter) == CONST_POLY_INT)
5330 return true;
5331
5332 split_const (x, &base, &offset);
5333 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5334 {
5335 if (aarch64_classify_symbol (base, INTVAL (offset))
5336 != SYMBOL_FORCE_TO_MEM)
5337 return true;
5338 else
5339 /* Avoid generating a 64-bit relocation in ILP32; leave it to
5340 aarch64_expand_mov_immediate to handle it properly. */
5341 return mode != ptr_mode;
5342 }
5343
5344 return aarch64_tls_referenced_p (x);
5345 }
5346
5347 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5348 The expansion of a table switch is quite expensive due to the number
5349 of instructions, the table lookup and the hard-to-predict indirect jump.
5350 When optimizing for speed with -O3 enabled, use the per-core tuning if
5351 set, otherwise use tables for more than 16 cases as a tradeoff between
5352 size and performance. When optimizing for size, use the default setting. */
5353
5354 static unsigned int
5355 aarch64_case_values_threshold (void)
5356 {
5357 /* Use the specified limit for the number of cases before using jump
5358 tables at higher optimization levels. */
5359 if (optimize > 2
5360 && selected_cpu->tune->max_case_values != 0)
5361 return selected_cpu->tune->max_case_values;
5362 else
5363 return optimize_size ? default_case_values_threshold () : 17;
5364 }
5365
5366 /* Return true if register REGNO is a valid index register.
5367 STRICT_P is true if REG_OK_STRICT is in effect. */
5368
5369 bool
5370 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5371 {
5372 if (!HARD_REGISTER_NUM_P (regno))
5373 {
5374 if (!strict_p)
5375 return true;
5376
5377 if (!reg_renumber)
5378 return false;
5379
5380 regno = reg_renumber[regno];
5381 }
5382 return GP_REGNUM_P (regno);
5383 }
5384
5385 /* Return true if register REGNO is a valid base register.
5386 STRICT_P is true if REG_OK_STRICT is in effect. */
5387
5388 bool
5389 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5390 {
5391 if (!HARD_REGISTER_NUM_P (regno))
5392 {
5393 if (!strict_p)
5394 return true;
5395
5396 if (!reg_renumber)
5397 return false;
5398
5399 regno = reg_renumber[regno];
5400 }
5401
5402 /* The fake registers will be eliminated to either the stack or
5403 hard frame pointer, both of which are usually valid base registers.
5404 Reload deals with the cases where the eliminated form isn't valid. */
5405 return (GP_REGNUM_P (regno)
5406 || regno == SP_REGNUM
5407 || regno == FRAME_POINTER_REGNUM
5408 || regno == ARG_POINTER_REGNUM);
5409 }
5410
5411 /* Return true if X is a valid base register.
5412 STRICT_P is true if REG_OK_STRICT is in effect. */
5413
5414 static bool
5415 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5416 {
5417 if (!strict_p
5418 && GET_CODE (x) == SUBREG
5419 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5420 x = SUBREG_REG (x);
5421
5422 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5423 }
5424
5425 /* Return true if X is a valid address index for mode MODE. If it is, fill in INFO
5426 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5427
5428 static bool
5429 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5430 machine_mode mode, bool strict_p)
5431 {
5432 enum aarch64_address_type type;
5433 rtx index;
5434 int shift;
5435
5436 /* (reg:P) */
5437 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5438 && GET_MODE (x) == Pmode)
5439 {
5440 type = ADDRESS_REG_REG;
5441 index = x;
5442 shift = 0;
5443 }
5444 /* (sign_extend:DI (reg:SI)) */
5445 else if ((GET_CODE (x) == SIGN_EXTEND
5446 || GET_CODE (x) == ZERO_EXTEND)
5447 && GET_MODE (x) == DImode
5448 && GET_MODE (XEXP (x, 0)) == SImode)
5449 {
5450 type = (GET_CODE (x) == SIGN_EXTEND)
5451 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5452 index = XEXP (x, 0);
5453 shift = 0;
5454 }
5455 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5456 else if (GET_CODE (x) == MULT
5457 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5458 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5459 && GET_MODE (XEXP (x, 0)) == DImode
5460 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5461 && CONST_INT_P (XEXP (x, 1)))
5462 {
5463 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5464 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5465 index = XEXP (XEXP (x, 0), 0);
5466 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5467 }
5468 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5469 else if (GET_CODE (x) == ASHIFT
5470 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5471 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5472 && GET_MODE (XEXP (x, 0)) == DImode
5473 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5474 && CONST_INT_P (XEXP (x, 1)))
5475 {
5476 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5477 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5478 index = XEXP (XEXP (x, 0), 0);
5479 shift = INTVAL (XEXP (x, 1));
5480 }
5481 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5482 else if ((GET_CODE (x) == SIGN_EXTRACT
5483 || GET_CODE (x) == ZERO_EXTRACT)
5484 && GET_MODE (x) == DImode
5485 && GET_CODE (XEXP (x, 0)) == MULT
5486 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5487 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5488 {
5489 type = (GET_CODE (x) == SIGN_EXTRACT)
5490 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5491 index = XEXP (XEXP (x, 0), 0);
5492 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5493 if (INTVAL (XEXP (x, 1)) != 32 + shift
5494 || INTVAL (XEXP (x, 2)) != 0)
5495 shift = -1;
5496 }
5497 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5498 (const_int 0xffffffff<<shift)) */
5499 else if (GET_CODE (x) == AND
5500 && GET_MODE (x) == DImode
5501 && GET_CODE (XEXP (x, 0)) == MULT
5502 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5503 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5504 && CONST_INT_P (XEXP (x, 1)))
5505 {
5506 type = ADDRESS_REG_UXTW;
5507 index = XEXP (XEXP (x, 0), 0);
5508 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5509 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5510 shift = -1;
5511 }
5512 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5513 else if ((GET_CODE (x) == SIGN_EXTRACT
5514 || GET_CODE (x) == ZERO_EXTRACT)
5515 && GET_MODE (x) == DImode
5516 && GET_CODE (XEXP (x, 0)) == ASHIFT
5517 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5518 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5519 {
5520 type = (GET_CODE (x) == SIGN_EXTRACT)
5521 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5522 index = XEXP (XEXP (x, 0), 0);
5523 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5524 if (INTVAL (XEXP (x, 1)) != 32 + shift
5525 || INTVAL (XEXP (x, 2)) != 0)
5526 shift = -1;
5527 }
5528 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5529 (const_int 0xffffffff<<shift)) */
5530 else if (GET_CODE (x) == AND
5531 && GET_MODE (x) == DImode
5532 && GET_CODE (XEXP (x, 0)) == ASHIFT
5533 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5534 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5535 && CONST_INT_P (XEXP (x, 1)))
5536 {
5537 type = ADDRESS_REG_UXTW;
5538 index = XEXP (XEXP (x, 0), 0);
5539 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5540 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5541 shift = -1;
5542 }
5543 /* (mult:P (reg:P) (const_int scale)) */
5544 else if (GET_CODE (x) == MULT
5545 && GET_MODE (x) == Pmode
5546 && GET_MODE (XEXP (x, 0)) == Pmode
5547 && CONST_INT_P (XEXP (x, 1)))
5548 {
5549 type = ADDRESS_REG_REG;
5550 index = XEXP (x, 0);
5551 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5552 }
5553 /* (ashift:P (reg:P) (const_int shift)) */
5554 else if (GET_CODE (x) == ASHIFT
5555 && GET_MODE (x) == Pmode
5556 && GET_MODE (XEXP (x, 0)) == Pmode
5557 && CONST_INT_P (XEXP (x, 1)))
5558 {
5559 type = ADDRESS_REG_REG;
5560 index = XEXP (x, 0);
5561 shift = INTVAL (XEXP (x, 1));
5562 }
5563 else
5564 return false;
5565
5566 if (!strict_p
5567 && GET_CODE (index) == SUBREG
5568 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5569 index = SUBREG_REG (index);
5570
5571 if (aarch64_sve_data_mode_p (mode))
5572 {
5573 if (type != ADDRESS_REG_REG
5574 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5575 return false;
5576 }
5577 else
5578 {
5579 if (shift != 0
5580 && !(IN_RANGE (shift, 1, 3)
5581 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5582 return false;
5583 }
5584
5585 if (REG_P (index)
5586 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5587 {
5588 info->type = type;
5589 info->offset = index;
5590 info->shift = shift;
5591 return true;
5592 }
5593
5594 return false;
5595 }
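/* For illustration: for a DImode access, the index
   (ashift:DI (reg:DI Xm) (const_int 3)) is classified as ADDRESS_REG_REG
   with shift 3 (the [Xn, Xm, lsl #3] form), and
   (mult:DI (sign_extend:DI (reg:SI Wm)) (const_int 8)) is classified as
   ADDRESS_REG_SXTW with shift 3 (the [Xn, Wm, sxtw #3] form).  */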
5596
5597 /* Return true if MODE is one of the modes for which we
5598 support LDP/STP operations. */
5599
5600 static bool
5601 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5602 {
5603 return mode == SImode || mode == DImode
5604 || mode == SFmode || mode == DFmode
5605 || (aarch64_vector_mode_supported_p (mode)
5606 && (known_eq (GET_MODE_SIZE (mode), 8)
5607 || (known_eq (GET_MODE_SIZE (mode), 16)
5608 && (aarch64_tune_params.extra_tuning_flags
5609 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5610 }
5611
5612 /* Return true if REGNO is a virtual pointer register, or an eliminable
5613 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5614 include stack_pointer or hard_frame_pointer. */
5615 static bool
5616 virt_or_elim_regno_p (unsigned regno)
5617 {
5618 return ((regno >= FIRST_VIRTUAL_REGISTER
5619 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5620 || regno == FRAME_POINTER_REGNUM
5621 || regno == ARG_POINTER_REGNUM);
5622 }
5623
5624 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5625 If it is, fill in INFO appropriately. STRICT_P is true if
5626 REG_OK_STRICT is in effect. */
5627
5628 bool
5629 aarch64_classify_address (struct aarch64_address_info *info,
5630 rtx x, machine_mode mode, bool strict_p,
5631 aarch64_addr_query_type type)
5632 {
5633 enum rtx_code code = GET_CODE (x);
5634 rtx op0, op1;
5635 poly_int64 offset;
5636
5637 HOST_WIDE_INT const_size;
5638
5639 /* On BE, we use load/store pair for all large int mode load/stores.
5640 TI/TFmode may also use a load/store pair. */
5641 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5642 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5643 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5644 || type == ADDR_QUERY_LDP_STP_N
5645 || mode == TImode
5646 || mode == TFmode
5647 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5648
5649 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
5650 to the actual size of the memory being loaded/stored and the mode used to
5651 check the address is half of that size. */
5652 if (type == ADDR_QUERY_LDP_STP_N
5653 && known_eq (GET_MODE_SIZE (mode), 16))
5654 mode = DFmode;
5655
5656 bool allow_reg_index_p = (!load_store_pair_p
5657 && (known_lt (GET_MODE_SIZE (mode), 16)
5658 || vec_flags == VEC_ADVSIMD
5659 || vec_flags == VEC_SVE_DATA));
5660
5661 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5662 [Rn, #offset, MUL VL]. */
5663 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5664 && (code != REG && code != PLUS))
5665 return false;
5666
5667 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5668 REG addressing. */
5669 if (advsimd_struct_p
5670 && !BYTES_BIG_ENDIAN
5671 && (code != POST_INC && code != REG))
5672 return false;
5673
5674 gcc_checking_assert (GET_MODE (x) == VOIDmode
5675 || SCALAR_INT_MODE_P (GET_MODE (x)));
5676
5677 switch (code)
5678 {
5679 case REG:
5680 case SUBREG:
5681 info->type = ADDRESS_REG_IMM;
5682 info->base = x;
5683 info->offset = const0_rtx;
5684 info->const_offset = 0;
5685 return aarch64_base_register_rtx_p (x, strict_p);
5686
5687 case PLUS:
5688 op0 = XEXP (x, 0);
5689 op1 = XEXP (x, 1);
5690
5691 if (! strict_p
5692 && REG_P (op0)
5693 && virt_or_elim_regno_p (REGNO (op0))
5694 && poly_int_rtx_p (op1, &offset))
5695 {
5696 info->type = ADDRESS_REG_IMM;
5697 info->base = op0;
5698 info->offset = op1;
5699 info->const_offset = offset;
5700
5701 return true;
5702 }
5703
5704 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5705 && aarch64_base_register_rtx_p (op0, strict_p)
5706 && poly_int_rtx_p (op1, &offset))
5707 {
5708 info->type = ADDRESS_REG_IMM;
5709 info->base = op0;
5710 info->offset = op1;
5711 info->const_offset = offset;
5712
5713 /* TImode and TFmode values are allowed in both pairs of X
5714 registers and individual Q registers. The available
5715 address modes are:
5716 X,X: 7-bit signed scaled offset
5717 Q: 9-bit signed offset
5718 We conservatively require an offset representable in either mode.
5719 When performing the check for pairs of X registers i.e. LDP/STP
5720 pass down DImode since that is the natural size of the LDP/STP
5721 instruction memory accesses. */
5722 if (mode == TImode || mode == TFmode)
5723 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5724 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5725 || offset_12bit_unsigned_scaled_p (mode, offset)));
5726
5727 /* A 7-bit offset check because OImode will emit an ldp/stp
5728 instruction (only big endian will get here).
5729 For ldp/stp instructions, the offset is scaled for the size of a
5730 single element of the pair. */
5731 if (mode == OImode)
5732 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5733
5734 /* Three 9/12-bit offset checks because CImode will emit three
5735 ldr/str instructions (only big endian will get here). */
5736 if (mode == CImode)
5737 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5738 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
5739 offset + 32)
5740 || offset_12bit_unsigned_scaled_p (V16QImode,
5741 offset + 32)));
5742
5743 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5744 instructions (only big endian will get here). */
5745 if (mode == XImode)
5746 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5747 && aarch64_offset_7bit_signed_scaled_p (TImode,
5748 offset + 32));
5749
5750 /* Make "m" use the LD1 offset range for SVE data modes, so
5751 that pre-RTL optimizers like ivopts will optimize for that range
5752 instead of the wider LDR/STR range. */
5753 if (vec_flags == VEC_SVE_DATA)
5754 return (type == ADDR_QUERY_M
5755 ? offset_4bit_signed_scaled_p (mode, offset)
5756 : offset_9bit_signed_scaled_p (mode, offset));
5757
5758 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5759 {
5760 poly_int64 end_offset = (offset
5761 + GET_MODE_SIZE (mode)
5762 - BYTES_PER_SVE_VECTOR);
5763 return (type == ADDR_QUERY_M
5764 ? offset_4bit_signed_scaled_p (mode, offset)
5765 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5766 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5767 end_offset)));
5768 }
5769
5770 if (vec_flags == VEC_SVE_PRED)
5771 return offset_9bit_signed_scaled_p (mode, offset);
5772
5773 if (load_store_pair_p)
5774 return ((known_eq (GET_MODE_SIZE (mode), 4)
5775 || known_eq (GET_MODE_SIZE (mode), 8)
5776 || known_eq (GET_MODE_SIZE (mode), 16))
5777 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5778 else
5779 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5780 || offset_12bit_unsigned_scaled_p (mode, offset));
5781 }
5782
5783 if (allow_reg_index_p)
5784 {
5785 /* Look for base + (scaled/extended) index register. */
5786 if (aarch64_base_register_rtx_p (op0, strict_p)
5787 && aarch64_classify_index (info, op1, mode, strict_p))
5788 {
5789 info->base = op0;
5790 return true;
5791 }
5792 if (aarch64_base_register_rtx_p (op1, strict_p)
5793 && aarch64_classify_index (info, op0, mode, strict_p))
5794 {
5795 info->base = op1;
5796 return true;
5797 }
5798 }
5799
5800 return false;
5801
5802 case POST_INC:
5803 case POST_DEC:
5804 case PRE_INC:
5805 case PRE_DEC:
5806 info->type = ADDRESS_REG_WB;
5807 info->base = XEXP (x, 0);
5808 info->offset = NULL_RTX;
5809 return aarch64_base_register_rtx_p (info->base, strict_p);
5810
5811 case POST_MODIFY:
5812 case PRE_MODIFY:
5813 info->type = ADDRESS_REG_WB;
5814 info->base = XEXP (x, 0);
5815 if (GET_CODE (XEXP (x, 1)) == PLUS
5816 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5817 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5818 && aarch64_base_register_rtx_p (info->base, strict_p))
5819 {
5820 info->offset = XEXP (XEXP (x, 1), 1);
5821 info->const_offset = offset;
5822
5823 /* TImode and TFmode values are allowed in both pairs of X
5824 registers and individual Q registers. The available
5825 address modes are:
5826 X,X: 7-bit signed scaled offset
5827 Q: 9-bit signed offset
5828 We conservatively require an offset representable in either mode.
5829 */
5830 if (mode == TImode || mode == TFmode)
5831 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5832 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
5833
5834 if (load_store_pair_p)
5835 return ((known_eq (GET_MODE_SIZE (mode), 4)
5836 || known_eq (GET_MODE_SIZE (mode), 8)
5837 || known_eq (GET_MODE_SIZE (mode), 16))
5838 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5839 else
5840 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
5841 }
5842 return false;
5843
5844 case CONST:
5845 case SYMBOL_REF:
5846 case LABEL_REF:
5847 /* Load literal: PC-relative constant pool entry. Only supported
5848 for SImode or larger. */
5849 info->type = ADDRESS_SYMBOLIC;
5850
5851 if (!load_store_pair_p
5852 && GET_MODE_SIZE (mode).is_constant (&const_size)
5853 && const_size >= 4)
5854 {
5855 rtx sym, addend;
5856
5857 split_const (x, &sym, &addend);
5858 return ((GET_CODE (sym) == LABEL_REF
5859 || (GET_CODE (sym) == SYMBOL_REF
5860 && CONSTANT_POOL_ADDRESS_P (sym)
5861 && aarch64_pcrelative_literal_loads)));
5862 }
5863 return false;
5864
5865 case LO_SUM:
5866 info->type = ADDRESS_LO_SUM;
5867 info->base = XEXP (x, 0);
5868 info->offset = XEXP (x, 1);
5869 if (allow_reg_index_p
5870 && aarch64_base_register_rtx_p (info->base, strict_p))
5871 {
5872 rtx sym, offs;
5873 split_const (info->offset, &sym, &offs);
5874 if (GET_CODE (sym) == SYMBOL_REF
5875 && (aarch64_classify_symbol (sym, INTVAL (offs))
5876 == SYMBOL_SMALL_ABSOLUTE))
5877 {
5878 /* The symbol and offset must be aligned to the access size. */
5879 unsigned int align;
5880
5881 if (CONSTANT_POOL_ADDRESS_P (sym))
5882 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5883 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5884 {
5885 tree exp = SYMBOL_REF_DECL (sym);
5886 align = TYPE_ALIGN (TREE_TYPE (exp));
5887 align = aarch64_constant_alignment (exp, align);
5888 }
5889 else if (SYMBOL_REF_DECL (sym))
5890 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5891 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5892 && SYMBOL_REF_BLOCK (sym) != NULL)
5893 align = SYMBOL_REF_BLOCK (sym)->alignment;
5894 else
5895 align = BITS_PER_UNIT;
5896
5897 poly_int64 ref_size = GET_MODE_SIZE (mode);
5898 if (known_eq (ref_size, 0))
5899 ref_size = GET_MODE_SIZE (DImode);
5900
5901 return (multiple_p (INTVAL (offs), ref_size)
5902 && multiple_p (align / BITS_PER_UNIT, ref_size));
5903 }
5904 }
5905 return false;
5906
5907 default:
5908 return false;
5909 }
5910 }
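/* For illustration, some DImode addresses and their classifications:
   [x0]              -> ADDRESS_REG_IMM, const_offset 0
   [x0, #264]        -> ADDRESS_REG_IMM, const_offset 264
   [x0, x1, lsl #3]  -> ADDRESS_REG_REG, shift 3
   [x0, w1, sxtw #3] -> ADDRESS_REG_SXTW, shift 3
   [x0], #8          -> ADDRESS_REG_WB (post-increment)
   and a LO_SUM of a base register and a small-absolute symbol gives
   ADDRESS_LO_SUM when the symbol is suitably aligned.  */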
5911
5912 /* Return true if the address X is valid for a PRFM instruction.
5913 STRICT_P is true if we should do strict checking with
5914 aarch64_classify_address. */
5915
5916 bool
5917 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5918 {
5919 struct aarch64_address_info addr;
5920
5921 /* PRFM accepts the same addresses as DImode... */
5922 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5923 if (!res)
5924 return false;
5925
5926 /* ... except writeback forms. */
5927 return addr.type != ADDRESS_REG_WB;
5928 }
5929
5930 bool
5931 aarch64_symbolic_address_p (rtx x)
5932 {
5933 rtx offset;
5934
5935 split_const (x, &x, &offset);
5936 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5937 }
5938
5939 /* Classify the base of symbolic expression X. */
5940
5941 enum aarch64_symbol_type
5942 aarch64_classify_symbolic_expression (rtx x)
5943 {
5944 rtx offset;
5945
5946 split_const (x, &x, &offset);
5947 return aarch64_classify_symbol (x, INTVAL (offset));
5948 }
5949
5950
5951 /* Return TRUE if X is a legitimate address for accessing memory in
5952 mode MODE. */
5953 static bool
5954 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5955 {
5956 struct aarch64_address_info addr;
5957
5958 return aarch64_classify_address (&addr, x, mode, strict_p);
5959 }
5960
5961 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5962 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5963 bool
5964 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5965 aarch64_addr_query_type type)
5966 {
5967 struct aarch64_address_info addr;
5968
5969 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5970 }
5971
5972 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5973
5974 static bool
5975 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5976 poly_int64 orig_offset,
5977 machine_mode mode)
5978 {
5979 HOST_WIDE_INT size;
5980 if (GET_MODE_SIZE (mode).is_constant (&size))
5981 {
5982 HOST_WIDE_INT const_offset, second_offset;
5983
5984 /* A general SVE offset is A * VQ + B. Remove the A component from
5985 coefficient 0 in order to get the constant B. */
5986 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5987
5988 /* Split an out-of-range address displacement into a base and
5989 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
5990 range otherwise to increase opportunities for sharing the base
5991 address of different sizes. Unaligned accesses use the signed
5992 9-bit range, TImode/TFmode use the intersection of signed
5993 scaled 7-bit and signed 9-bit offset. */
5994 if (mode == TImode || mode == TFmode)
5995 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5996 else if ((const_offset & (size - 1)) != 0)
5997 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5998 else
5999 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6000
6001 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6002 return false;
6003
6004 /* Split the offset into second_offset and the rest. */
6005 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6006 *offset2 = gen_int_mode (second_offset, Pmode);
6007 return true;
6008 }
6009 else
6010 {
6011 /* Get the mode we should use as the basis of the range. For structure
6012 modes this is the mode of one vector. */
6013 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6014 machine_mode step_mode
6015 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6016
6017 /* Get the "mul vl" multiplier we'd like to use. */
6018 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6019 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6020 if (vec_flags & VEC_SVE_DATA)
6021 /* LDR supports a 9-bit range, but the move patterns for
6022 structure modes require all vectors to be in range of the
6023 same base. The simplest way of accommodating that while still
6024 promoting reuse of anchor points between different modes is
6025 to use an 8-bit range unconditionally. */
6026 vnum = ((vnum + 128) & 255) - 128;
6027 else
6028 /* Predicates are only handled singly, so we might as well use
6029 the full range. */
6030 vnum = ((vnum + 256) & 511) - 256;
6031 if (vnum == 0)
6032 return false;
6033
6034 /* Convert the "mul vl" multiplier into a byte offset. */
6035 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6036 if (known_eq (second_offset, orig_offset))
6037 return false;
6038
6039 /* Split the offset into second_offset and the rest. */
6040 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6041 *offset2 = gen_int_mode (second_offset, Pmode);
6042 return true;
6043 }
6044 }
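/* For illustration: for a DImode access at constant offset 0x10008 the
   offset is aligned, so second_offset = 0x10008 & 0x3ffc = 0x8; the
   split then returns 0x10000 in *OFFSET1 and 8 in *OFFSET2, allowing
   nearby accesses to share the anchor base + 0x10000.  */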
6045
6046 /* Return the binary representation of floating point constant VALUE in INTVAL.
6047 If the value cannot be converted, return false without setting INTVAL.
6048 The conversion is done using the mode of VALUE. */
6049 bool
6050 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6051 {
6052
6053 /* We make a general exception for 0. */
6054 if (aarch64_float_const_zero_rtx_p (value))
6055 {
6056 *intval = 0;
6057 return true;
6058 }
6059
6060 scalar_float_mode mode;
6061 if (GET_CODE (value) != CONST_DOUBLE
6062 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6063 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6064 /* Only support up to DF mode. */
6065 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6066 return false;
6067
6068 unsigned HOST_WIDE_INT ival = 0;
6069
6070 long res[2];
6071 real_to_target (res,
6072 CONST_DOUBLE_REAL_VALUE (value),
6073 REAL_MODE_FORMAT (mode));
6074
6075 if (mode == DFmode)
6076 {
6077 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6078 ival = zext_hwi (res[order], 32);
6079 ival |= (zext_hwi (res[1 - order], 32) << 32);
6080 }
6081 else
6082 ival = zext_hwi (res[0], 32);
6083
6084 *intval = ival;
6085 return true;
6086 }
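/* For illustration: the DFmode constant 1.0 gives
   *INTVAL == 0x3ff0000000000000 and the SFmode constant 1.0 gives
   *INTVAL == 0x3f800000, i.e. the IEEE 754 bit patterns.  */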
6087
6088 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6089 single MOV(+MOVK) followed by an FMOV. */
6090 bool
6091 aarch64_float_const_rtx_p (rtx x)
6092 {
6093 machine_mode mode = GET_MODE (x);
6094 if (mode == VOIDmode)
6095 return false;
6096
6097 /* Determine whether it's cheaper to write float constants as
6098 mov/movk pairs rather than ldr/adrp pairs. */
6099 unsigned HOST_WIDE_INT ival;
6100
6101 if (GET_CODE (x) == CONST_DOUBLE
6102 && SCALAR_FLOAT_MODE_P (mode)
6103 && aarch64_reinterpret_float_as_int (x, &ival))
6104 {
6105 scalar_int_mode imode = (mode == HFmode
6106 ? SImode
6107 : int_mode_for_mode (mode).require ());
6108 int num_instr = aarch64_internal_mov_immediate
6109 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6110 return num_instr < 3;
6111 }
6112
6113 return false;
6114 }
6115
6116 /* Return TRUE if rtx X is the immediate constant 0.0. */
6117 bool
6118 aarch64_float_const_zero_rtx_p (rtx x)
6119 {
6120 if (GET_MODE (x) == VOIDmode)
6121 return false;
6122
6123 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6124 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6125 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6126 }
6127
6128 /* Return TRUE if rtx X is an immediate constant that fits in a single
6129 MOVI immediate operation. */
6130 bool
6131 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6132 {
6133 if (!TARGET_SIMD)
6134 return false;
6135
6136 machine_mode vmode;
6137 scalar_int_mode imode;
6138 unsigned HOST_WIDE_INT ival;
6139
6140 if (GET_CODE (x) == CONST_DOUBLE
6141 && SCALAR_FLOAT_MODE_P (mode))
6142 {
6143 if (!aarch64_reinterpret_float_as_int (x, &ival))
6144 return false;
6145
6146 /* We make a general exception for 0. */
6147 if (aarch64_float_const_zero_rtx_p (x))
6148 return true;
6149
6150 imode = int_mode_for_mode (mode).require ();
6151 }
6152 else if (GET_CODE (x) == CONST_INT
6153 && is_a <scalar_int_mode> (mode, &imode))
6154 ival = INTVAL (x);
6155 else
6156 return false;
6157
6158 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
6159 use a 128-bit vector mode. */
6160 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6161
6162 vmode = aarch64_simd_container_mode (imode, width);
6163 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6164
6165 return aarch64_simd_valid_immediate (v_op, NULL);
6166 }
6167
6168
6169 /* Return the fixed registers used for condition codes. */
6170
6171 static bool
6172 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6173 {
6174 *p1 = CC_REGNUM;
6175 *p2 = INVALID_REGNUM;
6176 return true;
6177 }
6178
6179 /* This function is used by the call expanders of the machine description.
6180 RESULT is the register in which the result is returned. It's NULL for
6181 "call" and "sibcall".
6182 MEM is the location of the function call.
6183 SIBCALL indicates whether this function call is a normal call or a sibling
6184 call; a different pattern is generated accordingly. */
6185
6186 void
6187 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6188 {
6189 rtx call, callee, tmp;
6190 rtvec vec;
6191 machine_mode mode;
6192
6193 gcc_assert (MEM_P (mem));
6194 callee = XEXP (mem, 0);
6195 mode = GET_MODE (callee);
6196 gcc_assert (mode == Pmode);
6197
6198 /* Decide if we should generate indirect calls by loading the
6199 address of the callee into a register before performing
6200 the branch-and-link. */
6201 if (SYMBOL_REF_P (callee)
6202 ? (aarch64_is_long_call_p (callee)
6203 || aarch64_is_noplt_call_p (callee))
6204 : !REG_P (callee))
6205 XEXP (mem, 0) = force_reg (mode, callee);
6206
6207 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6208
6209 if (result != NULL_RTX)
6210 call = gen_rtx_SET (result, call);
6211
6212 if (sibcall)
6213 tmp = ret_rtx;
6214 else
6215 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6216
6217 vec = gen_rtvec (2, call, tmp);
6218 call = gen_rtx_PARALLEL (VOIDmode, vec);
6219
6220 aarch64_emit_call_insn (call);
6221 }
6222
6223 /* Emit call insn with PAT and do aarch64-specific handling. */
6224
6225 void
6226 aarch64_emit_call_insn (rtx pat)
6227 {
6228 rtx insn = emit_call_insn (pat);
6229
6230 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6231 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6232 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6233 }
6234
6235 machine_mode
6236 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6237 {
6238 /* Floating-point comparisons return CCFPmode for quiet comparisons (equality,
6239 ORDERED/UNORDERED and the UN* forms), and CCFPEmode for signalling ones. */
6240 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6241 {
6242 switch (code)
6243 {
6244 case EQ:
6245 case NE:
6246 case UNORDERED:
6247 case ORDERED:
6248 case UNLT:
6249 case UNLE:
6250 case UNGT:
6251 case UNGE:
6252 case UNEQ:
6253 return CCFPmode;
6254
6255 case LT:
6256 case LE:
6257 case GT:
6258 case GE:
6259 case LTGT:
6260 return CCFPEmode;
6261
6262 default:
6263 gcc_unreachable ();
6264 }
6265 }
6266
6267 /* Equality comparisons of short modes against zero can be performed
6268 using the TST instruction with the appropriate bitmask. */
6269 if (y == const0_rtx && REG_P (x)
6270 && (code == EQ || code == NE)
6271 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6272 return CC_NZmode;
6273
6274 /* Similarly, comparisons of zero_extends from shorter modes can
6275 be performed using an ANDS with an immediate mask. */
6276 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6277 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6278 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6279 && (code == EQ || code == NE))
6280 return CC_NZmode;
6281
6282 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6283 && y == const0_rtx
6284 && (code == EQ || code == NE || code == LT || code == GE)
6285 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6286 || GET_CODE (x) == NEG
6287 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6288 && CONST_INT_P (XEXP (x, 2)))))
6289 return CC_NZmode;
6290
6291 /* A compare with a shifted operand. Because of canonicalization,
6292 the comparison will have to be swapped when we emit the assembly
6293 code. */
6294 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6295 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6296 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6297 || GET_CODE (x) == LSHIFTRT
6298 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6299 return CC_SWPmode;
6300
6301 /* Similarly for a negated operand, but we can only do this for
6302 equalities. */
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && (REG_P (y) || GET_CODE (y) == SUBREG)
6305 && (code == EQ || code == NE)
6306 && GET_CODE (x) == NEG)
6307 return CC_Zmode;
6308
6309 /* A test for unsigned overflow. */
6310 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6311 && code == NE
6312 && GET_CODE (x) == PLUS
6313 && GET_CODE (y) == ZERO_EXTEND)
6314 return CC_Cmode;
6315
6316 /* A test for signed overflow. */
6317 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6318 && code == NE
6319 && GET_CODE (x) == PLUS
6320 && GET_CODE (y) == SIGN_EXTEND)
6321 return CC_Vmode;
6322
6323 /* For everything else, return CCmode. */
6324 return CCmode;
6325 }
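/* For illustration: comparing (plus:DI x y) against zero for EQ selects
   CC_NZmode, so the flags can be produced by an ADDS instruction, while
   comparing (ashift:DI x (const_int 2)) against a register selects
   CC_SWPmode because the comparison must be emitted with swapped
   operands, e.g. as CMP Xn, Xm, lsl 2.  */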
6326
6327 static int
6328 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6329
6330 int
6331 aarch64_get_condition_code (rtx x)
6332 {
6333 machine_mode mode = GET_MODE (XEXP (x, 0));
6334 enum rtx_code comp_code = GET_CODE (x);
6335
6336 if (GET_MODE_CLASS (mode) != MODE_CC)
6337 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6338 return aarch64_get_condition_code_1 (mode, comp_code);
6339 }
6340
6341 static int
6342 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6343 {
6344 switch (mode)
6345 {
6346 case E_CCFPmode:
6347 case E_CCFPEmode:
6348 switch (comp_code)
6349 {
6350 case GE: return AARCH64_GE;
6351 case GT: return AARCH64_GT;
6352 case LE: return AARCH64_LS;
6353 case LT: return AARCH64_MI;
6354 case NE: return AARCH64_NE;
6355 case EQ: return AARCH64_EQ;
6356 case ORDERED: return AARCH64_VC;
6357 case UNORDERED: return AARCH64_VS;
6358 case UNLT: return AARCH64_LT;
6359 case UNLE: return AARCH64_LE;
6360 case UNGT: return AARCH64_HI;
6361 case UNGE: return AARCH64_PL;
6362 default: return -1;
6363 }
6364 break;
6365
6366 case E_CCmode:
6367 switch (comp_code)
6368 {
6369 case NE: return AARCH64_NE;
6370 case EQ: return AARCH64_EQ;
6371 case GE: return AARCH64_GE;
6372 case GT: return AARCH64_GT;
6373 case LE: return AARCH64_LE;
6374 case LT: return AARCH64_LT;
6375 case GEU: return AARCH64_CS;
6376 case GTU: return AARCH64_HI;
6377 case LEU: return AARCH64_LS;
6378 case LTU: return AARCH64_CC;
6379 default: return -1;
6380 }
6381 break;
6382
6383 case E_CC_SWPmode:
6384 switch (comp_code)
6385 {
6386 case NE: return AARCH64_NE;
6387 case EQ: return AARCH64_EQ;
6388 case GE: return AARCH64_LE;
6389 case GT: return AARCH64_LT;
6390 case LE: return AARCH64_GE;
6391 case LT: return AARCH64_GT;
6392 case GEU: return AARCH64_LS;
6393 case GTU: return AARCH64_CC;
6394 case LEU: return AARCH64_CS;
6395 case LTU: return AARCH64_HI;
6396 default: return -1;
6397 }
6398 break;
6399
6400 case E_CC_NZmode:
6401 switch (comp_code)
6402 {
6403 case NE: return AARCH64_NE;
6404 case EQ: return AARCH64_EQ;
6405 case GE: return AARCH64_PL;
6406 case LT: return AARCH64_MI;
6407 default: return -1;
6408 }
6409 break;
6410
6411 case E_CC_Zmode:
6412 switch (comp_code)
6413 {
6414 case NE: return AARCH64_NE;
6415 case EQ: return AARCH64_EQ;
6416 default: return -1;
6417 }
6418 break;
6419
6420 case E_CC_Cmode:
6421 switch (comp_code)
6422 {
6423 case NE: return AARCH64_CS;
6424 case EQ: return AARCH64_CC;
6425 default: return -1;
6426 }
6427 break;
6428
6429 case E_CC_Vmode:
6430 switch (comp_code)
6431 {
6432 case NE: return AARCH64_VS;
6433 case EQ: return AARCH64_VC;
6434 default: return -1;
6435 }
6436 break;
6437
6438 default:
6439 return -1;
6440 }
6441
6442 return -1;
6443 }
6444
6445 bool
6446 aarch64_const_vec_all_same_in_range_p (rtx x,
6447 HOST_WIDE_INT minval,
6448 HOST_WIDE_INT maxval)
6449 {
6450 rtx elt;
6451 return (const_vec_duplicate_p (x, &elt)
6452 && CONST_INT_P (elt)
6453 && IN_RANGE (INTVAL (elt), minval, maxval));
6454 }
6455
6456 bool
6457 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6458 {
6459 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6460 }
6461
6462 /* Return true if VEC is a constant in which every element is in the range
6463 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6464
6465 static bool
6466 aarch64_const_vec_all_in_range_p (rtx vec,
6467 HOST_WIDE_INT minval,
6468 HOST_WIDE_INT maxval)
6469 {
6470 if (GET_CODE (vec) != CONST_VECTOR
6471 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6472 return false;
6473
6474 int nunits;
6475 if (!CONST_VECTOR_STEPPED_P (vec))
6476 nunits = const_vector_encoded_nelts (vec);
6477 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6478 return false;
6479
6480 for (int i = 0; i < nunits; i++)
6481 {
6482 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6483 if (!CONST_INT_P (vec_elem)
6484 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6485 return false;
6486 }
6487 return true;
6488 }
6489
6490 /* N Z C V. */
6491 #define AARCH64_CC_V 1
6492 #define AARCH64_CC_C (1 << 1)
6493 #define AARCH64_CC_Z (1 << 2)
6494 #define AARCH64_CC_N (1 << 3)
6495
6496 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6497 static const int aarch64_nzcv_codes[] =
6498 {
6499 0, /* EQ, Z == 1. */
6500 AARCH64_CC_Z, /* NE, Z == 0. */
6501 0, /* CS, C == 1. */
6502 AARCH64_CC_C, /* CC, C == 0. */
6503 0, /* MI, N == 1. */
6504 AARCH64_CC_N, /* PL, N == 0. */
6505 0, /* VS, V == 1. */
6506 AARCH64_CC_V, /* VC, V == 0. */
6507 0, /* HI, C == 1 && Z == 0. */
6508 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6509 AARCH64_CC_V, /* GE, N == V. */
6510 0, /* LT, N != V. */
6511 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6512 0, /* LE, !(Z == 0 && N == V). */
6513 0, /* AL, Any. */
6514 0 /* NV, Any. */
6515 };
6516
6517 /* Print floating-point vector immediate operand X to F, negating it
6518 first if NEGATE is true. Return true on success, false if it isn't
6519 a constant we can handle. */
6520
6521 static bool
6522 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6523 {
6524 rtx elt;
6525
6526 if (!const_vec_duplicate_p (x, &elt))
6527 return false;
6528
6529 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6530 if (negate)
6531 r = real_value_negate (&r);
6532
6533 /* We only handle the SVE single-bit immediates here. */
6534 if (real_equal (&r, &dconst0))
6535 asm_fprintf (f, "0.0");
6536 else if (real_equal (&r, &dconst1))
6537 asm_fprintf (f, "1.0");
6538 else if (real_equal (&r, &dconsthalf))
6539 asm_fprintf (f, "0.5");
6540 else
6541 return false;
6542
6543 return true;
6544 }
6545
6546 /* Return the equivalent letter for size. */
6547 static char
6548 sizetochar (int size)
6549 {
6550 switch (size)
6551 {
6552 case 64: return 'd';
6553 case 32: return 's';
6554 case 16: return 'h';
6555 case 8 : return 'b';
6556 default: gcc_unreachable ();
6557 }
6558 }
6559
6560 /* Print operand X to file F in a target specific manner according to CODE.
6561 The acceptable formatting commands given by CODE are:
6562 'c': An integer or symbol address without a preceding #
6563 sign.
6564 'C': Take the duplicated element in a vector constant
6565 and print it in hex.
6566 'D': Take the duplicated element in a vector constant
6567 and print it as an unsigned integer, in decimal.
6568 'e': Print the sign/zero-extend size as a character 8->b,
6569 16->h, 32->w.
6570 'p': Prints N such that 2^N == X (X must be a power of 2 and a
6571 const_int).
6572 'P': Print the number of non-zero bits in X (a const_int).
6573 'H': Print the higher numbered register of a pair (TImode)
6574 of regs.
6575 'm': Print a condition (eq, ne, etc).
6576 'M': Same as 'm', but invert condition.
6577 'N': Take the duplicated element in a vector constant
6578 and print the negative of it in decimal.
6579 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6580 'S/T/U/V': Print a FP/SIMD register name for a register list.
6581 The register printed is the FP/SIMD register name
6582 of X + 0/1/2/3 for S/T/U/V.
6583 'R': Print a scalar FP/SIMD register name + 1.
6584 'X': Print bottom 16 bits of integer constant in hex.
6585 'w/x': Print a general register name or the zero register
6586 (32-bit or 64-bit).
6587 '0': Print a normal operand, if it's a general register,
6588 then we assume DImode.
6589 'k': Print NZCV for conditional compare instructions.
6590 'A': Output address constant representing the first
6591 argument of X, specifying a relocation offset
6592 if appropriate.
6593 'L': Output constant address specified by X
6594 with a relocation offset if appropriate.
6595 'G': Prints address of X, specifying a PC relative
6596 relocation mode if appropriate.
6597 'y': Output address of LDP or STP - this is used for
6598 some LDP/STPs which don't use a PARALLEL in their
6599 pattern (so the mode needs to be adjusted).
6600 'z': Output address of a typical LDP or STP. */
6601
6602 static void
6603 aarch64_print_operand (FILE *f, rtx x, int code)
6604 {
6605 rtx elt;
6606 switch (code)
6607 {
6608 case 'c':
6609 switch (GET_CODE (x))
6610 {
6611 case CONST_INT:
6612 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6613 break;
6614
6615 case SYMBOL_REF:
6616 output_addr_const (f, x);
6617 break;
6618
6619 case CONST:
6620 if (GET_CODE (XEXP (x, 0)) == PLUS
6621 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6622 {
6623 output_addr_const (f, x);
6624 break;
6625 }
6626 /* Fall through. */
6627
6628 default:
6629 output_operand_lossage ("unsupported operand for code '%c'", code);
6630 }
6631 break;
6632
6633 case 'e':
6634 {
6635 int n;
6636
6637 if (!CONST_INT_P (x)
6638 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6639 {
6640 output_operand_lossage ("invalid operand for '%%%c'", code);
6641 return;
6642 }
6643
6644 switch (n)
6645 {
6646 case 3:
6647 fputc ('b', f);
6648 break;
6649 case 4:
6650 fputc ('h', f);
6651 break;
6652 case 5:
6653 fputc ('w', f);
6654 break;
6655 default:
6656 output_operand_lossage ("invalid operand for '%%%c'", code);
6657 return;
6658 }
6659 }
6660 break;
6661
6662 case 'p':
6663 {
6664 int n;
6665
6666 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6667 {
6668 output_operand_lossage ("invalid operand for '%%%c'", code);
6669 return;
6670 }
6671
6672 asm_fprintf (f, "%d", n);
6673 }
6674 break;
6675
6676 case 'P':
6677 if (!CONST_INT_P (x))
6678 {
6679 output_operand_lossage ("invalid operand for '%%%c'", code);
6680 return;
6681 }
6682
6683 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6684 break;
6685
6686 case 'H':
6687 if (x == const0_rtx)
6688 {
6689 asm_fprintf (f, "xzr");
6690 break;
6691 }
6692
6693 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6694 {
6695 output_operand_lossage ("invalid operand for '%%%c'", code);
6696 return;
6697 }
6698
6699 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6700 break;
6701
6702 case 'M':
6703 case 'm':
6704 {
6705 int cond_code;
6706 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6707 if (x == const_true_rtx)
6708 {
6709 if (code == 'M')
6710 fputs ("nv", f);
6711 return;
6712 }
6713
6714 if (!COMPARISON_P (x))
6715 {
6716 output_operand_lossage ("invalid operand for '%%%c'", code);
6717 return;
6718 }
6719
6720 cond_code = aarch64_get_condition_code (x);
6721 gcc_assert (cond_code >= 0);
6722 if (code == 'M')
6723 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6724 fputs (aarch64_condition_codes[cond_code], f);
6725 }
6726 break;
6727
6728 case 'N':
6729 if (!const_vec_duplicate_p (x, &elt))
6730 {
6731 output_operand_lossage ("invalid vector constant");
6732 return;
6733 }
6734
6735 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6736 asm_fprintf (f, "%wd", -INTVAL (elt));
6737 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6738 && aarch64_print_vector_float_operand (f, x, true))
6739 ;
6740 else
6741 {
6742 output_operand_lossage ("invalid vector constant");
6743 return;
6744 }
6745 break;
6746
6747 case 'b':
6748 case 'h':
6749 case 's':
6750 case 'd':
6751 case 'q':
6752 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6753 {
6754 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6755 return;
6756 }
6757 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6758 break;
6759
6760 case 'S':
6761 case 'T':
6762 case 'U':
6763 case 'V':
6764 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6765 {
6766 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6767 return;
6768 }
6769 asm_fprintf (f, "%c%d",
6770 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6771 REGNO (x) - V0_REGNUM + (code - 'S'));
6772 break;
6773
6774 case 'R':
6775 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6776 {
6777 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6778 return;
6779 }
6780 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6781 break;
6782
6783 case 'X':
6784 if (!CONST_INT_P (x))
6785 {
6786 output_operand_lossage ("invalid operand for '%%%c'", code);
6787 return;
6788 }
6789 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6790 break;
6791
6792 case 'C':
6793 {
6794 /* Print a replicated constant in hex. */
6795 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6796 {
6797 output_operand_lossage ("invalid operand for '%%%c'", code);
6798 return;
6799 }
6800 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6801 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6802 }
6803 break;
6804
6805 case 'D':
6806 {
6807 /* Print a replicated constant in decimal, treating it as
6808 unsigned. */
6809 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 {
6811 output_operand_lossage ("invalid operand for '%%%c'", code);
6812 return;
6813 }
6814 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6815 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816 }
6817 break;
6818
6819 case 'w':
6820 case 'x':
6821 if (x == const0_rtx
6822 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6823 {
6824 asm_fprintf (f, "%czr", code);
6825 break;
6826 }
6827
6828 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6829 {
6830 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6831 break;
6832 }
6833
6834 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6835 {
6836 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6837 break;
6838 }
6839
6840 /* Fall through */
6841
6842 case 0:
6843 if (x == NULL)
6844 {
6845 output_operand_lossage ("missing operand");
6846 return;
6847 }
6848
6849 switch (GET_CODE (x))
6850 {
6851 case REG:
6852 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6853 {
6854 if (REG_NREGS (x) == 1)
6855 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6856 else
6857 {
6858 char suffix
6859 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6860 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6861 REGNO (x) - V0_REGNUM, suffix,
6862 END_REGNO (x) - V0_REGNUM - 1, suffix);
6863 }
6864 }
6865 else
6866 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6867 break;
6868
6869 case MEM:
6870 output_address (GET_MODE (x), XEXP (x, 0));
6871 break;
6872
6873 case LABEL_REF:
6874 case SYMBOL_REF:
6875 output_addr_const (asm_out_file, x);
6876 break;
6877
6878 case CONST_INT:
6879 asm_fprintf (f, "%wd", INTVAL (x));
6880 break;
6881
6882 case CONST:
6883 if (!VECTOR_MODE_P (GET_MODE (x)))
6884 {
6885 output_addr_const (asm_out_file, x);
6886 break;
6887 }
6888 /* fall through */
6889
6890 case CONST_VECTOR:
6891 if (!const_vec_duplicate_p (x, &elt))
6892 {
6893 output_operand_lossage ("invalid vector constant");
6894 return;
6895 }
6896
6897 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6898 asm_fprintf (f, "%wd", INTVAL (elt));
6899 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6900 && aarch64_print_vector_float_operand (f, x, false))
6901 ;
6902 else
6903 {
6904 output_operand_lossage ("invalid vector constant");
6905 return;
6906 }
6907 break;
6908
6909 case CONST_DOUBLE:
6910 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6911 be getting CONST_DOUBLEs holding integers. */
6912 gcc_assert (GET_MODE (x) != VOIDmode);
6913 if (aarch64_float_const_zero_rtx_p (x))
6914 {
6915 fputc ('0', f);
6916 break;
6917 }
6918 else if (aarch64_float_const_representable_p (x))
6919 {
6920 #define buf_size 20
6921 char float_buf[buf_size] = {'\0'};
6922 real_to_decimal_for_mode (float_buf,
6923 CONST_DOUBLE_REAL_VALUE (x),
6924 buf_size, buf_size,
6925 1, GET_MODE (x));
6926 asm_fprintf (asm_out_file, "%s", float_buf);
6927 break;
6928 #undef buf_size
6929 }
6930 output_operand_lossage ("invalid constant");
6931 return;
6932 default:
6933 output_operand_lossage ("invalid operand");
6934 return;
6935 }
6936 break;
6937
6938 case 'A':
6939 if (GET_CODE (x) == HIGH)
6940 x = XEXP (x, 0);
6941
6942 switch (aarch64_classify_symbolic_expression (x))
6943 {
6944 case SYMBOL_SMALL_GOT_4G:
6945 asm_fprintf (asm_out_file, ":got:");
6946 break;
6947
6948 case SYMBOL_SMALL_TLSGD:
6949 asm_fprintf (asm_out_file, ":tlsgd:");
6950 break;
6951
6952 case SYMBOL_SMALL_TLSDESC:
6953 asm_fprintf (asm_out_file, ":tlsdesc:");
6954 break;
6955
6956 case SYMBOL_SMALL_TLSIE:
6957 asm_fprintf (asm_out_file, ":gottprel:");
6958 break;
6959
6960 case SYMBOL_TLSLE24:
6961 asm_fprintf (asm_out_file, ":tprel:");
6962 break;
6963
6964 case SYMBOL_TINY_GOT:
6965 gcc_unreachable ();
6966 break;
6967
6968 default:
6969 break;
6970 }
6971 output_addr_const (asm_out_file, x);
6972 break;
6973
6974 case 'L':
6975 switch (aarch64_classify_symbolic_expression (x))
6976 {
6977 case SYMBOL_SMALL_GOT_4G:
6978 asm_fprintf (asm_out_file, ":lo12:");
6979 break;
6980
6981 case SYMBOL_SMALL_TLSGD:
6982 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6983 break;
6984
6985 case SYMBOL_SMALL_TLSDESC:
6986 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6987 break;
6988
6989 case SYMBOL_SMALL_TLSIE:
6990 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6991 break;
6992
6993 case SYMBOL_TLSLE12:
6994 asm_fprintf (asm_out_file, ":tprel_lo12:");
6995 break;
6996
6997 case SYMBOL_TLSLE24:
6998 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6999 break;
7000
7001 case SYMBOL_TINY_GOT:
7002 asm_fprintf (asm_out_file, ":got:");
7003 break;
7004
7005 case SYMBOL_TINY_TLSIE:
7006 asm_fprintf (asm_out_file, ":gottprel:");
7007 break;
7008
7009 default:
7010 break;
7011 }
7012 output_addr_const (asm_out_file, x);
7013 break;
7014
7015 case 'G':
7016 switch (aarch64_classify_symbolic_expression (x))
7017 {
7018 case SYMBOL_TLSLE24:
7019 asm_fprintf (asm_out_file, ":tprel_hi12:");
7020 break;
7021 default:
7022 break;
7023 }
7024 output_addr_const (asm_out_file, x);
7025 break;
7026
7027 case 'k':
7028 {
7029 HOST_WIDE_INT cond_code;
7030
7031 if (!CONST_INT_P (x))
7032 {
7033 output_operand_lossage ("invalid operand for '%%%c'", code);
7034 return;
7035 }
7036
7037 cond_code = INTVAL (x);
7038 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7039 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7040 }
7041 break;
7042
7043 case 'y':
7044 case 'z':
7045 {
7046 machine_mode mode = GET_MODE (x);
7047
7048 if (GET_CODE (x) != MEM
7049 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7050 {
7051 output_operand_lossage ("invalid operand for '%%%c'", code);
7052 return;
7053 }
7054
7055 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7056 code == 'y'
7057 ? ADDR_QUERY_LDP_STP_N
7058 : ADDR_QUERY_LDP_STP))
7059 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7060 }
7061 break;
7062
7063 default:
7064 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7065 return;
7066 }
7067 }
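/* For illustration: with operand 0 being (reg:DI x5), "%x0" prints "x5"
   and "%w0" prints "w5"; with operand 1 being (const_int 8), "%p1"
   prints "3" and "%P1" prints "1".  */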
7068
7069 /* Print address 'x' of a memory access with mode 'mode'.
7070 TYPE is the aarch64_addr_query_type context passed to aarch64_classify_address:
7071 ADDR_QUERY_ANY for a normal memory access, or an LDP/STP query for load/store pairs. */
7072 static bool
7073 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7074 aarch64_addr_query_type type)
7075 {
7076 struct aarch64_address_info addr;
7077 unsigned int size;
7078
7079 /* Check that the address is Pmode; this also holds for ILP32. */
7080 if (GET_MODE (x) != Pmode)
7081 output_operand_lossage ("invalid address mode");
7082
7083 if (aarch64_classify_address (&addr, x, mode, true, type))
7084 switch (addr.type)
7085 {
7086 case ADDRESS_REG_IMM:
7087 if (known_eq (addr.const_offset, 0))
7088 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7089 else if (aarch64_sve_data_mode_p (mode))
7090 {
7091 HOST_WIDE_INT vnum
7092 = exact_div (addr.const_offset,
7093 BYTES_PER_SVE_VECTOR).to_constant ();
7094 asm_fprintf (f, "[%s, #%wd, mul vl]",
7095 reg_names[REGNO (addr.base)], vnum);
7096 }
7097 else if (aarch64_sve_pred_mode_p (mode))
7098 {
7099 HOST_WIDE_INT vnum
7100 = exact_div (addr.const_offset,
7101 BYTES_PER_SVE_PRED).to_constant ();
7102 asm_fprintf (f, "[%s, #%wd, mul vl]",
7103 reg_names[REGNO (addr.base)], vnum);
7104 }
7105 else
7106 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7107 INTVAL (addr.offset));
7108 return true;
7109
7110 case ADDRESS_REG_REG:
7111 if (addr.shift == 0)
7112 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7113 reg_names [REGNO (addr.offset)]);
7114 else
7115 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7116 reg_names [REGNO (addr.offset)], addr.shift);
7117 return true;
7118
7119 case ADDRESS_REG_UXTW:
7120 if (addr.shift == 0)
7121 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7122 REGNO (addr.offset) - R0_REGNUM);
7123 else
7124 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7125 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7126 return true;
7127
7128 case ADDRESS_REG_SXTW:
7129 if (addr.shift == 0)
7130 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7131 REGNO (addr.offset) - R0_REGNUM);
7132 else
7133 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7134 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7135 return true;
7136
7137 case ADDRESS_REG_WB:
7138 /* Writeback is only supported for fixed-width modes. */
7139 size = GET_MODE_SIZE (mode).to_constant ();
7140 switch (GET_CODE (x))
7141 {
7142 case PRE_INC:
7143 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7144 return true;
7145 case POST_INC:
7146 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7147 return true;
7148 case PRE_DEC:
7149 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7150 return true;
7151 case POST_DEC:
7152 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7153 return true;
7154 case PRE_MODIFY:
7155 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7156 INTVAL (addr.offset));
7157 return true;
7158 case POST_MODIFY:
7159 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7160 INTVAL (addr.offset));
7161 return true;
7162 default:
7163 break;
7164 }
7165 break;
7166
7167 case ADDRESS_LO_SUM:
7168 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7169 output_addr_const (f, addr.offset);
7170 asm_fprintf (f, "]");
7171 return true;
7172
7173 case ADDRESS_SYMBOLIC:
7174 output_addr_const (f, x);
7175 return true;
7176 }
7177
7178 return false;
7179 }
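
/* Illustrative sketch (not part of the original source): depending on the
   classified address type, the cases above print, for example:

     ADDRESS_REG_IMM            [x0]  or  [x0, 16]
     ADDRESS_REG_IMM (SVE)      [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1]  or  [x0, x1, lsl 3]
     ADDRESS_REG_UXTW / _SXTW   [x0, w1, uxtw 3]  /  [x0, w1, sxtw 3]
     ADDRESS_REG_WB (PRE_INC)   [x0, 8]!      (POST_INC)  [x0], 8
     ADDRESS_LO_SUM             [x0, #:lo12:sym]

   The exact spellings follow the asm_fprintf format strings above; the
   register numbers are placeholders.  */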
7180
7181 /* Print address 'x' of a memory access with mode 'mode'. */
7182 static void
7183 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7184 {
7185 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7186 output_addr_const (f, x);
7187 }
7188
7189 bool
7190 aarch64_label_mentioned_p (rtx x)
7191 {
7192 const char *fmt;
7193 int i;
7194
7195 if (GET_CODE (x) == LABEL_REF)
7196 return true;
7197
7198 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7199 referencing instruction, but they are constant offsets, not
7200 symbols. */
7201 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7202 return false;
7203
7204 fmt = GET_RTX_FORMAT (GET_CODE (x));
7205 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7206 {
7207 if (fmt[i] == 'E')
7208 {
7209 int j;
7210
7211 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7212 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7213 return 1;
7214 }
7215 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7216 return 1;
7217 }
7218
7219 return 0;
7220 }
7221
7222 /* Implement REGNO_REG_CLASS. */
7223
7224 enum reg_class
7225 aarch64_regno_regclass (unsigned regno)
7226 {
7227 if (GP_REGNUM_P (regno))
7228 return GENERAL_REGS;
7229
7230 if (regno == SP_REGNUM)
7231 return STACK_REG;
7232
7233 if (regno == FRAME_POINTER_REGNUM
7234 || regno == ARG_POINTER_REGNUM)
7235 return POINTER_REGS;
7236
7237 if (FP_REGNUM_P (regno))
7238 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7239
7240 if (PR_REGNUM_P (regno))
7241 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7242
7243 return NO_REGS;
7244 }
7245
7246 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7247 If OFFSET is out of range, return an offset of an anchor point
7248 that is in range. Return 0 otherwise. */
7249
7250 static HOST_WIDE_INT
7251 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7252 machine_mode mode)
7253 {
7254 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7255 if (size > 16)
7256 return (offset + 0x400) & ~0x7f0;
7257
7258 /* For offsets that aren't a multiple of the access size, the limit is
7259 -256...255. */
7260 if (offset & (size - 1))
7261 {
7262 /* BLKmode typically uses LDP of X-registers. */
7263 if (mode == BLKmode)
7264 return (offset + 512) & ~0x3ff;
7265 return (offset + 0x100) & ~0x1ff;
7266 }
7267
7268 /* Small negative offsets are supported. */
7269 if (IN_RANGE (offset, -256, 0))
7270 return 0;
7271
7272 if (mode == TImode || mode == TFmode)
7273 return (offset + 0x100) & ~0x1ff;
7274
7275 /* Use a 12-bit offset, scaled by the access size. */
7276 return offset & (~0xfff * size);
7277 }
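
/* Worked example (illustrative, not in the original source): an SImode
   access at offset 0x6234 is aligned (0x6234 & 3 == 0), so the final case
   applies and the anchor is 0x6234 & ~0x3fff == 0x4000.  The caller
   (aarch64_legitimize_address, below) then rebases the address as
   (base + 0x4000) + 0x2234, where 0x2234 fits the scaled unsigned 12-bit
   LDR/STR immediate range (0 .. 0xfff * 4).  A misaligned HImode access
   at offset 0x101 instead gets the anchor (0x101 + 0x100) & ~0x1ff == 0x200,
   leaving a residual offset of -0xff, inside the unscaled -256..255 range.  */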
7278
7279 static rtx
7280 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7281 {
7282 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7283 where mask is selected by alignment and size of the offset.
7284 We try to pick as large a range for the offset as possible to
7285 maximize the chance of a CSE. However, for aligned addresses
7286 we limit the range to 4k so that structures with different sized
7287 elements are likely to use the same base. We need to be careful
7288 not to split a CONST for some forms of address expression, otherwise
7289 it will generate sub-optimal code. */
7290
7291 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7292 {
7293 rtx base = XEXP (x, 0);
7294 rtx offset_rtx = XEXP (x, 1);
7295 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7296
7297 if (GET_CODE (base) == PLUS)
7298 {
7299 rtx op0 = XEXP (base, 0);
7300 rtx op1 = XEXP (base, 1);
7301
7302 /* Force any scaling into a temp for CSE. */
7303 op0 = force_reg (Pmode, op0);
7304 op1 = force_reg (Pmode, op1);
7305
7306 /* Let the pointer register be in op0. */
7307 if (REG_POINTER (op1))
7308 std::swap (op0, op1);
7309
7310 /* If the pointer is virtual or frame related, then we know that
7311 virtual register instantiation or register elimination is going
7312 to apply a second constant. We want the two constants folded
7313 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7314 if (virt_or_elim_regno_p (REGNO (op0)))
7315 {
7316 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7317 NULL_RTX, true, OPTAB_DIRECT);
7318 return gen_rtx_PLUS (Pmode, base, op1);
7319 }
7320
7321 /* Otherwise, in order to encourage CSE (and thence loop strength
7322 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7323 base = expand_binop (Pmode, add_optab, op0, op1,
7324 NULL_RTX, true, OPTAB_DIRECT);
7325 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7326 }
7327
7328 HOST_WIDE_INT size;
7329 if (GET_MODE_SIZE (mode).is_constant (&size))
7330 {
7331 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7332 mode);
7333 if (base_offset != 0)
7334 {
7335 base = plus_constant (Pmode, base, base_offset);
7336 base = force_operand (base, NULL_RTX);
7337 return plus_constant (Pmode, base, offset - base_offset);
7338 }
7339 }
7340 }
7341
7342 return x;
7343 }
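
/* Illustrative example (an assumption about typical RTL, not taken from the
   original source): for (plus (plus (reg F) (reg I)) (const_int 0x6234)) in
   SImode, where F is a virtual or eliminable pointer, the code above emits
   F + 0x6234 into a temporary and returns (plus temp I), so that elimination
   can fold its own constant into the 0x6234.  For two ordinary registers it
   instead computes F + I first and then splits the constant via
   aarch64_anchor_offset, so the returned address is (plus temp2 0x2234) with
   temp2 holding (F + I) + 0x4000.  */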
7344
7345 static reg_class_t
7346 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7347 reg_class_t rclass,
7348 machine_mode mode,
7349 secondary_reload_info *sri)
7350 {
7351 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7352 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7353 comment at the head of aarch64-sve.md for more details about the
7354 big-endian handling. */
7355 if (BYTES_BIG_ENDIAN
7356 && reg_class_subset_p (rclass, FP_REGS)
7357 && !((REG_P (x) && HARD_REGISTER_P (x))
7358 || aarch64_simd_valid_immediate (x, NULL))
7359 && aarch64_sve_data_mode_p (mode))
7360 {
7361 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7362 return NO_REGS;
7363 }
7364
7365 /* If we have to disable direct literal pool loads and stores because the
7366 function is too big, then we need a scratch register. */
7367 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7368 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7369 || targetm.vector_mode_supported_p (GET_MODE (x)))
7370 && !aarch64_pcrelative_literal_loads)
7371 {
7372 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
7373 return NO_REGS;
7374 }
7375
7376 /* Without the TARGET_SIMD instructions we cannot move a Q register
7377 to a Q register directly. We need a scratch. */
7378 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7379 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7380 && reg_class_subset_p (rclass, FP_REGS))
7381 {
7382 sri->icode = code_for_aarch64_reload_mov (mode);
7383 return NO_REGS;
7384 }
7385
7386 /* A TFmode or TImode memory access should be handled via an FP register
7387 because AArch64 has richer addressing modes for LDR/STR instructions
7388 than for LDP/STP instructions. */
7389 if (TARGET_FLOAT && rclass == GENERAL_REGS
7390 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7391 return FP_REGS;
7392
7393 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7394 return GENERAL_REGS;
7395
7396 return NO_REGS;
7397 }
7398
7399 static bool
7400 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7401 {
7402 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7403
7404 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7405 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7406 if (frame_pointer_needed)
7407 return to == HARD_FRAME_POINTER_REGNUM;
7408 return true;
7409 }
7410
7411 poly_int64
7412 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7413 {
7414 if (to == HARD_FRAME_POINTER_REGNUM)
7415 {
7416 if (from == ARG_POINTER_REGNUM)
7417 return cfun->machine->frame.hard_fp_offset;
7418
7419 if (from == FRAME_POINTER_REGNUM)
7420 return cfun->machine->frame.hard_fp_offset
7421 - cfun->machine->frame.locals_offset;
7422 }
7423
7424 if (to == STACK_POINTER_REGNUM)
7425 {
7426 if (from == FRAME_POINTER_REGNUM)
7427 return cfun->machine->frame.frame_size
7428 - cfun->machine->frame.locals_offset;
7429 }
7430
7431 return cfun->machine->frame.frame_size;
7432 }
7433
7434 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7435 previous frame. */
7436
7437 rtx
7438 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7439 {
7440 if (count != 0)
7441 return const0_rtx;
7442 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7443 }
7444
7445
7446 static void
7447 aarch64_asm_trampoline_template (FILE *f)
7448 {
7449 if (TARGET_ILP32)
7450 {
7451 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7452 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7453 }
7454 else
7455 {
7456 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7457 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7458 }
7459 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7460 assemble_aligned_integer (4, const0_rtx);
7461 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7462 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7463 }
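
/* Resulting trampoline layout for LP64 (a sketch inferred from the template
   above and from aarch64_trampoline_init below, not an excerpt of the
   original source); x17 (IP1_REGNUM) and x18 (STATIC_CHAIN_REGNUM) are
   assumptions about the register numbering in aarch64.h:

     offset  0:  ldr  x17, .+16        ; load the target function address
     offset  4:  ldr  x18, .+20        ; load the static chain value
     offset  8:  br   x17
     offset 12:  <4-byte pad>
     offset 16:  <function address>    ; written by aarch64_trampoline_init
     offset 24:  <static chain value>  ; written by aarch64_trampoline_init  */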
7464
7465 static void
7466 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7467 {
7468 rtx fnaddr, mem, a_tramp;
7469 const int tramp_code_sz = 16;
7470
7471 /* Don't need to copy the trailing D-words; we fill those in below. */
7472 emit_block_move (m_tramp, assemble_trampoline_template (),
7473 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7474 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7475 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7476 if (GET_MODE (fnaddr) != ptr_mode)
7477 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7478 emit_move_insn (mem, fnaddr);
7479
7480 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7481 emit_move_insn (mem, chain_value);
7482
7483 /* XXX We should really define a "clear_cache" pattern and use
7484 gen_clear_cache(). */
7485 a_tramp = XEXP (m_tramp, 0);
7486 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7487 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7488 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7489 ptr_mode);
7490 }
7491
7492 static unsigned char
7493 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7494 {
7495 /* ??? Logically we should only need to provide a value when
7496 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7497 can hold MODE, but at the moment we need to handle all modes.
7498 Just ignore any runtime parts for registers that can't store them. */
7499 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7500 unsigned int nregs;
7501 switch (regclass)
7502 {
7503 case TAILCALL_ADDR_REGS:
7504 case POINTER_REGS:
7505 case GENERAL_REGS:
7506 case ALL_REGS:
7507 case POINTER_AND_FP_REGS:
7508 case FP_REGS:
7509 case FP_LO_REGS:
7510 if (aarch64_sve_data_mode_p (mode)
7511 && constant_multiple_p (GET_MODE_SIZE (mode),
7512 BYTES_PER_SVE_VECTOR, &nregs))
7513 return nregs;
7514 return (aarch64_vector_data_mode_p (mode)
7515 ? CEIL (lowest_size, UNITS_PER_VREG)
7516 : CEIL (lowest_size, UNITS_PER_WORD));
7517 case STACK_REG:
7518 case PR_REGS:
7519 case PR_LO_REGS:
7520 case PR_HI_REGS:
7521 return 1;
7522
7523 case NO_REGS:
7524 return 0;
7525
7526 default:
7527 break;
7528 }
7529 gcc_unreachable ();
7530 }
7531
7532 static reg_class_t
7533 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7534 {
7535 if (regclass == POINTER_REGS)
7536 return GENERAL_REGS;
7537
7538 if (regclass == STACK_REG)
7539 {
7540 if (REG_P(x)
7541 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7542 return regclass;
7543
7544 return NO_REGS;
7545 }
7546
7547 /* Register elimination can result in a request for
7548 SP+constant->FP_REGS. We cannot support such operations, which
7549 use SP as the source and an FP_REG as the destination, so reject
7550 them outright. */
7551 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7552 {
7553 rtx lhs = XEXP (x, 0);
7554
7555 /* Look through a possible SUBREG introduced by ILP32. */
7556 if (GET_CODE (lhs) == SUBREG)
7557 lhs = SUBREG_REG (lhs);
7558
7559 gcc_assert (REG_P (lhs));
7560 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7561 POINTER_REGS));
7562 return NO_REGS;
7563 }
7564
7565 return regclass;
7566 }
7567
7568 void
7569 aarch64_asm_output_labelref (FILE* f, const char *name)
7570 {
7571 asm_fprintf (f, "%U%s", name);
7572 }
7573
7574 static void
7575 aarch64_elf_asm_constructor (rtx symbol, int priority)
7576 {
7577 if (priority == DEFAULT_INIT_PRIORITY)
7578 default_ctor_section_asm_out_constructor (symbol, priority);
7579 else
7580 {
7581 section *s;
7582 /* While the priority is known to be in the range [0, 65535], and so
7583 18 bytes would be enough, the compiler might not know that. To avoid
7584 a -Wformat-truncation false positive, use a larger size. */
7585 char buf[23];
7586 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7587 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7588 switch_to_section (s);
7589 assemble_align (POINTER_SIZE);
7590 assemble_aligned_integer (POINTER_BYTES, symbol);
7591 }
7592 }
7593
7594 static void
7595 aarch64_elf_asm_destructor (rtx symbol, int priority)
7596 {
7597 if (priority == DEFAULT_INIT_PRIORITY)
7598 default_dtor_section_asm_out_destructor (symbol, priority);
7599 else
7600 {
7601 section *s;
7602 /* While the priority is known to be in the range [0, 65535], and so
7603 18 bytes would be enough, the compiler might not know that. To avoid
7604 a -Wformat-truncation false positive, use a larger size. */
7605 char buf[23];
7606 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7607 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7608 switch_to_section (s);
7609 assemble_align (POINTER_SIZE);
7610 assemble_aligned_integer (POINTER_BYTES, symbol);
7611 }
7612 }
7613
7614 const char*
7615 aarch64_output_casesi (rtx *operands)
7616 {
7617 char buf[100];
7618 char label[100];
7619 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7620 int index;
7621 static const char *const patterns[4][2] =
7622 {
7623 {
7624 "ldrb\t%w3, [%0,%w1,uxtw]",
7625 "add\t%3, %4, %w3, sxtb #2"
7626 },
7627 {
7628 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7629 "add\t%3, %4, %w3, sxth #2"
7630 },
7631 {
7632 "ldr\t%w3, [%0,%w1,uxtw #2]",
7633 "add\t%3, %4, %w3, sxtw #2"
7634 },
7635 /* We assume that DImode is only generated when not optimizing and
7636 that we don't really need 64-bit address offsets. That would
7637 imply an object file with 8GB of code in a single function! */
7638 {
7639 "ldr\t%w3, [%0,%w1,uxtw #2]",
7640 "add\t%3, %4, %w3, sxtw #2"
7641 }
7642 };
7643
7644 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7645
7646 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7647 index = exact_log2 (GET_MODE_SIZE (mode));
7648
7649 gcc_assert (index >= 0 && index <= 3);
7650
7651 /* Need to implement table size reduction, by changing the code below. */
7652 output_asm_insn (patterns[index][0], operands);
7653 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7654 snprintf (buf, sizeof (buf),
7655 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7656 output_asm_insn (buf, operands);
7657 output_asm_insn (patterns[index][1], operands);
7658 output_asm_insn ("br\t%3", operands);
7659 assemble_label (asm_out_file, label);
7660 return "";
7661 }
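
/* Illustrative emitted sequence for a HImode dispatch table (index 1 above);
   the register numbers are placeholders, not taken from the original source:

     ldrh  w3, [x0, w1, uxtw #1]   ; load the scaled table entry
     adr   x4, .Lrtx<N>            ; anchor label emitted below
     add   x3, x4, w3, sxth #2     ; entry holds (target - .Lrtx<N>) / 4
     br    x3
   .Lrtx<N>:

   The division by 4 in the table entries is an assumption based on the
   "sxth #2" scaling.  */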
7662
7663
7664 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7665 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7666 operator. */
7667
7668 int
7669 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7670 {
7671 if (shift >= 0 && shift <= 3)
7672 {
7673 int size;
7674 for (size = 8; size <= 32; size *= 2)
7675 {
7676 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7677 if (mask == bits << shift)
7678 return size;
7679 }
7680 }
7681 return 0;
7682 }
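
/* Examples (derived from the loop above, not part of the original source):
   aarch64_uxt_size (1, 0x1fe) == 8, since 0x1fe == 0xff << 1, matching a
   UXTB-style operand; aarch64_uxt_size (2, 0x3fffc) == 16 (0xffff << 2,
   UXTH).  A mask that is not a full 8/16/32-bit field shifted by 0..3,
   e.g. aarch64_uxt_size (0, 0x7f), yields 0.  */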
7683
7684 /* Constant pools are per function only when PC relative
7685 literal loads are true or we are in the large memory
7686 model. */
7687
7688 static inline bool
7689 aarch64_can_use_per_function_literal_pools_p (void)
7690 {
7691 return (aarch64_pcrelative_literal_loads
7692 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7693 }
7694
7695 static bool
7696 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7697 {
7698 /* We can't use blocks for constants when we're using a per-function
7699 constant pool. */
7700 return !aarch64_can_use_per_function_literal_pools_p ();
7701 }
7702
7703 /* Select appropriate section for constants depending
7704 on where we place literal pools. */
7705
7706 static section *
7707 aarch64_select_rtx_section (machine_mode mode,
7708 rtx x,
7709 unsigned HOST_WIDE_INT align)
7710 {
7711 if (aarch64_can_use_per_function_literal_pools_p ())
7712 return function_section (current_function_decl);
7713
7714 return default_elf_select_rtx_section (mode, x, align);
7715 }
7716
7717 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7718 void
7719 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7720 HOST_WIDE_INT offset)
7721 {
7722 /* When using per-function literal pools, we must ensure that any code
7723 section is aligned to the minimal instruction length, lest we get
7724 errors from the assembler re "unaligned instructions". */
7725 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7726 ASM_OUTPUT_ALIGN (f, 2);
7727 }
7728
7729 /* Costs. */
7730
7731 /* Helper function for rtx cost calculation. Strip a shift expression
7732 from X. Returns the inner operand if successful, or the original
7733 expression on failure. */
7734 static rtx
7735 aarch64_strip_shift (rtx x)
7736 {
7737 rtx op = x;
7738
7739 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7740 we can convert both to ROR during final output. */
7741 if ((GET_CODE (op) == ASHIFT
7742 || GET_CODE (op) == ASHIFTRT
7743 || GET_CODE (op) == LSHIFTRT
7744 || GET_CODE (op) == ROTATERT
7745 || GET_CODE (op) == ROTATE)
7746 && CONST_INT_P (XEXP (op, 1)))
7747 return XEXP (op, 0);
7748
7749 if (GET_CODE (op) == MULT
7750 && CONST_INT_P (XEXP (op, 1))
7751 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7752 return XEXP (op, 0);
7753
7754 return x;
7755 }
7756
7757 /* Helper function for rtx cost calculation. Strip an extend
7758 expression from X. Returns the inner operand if successful, or the
7759 original expression on failure. We deal with a number of possible
7760 canonicalization variations here. If STRIP_SHIFT is true, then
7761 we can strip off a shift also. */
7762 static rtx
7763 aarch64_strip_extend (rtx x, bool strip_shift)
7764 {
7765 scalar_int_mode mode;
7766 rtx op = x;
7767
7768 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7769 return op;
7770
7771 /* Zero and sign extraction of a widened value. */
7772 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7773 && XEXP (op, 2) == const0_rtx
7774 && GET_CODE (XEXP (op, 0)) == MULT
7775 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7776 XEXP (op, 1)))
7777 return XEXP (XEXP (op, 0), 0);
7778
7779 /* It can also be represented (for zero-extend) as an AND with an
7780 immediate. */
7781 if (GET_CODE (op) == AND
7782 && GET_CODE (XEXP (op, 0)) == MULT
7783 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7784 && CONST_INT_P (XEXP (op, 1))
7785 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7786 INTVAL (XEXP (op, 1))) != 0)
7787 return XEXP (XEXP (op, 0), 0);
7788
7789 /* Now handle extended register, as this may also have an optional
7790 left shift by 1..4. */
7791 if (strip_shift
7792 && GET_CODE (op) == ASHIFT
7793 && CONST_INT_P (XEXP (op, 1))
7794 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7795 op = XEXP (op, 0);
7796
7797 if (GET_CODE (op) == ZERO_EXTEND
7798 || GET_CODE (op) == SIGN_EXTEND)
7799 op = XEXP (op, 0);
7800
7801 if (op != x)
7802 return op;
7803
7804 return x;
7805 }
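
/* Example (illustrative, not in the original source): with STRIP_SHIFT true,
   (ashift:DI (zero_extend:DI (reg:SI w1)) (const_int 2)) is stripped down to
   (reg:SI w1), mirroring operands that the hardware folds into an
   extended-register form such as "add x0, x2, w1, uxtw #2".  With
   STRIP_SHIFT false the same rtx is returned unchanged, since the shift is
   then costed separately.  */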
7806
7807 /* Return true iff CODE is a shift supported in combination
7808 with arithmetic instructions. */
7809
7810 static bool
7811 aarch64_shift_p (enum rtx_code code)
7812 {
7813 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7814 }
7815
7816
7817 /* Return true iff X is a cheap shift without a sign extend. */
7818
7819 static bool
7820 aarch64_cheap_mult_shift_p (rtx x)
7821 {
7822 rtx op0, op1;
7823
7824 op0 = XEXP (x, 0);
7825 op1 = XEXP (x, 1);
7826
7827 if (!(aarch64_tune_params.extra_tuning_flags
7828 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7829 return false;
7830
7831 if (GET_CODE (op0) == SIGN_EXTEND)
7832 return false;
7833
7834 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7835 && UINTVAL (op1) <= 4)
7836 return true;
7837
7838 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7839 return false;
7840
7841 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7842
7843 if (l2 > 0 && l2 <= 4)
7844 return true;
7845
7846 return false;
7847 }
7848
7849 /* Helper function for rtx cost calculation. Calculate the cost of
7850 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7851 Return the calculated cost of the expression, recursing manually in to
7852 operands where needed. */
7853
7854 static int
7855 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7856 {
7857 rtx op0, op1;
7858 const struct cpu_cost_table *extra_cost
7859 = aarch64_tune_params.insn_extra_cost;
7860 int cost = 0;
7861 bool compound_p = (outer == PLUS || outer == MINUS);
7862 machine_mode mode = GET_MODE (x);
7863
7864 gcc_checking_assert (code == MULT);
7865
7866 op0 = XEXP (x, 0);
7867 op1 = XEXP (x, 1);
7868
7869 if (VECTOR_MODE_P (mode))
7870 mode = GET_MODE_INNER (mode);
7871
7872 /* Integer multiply/fma. */
7873 if (GET_MODE_CLASS (mode) == MODE_INT)
7874 {
7875 /* The multiply will be canonicalized as a shift, cost it as such. */
7876 if (aarch64_shift_p (GET_CODE (x))
7877 || (CONST_INT_P (op1)
7878 && exact_log2 (INTVAL (op1)) > 0))
7879 {
7880 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7881 || GET_CODE (op0) == SIGN_EXTEND;
7882 if (speed)
7883 {
7884 if (compound_p)
7885 {
7886 /* If the shift is considered cheap,
7887 then don't add any cost. */
7888 if (aarch64_cheap_mult_shift_p (x))
7889 ;
7890 else if (REG_P (op1))
7891 /* ARITH + shift-by-register. */
7892 cost += extra_cost->alu.arith_shift_reg;
7893 else if (is_extend)
7894 /* ARITH + extended register. We don't have a cost field
7895 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7896 cost += extra_cost->alu.extend_arith;
7897 else
7898 /* ARITH + shift-by-immediate. */
7899 cost += extra_cost->alu.arith_shift;
7900 }
7901 else
7902 /* LSL (immediate). */
7903 cost += extra_cost->alu.shift;
7904
7905 }
7906 /* Strip extends as we will have costed them in the case above. */
7907 if (is_extend)
7908 op0 = aarch64_strip_extend (op0, true);
7909
7910 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7911
7912 return cost;
7913 }
7914
7915 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7916 compound and let the below cases handle it. After all, MNEG is a
7917 special-case alias of MSUB. */
7918 if (GET_CODE (op0) == NEG)
7919 {
7920 op0 = XEXP (op0, 0);
7921 compound_p = true;
7922 }
7923
7924 /* Integer multiplies or FMAs have zero/sign extending variants. */
7925 if ((GET_CODE (op0) == ZERO_EXTEND
7926 && GET_CODE (op1) == ZERO_EXTEND)
7927 || (GET_CODE (op0) == SIGN_EXTEND
7928 && GET_CODE (op1) == SIGN_EXTEND))
7929 {
7930 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7931 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7932
7933 if (speed)
7934 {
7935 if (compound_p)
7936 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7937 cost += extra_cost->mult[0].extend_add;
7938 else
7939 /* MUL/SMULL/UMULL. */
7940 cost += extra_cost->mult[0].extend;
7941 }
7942
7943 return cost;
7944 }
7945
7946 /* This is either an integer multiply or a MADD. In both cases
7947 we want to recurse and cost the operands. */
7948 cost += rtx_cost (op0, mode, MULT, 0, speed);
7949 cost += rtx_cost (op1, mode, MULT, 1, speed);
7950
7951 if (speed)
7952 {
7953 if (compound_p)
7954 /* MADD/MSUB. */
7955 cost += extra_cost->mult[mode == DImode].add;
7956 else
7957 /* MUL. */
7958 cost += extra_cost->mult[mode == DImode].simple;
7959 }
7960
7961 return cost;
7962 }
7963 else
7964 {
7965 if (speed)
7966 {
7967 /* Floating-point FMA/FMUL can also support negations of the
7968 operands, unless the rounding mode is upward or downward, in
7969 which case FNMUL is different from FMUL with operand negation. */
7970 bool neg0 = GET_CODE (op0) == NEG;
7971 bool neg1 = GET_CODE (op1) == NEG;
7972 if (compound_p || !flag_rounding_math || (neg0 && neg1))
7973 {
7974 if (neg0)
7975 op0 = XEXP (op0, 0);
7976 if (neg1)
7977 op1 = XEXP (op1, 0);
7978 }
7979
7980 if (compound_p)
7981 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7982 cost += extra_cost->fp[mode == DFmode].fma;
7983 else
7984 /* FMUL/FNMUL. */
7985 cost += extra_cost->fp[mode == DFmode].mult;
7986 }
7987
7988 cost += rtx_cost (op0, mode, MULT, 0, speed);
7989 cost += rtx_cost (op1, mode, MULT, 1, speed);
7990 return cost;
7991 }
7992 }
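
/* Cost sketch (illustrative, not in the original source): for
   (plus (mult:DI (reg x) (reg y)) (reg z)), the PLUS case below calls this
   function with OUTER == PLUS, so compound_p is true and, when optimizing
   for speed, the MADD path adds extra_cost->mult[1].add on top of the
   recursive operand costs.  A multiplication by a power of two such as
   (mult:DI (reg x) (const_int 8)) is instead costed as the canonical shift,
   i.e. extra_cost->alu.shift when it stands alone.  */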
7993
7994 static int
7995 aarch64_address_cost (rtx x,
7996 machine_mode mode,
7997 addr_space_t as ATTRIBUTE_UNUSED,
7998 bool speed)
7999 {
8000 enum rtx_code c = GET_CODE (x);
8001 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8002 struct aarch64_address_info info;
8003 int cost = 0;
8004 info.shift = 0;
8005
8006 if (!aarch64_classify_address (&info, x, mode, false))
8007 {
8008 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8009 {
8010 /* This is a CONST or SYMBOL ref which will be split
8011 in a different way depending on the code model in use.
8012 Cost it through the generic infrastructure. */
8013 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8014 /* Divide through by the cost of one instruction to
8015 bring it to the same units as the address costs. */
8016 cost_symbol_ref /= COSTS_N_INSNS (1);
8017 /* The cost is then the cost of preparing the address,
8018 followed by an immediate (possibly 0) offset. */
8019 return cost_symbol_ref + addr_cost->imm_offset;
8020 }
8021 else
8022 {
8023 /* This is most likely a jump table from a case
8024 statement. */
8025 return addr_cost->register_offset;
8026 }
8027 }
8028
8029 switch (info.type)
8030 {
8031 case ADDRESS_LO_SUM:
8032 case ADDRESS_SYMBOLIC:
8033 case ADDRESS_REG_IMM:
8034 cost += addr_cost->imm_offset;
8035 break;
8036
8037 case ADDRESS_REG_WB:
8038 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8039 cost += addr_cost->pre_modify;
8040 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8041 cost += addr_cost->post_modify;
8042 else
8043 gcc_unreachable ();
8044
8045 break;
8046
8047 case ADDRESS_REG_REG:
8048 cost += addr_cost->register_offset;
8049 break;
8050
8051 case ADDRESS_REG_SXTW:
8052 cost += addr_cost->register_sextend;
8053 break;
8054
8055 case ADDRESS_REG_UXTW:
8056 cost += addr_cost->register_zextend;
8057 break;
8058
8059 default:
8060 gcc_unreachable ();
8061 }
8062
8063
8064 if (info.shift > 0)
8065 {
8066 /* For the sake of calculating the cost of the shifted register
8067 component, we can treat same sized modes in the same way. */
8068 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8069 cost += addr_cost->addr_scale_costs.hi;
8070 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8071 cost += addr_cost->addr_scale_costs.si;
8072 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8073 cost += addr_cost->addr_scale_costs.di;
8074 else
8075 /* We can't tell, or this is a 128-bit vector. */
8076 cost += addr_cost->addr_scale_costs.ti;
8077 }
8078
8079 return cost;
8080 }
8081
8082 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8083 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8084 to be taken. */
8085
8086 int
8087 aarch64_branch_cost (bool speed_p, bool predictable_p)
8088 {
8089 /* When optimizing for speed, use the cost of unpredictable branches. */
8090 const struct cpu_branch_cost *branch_costs =
8091 aarch64_tune_params.branch_costs;
8092
8093 if (!speed_p || predictable_p)
8094 return branch_costs->predictable;
8095 else
8096 return branch_costs->unpredictable;
8097 }
8098
8099 /* Return true if the RTX X in mode MODE is a zero or sign extract
8100 usable in an ADD or SUB (extended register) instruction. */
8101 static bool
8102 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8103 {
8104 /* Catch add with a sign extract.
8105 This is add_<optab><mode>_multp2. */
8106 if (GET_CODE (x) == SIGN_EXTRACT
8107 || GET_CODE (x) == ZERO_EXTRACT)
8108 {
8109 rtx op0 = XEXP (x, 0);
8110 rtx op1 = XEXP (x, 1);
8111 rtx op2 = XEXP (x, 2);
8112
8113 if (GET_CODE (op0) == MULT
8114 && CONST_INT_P (op1)
8115 && op2 == const0_rtx
8116 && CONST_INT_P (XEXP (op0, 1))
8117 && aarch64_is_extend_from_extract (mode,
8118 XEXP (op0, 1),
8119 op1))
8120 {
8121 return true;
8122 }
8123 }
8124 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8125 No shift. */
8126 else if (GET_CODE (x) == SIGN_EXTEND
8127 || GET_CODE (x) == ZERO_EXTEND)
8128 return REG_P (XEXP (x, 0));
8129
8130 return false;
8131 }
8132
8133 static bool
8134 aarch64_frint_unspec_p (unsigned int u)
8135 {
8136 switch (u)
8137 {
8138 case UNSPEC_FRINTZ:
8139 case UNSPEC_FRINTP:
8140 case UNSPEC_FRINTM:
8141 case UNSPEC_FRINTA:
8142 case UNSPEC_FRINTN:
8143 case UNSPEC_FRINTX:
8144 case UNSPEC_FRINTI:
8145 return true;
8146
8147 default:
8148 return false;
8149 }
8150 }
8151
8152 /* Return true iff X is an rtx that will match an extr instruction
8153 i.e. as described in the *extr<mode>5_insn family of patterns.
8154 OP0 and OP1 will be set to the operands of the shifts involved
8155 on success and will be NULL_RTX otherwise. */
8156
8157 static bool
8158 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8159 {
8160 rtx op0, op1;
8161 scalar_int_mode mode;
8162 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8163 return false;
8164
8165 *res_op0 = NULL_RTX;
8166 *res_op1 = NULL_RTX;
8167
8168 if (GET_CODE (x) != IOR)
8169 return false;
8170
8171 op0 = XEXP (x, 0);
8172 op1 = XEXP (x, 1);
8173
8174 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8175 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8176 {
8177 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8178 if (GET_CODE (op1) == ASHIFT)
8179 std::swap (op0, op1);
8180
8181 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8182 return false;
8183
8184 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8185 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8186
8187 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8188 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8189 {
8190 *res_op0 = XEXP (op0, 0);
8191 *res_op1 = XEXP (op1, 0);
8192 return true;
8193 }
8194 }
8195
8196 return false;
8197 }
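
/* Example (a sketch, not from the original source): for DImode
   x == (ior (ashift a 48) (lshiftrt b 16)), the shift amounts sum to 64,
   so *RES_OP0 = a and *RES_OP1 = b, and the pattern corresponds to
   something like "extr x0, xa, xb, #16", which computes
   (a << 48) | (b >> 16).  */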
8198
8199 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8200 storing it in *COST. Result is true if the total cost of the operation
8201 has now been calculated. */
8202 static bool
8203 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8204 {
8205 rtx inner;
8206 rtx comparator;
8207 enum rtx_code cmpcode;
8208
8209 if (COMPARISON_P (op0))
8210 {
8211 inner = XEXP (op0, 0);
8212 comparator = XEXP (op0, 1);
8213 cmpcode = GET_CODE (op0);
8214 }
8215 else
8216 {
8217 inner = op0;
8218 comparator = const0_rtx;
8219 cmpcode = NE;
8220 }
8221
8222 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8223 {
8224 /* Conditional branch. */
8225 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8226 return true;
8227 else
8228 {
8229 if (cmpcode == NE || cmpcode == EQ)
8230 {
8231 if (comparator == const0_rtx)
8232 {
8233 /* TBZ/TBNZ/CBZ/CBNZ. */
8234 if (GET_CODE (inner) == ZERO_EXTRACT)
8235 /* TBZ/TBNZ. */
8236 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8237 ZERO_EXTRACT, 0, speed);
8238 else
8239 /* CBZ/CBNZ. */
8240 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8241
8242 return true;
8243 }
8244 }
8245 else if (cmpcode == LT || cmpcode == GE)
8246 {
8247 /* TBZ/TBNZ. */
8248 if (comparator == const0_rtx)
8249 return true;
8250 }
8251 }
8252 }
8253 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8254 {
8255 /* CCMP. */
8256 if (GET_CODE (op1) == COMPARE)
8257 {
8258 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8259 if (XEXP (op1, 1) == const0_rtx)
8260 *cost += 1;
8261 if (speed)
8262 {
8263 machine_mode mode = GET_MODE (XEXP (op1, 0));
8264 const struct cpu_cost_table *extra_cost
8265 = aarch64_tune_params.insn_extra_cost;
8266
8267 if (GET_MODE_CLASS (mode) == MODE_INT)
8268 *cost += extra_cost->alu.arith;
8269 else
8270 *cost += extra_cost->fp[mode == DFmode].compare;
8271 }
8272 return true;
8273 }
8274
8275 /* It's a conditional operation based on the status flags,
8276 so it must be some flavor of CSEL. */
8277
8278 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8279 if (GET_CODE (op1) == NEG
8280 || GET_CODE (op1) == NOT
8281 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8282 op1 = XEXP (op1, 0);
8283 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8284 {
8285 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8286 op1 = XEXP (op1, 0);
8287 op2 = XEXP (op2, 0);
8288 }
8289
8290 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8291 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8292 return true;
8293 }
8294
8295 /* We don't know what this is, cost all operands. */
8296 return false;
8297 }
8298
8299 /* Check whether X is a bitfield operation of the form shift + extend that
8300 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8301 operand to which the bitfield operation is applied. Otherwise return
8302 NULL_RTX. */
8303
8304 static rtx
8305 aarch64_extend_bitfield_pattern_p (rtx x)
8306 {
8307 rtx_code outer_code = GET_CODE (x);
8308 machine_mode outer_mode = GET_MODE (x);
8309
8310 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8311 && outer_mode != SImode && outer_mode != DImode)
8312 return NULL_RTX;
8313
8314 rtx inner = XEXP (x, 0);
8315 rtx_code inner_code = GET_CODE (inner);
8316 machine_mode inner_mode = GET_MODE (inner);
8317 rtx op = NULL_RTX;
8318
8319 switch (inner_code)
8320 {
8321 case ASHIFT:
8322 if (CONST_INT_P (XEXP (inner, 1))
8323 && (inner_mode == QImode || inner_mode == HImode))
8324 op = XEXP (inner, 0);
8325 break;
8326 case LSHIFTRT:
8327 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8328 && (inner_mode == QImode || inner_mode == HImode))
8329 op = XEXP (inner, 0);
8330 break;
8331 case ASHIFTRT:
8332 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8333 && (inner_mode == QImode || inner_mode == HImode))
8334 op = XEXP (inner, 0);
8335 break;
8336 default:
8337 break;
8338 }
8339
8340 return op;
8341 }
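
/* Examples (assumptions about the matching RTL shapes, not from the
   original source):

     (zero_extend:SI (lshiftrt:HI (reg r) (const_int 5)))  roughly -> UBFX, op = r
     (sign_extend:SI (ashiftrt:HI (reg r) (const_int 5)))  roughly -> SBFX, op = r
     (zero_extend:SI (ashift:QI (reg r) (const_int 3)))    roughly -> UBFIZ, op = r

   A shift of an SImode value, or one whose shift amount is not a CONST_INT,
   falls through and returns NULL_RTX.  */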
8342
8343 /* Return true if the mask and a shift amount from an RTX of the form
8344 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8345 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8346
8347 bool
8348 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8349 rtx shft_amnt)
8350 {
8351 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8352 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8353 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8354 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8355 }
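
/* Worked example (illustrative, not in the original source): for DImode
   with MASK == 0xff0 and SHFT_AMNT == 4, the shift is below 64,
   (0xff0 >> 4) + 1 == 0x100 is a power of two, and no mask bit lies below
   the shift amount, so the function returns true; (x << 4) & 0xff0 can then
   be emitted as a single "ubfiz xd, xn, #4, #8".  A mask such as 0xff4
   fails the last test because bit 2 lies below the shift amount.  */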
8356
8357 /* Calculate the cost of calculating X, storing it in *COST. Result
8358 is true if the total cost of the operation has now been calculated. */
8359 static bool
8360 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8361 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8362 {
8363 rtx op0, op1, op2;
8364 const struct cpu_cost_table *extra_cost
8365 = aarch64_tune_params.insn_extra_cost;
8366 int code = GET_CODE (x);
8367 scalar_int_mode int_mode;
8368
8369 /* By default, assume that everything has equivalent cost to the
8370 cheapest instruction. Any additional costs are applied as a delta
8371 above this default. */
8372 *cost = COSTS_N_INSNS (1);
8373
8374 switch (code)
8375 {
8376 case SET:
8377 /* The cost depends entirely on the operands to SET. */
8378 *cost = 0;
8379 op0 = SET_DEST (x);
8380 op1 = SET_SRC (x);
8381
8382 switch (GET_CODE (op0))
8383 {
8384 case MEM:
8385 if (speed)
8386 {
8387 rtx address = XEXP (op0, 0);
8388 if (VECTOR_MODE_P (mode))
8389 *cost += extra_cost->ldst.storev;
8390 else if (GET_MODE_CLASS (mode) == MODE_INT)
8391 *cost += extra_cost->ldst.store;
8392 else if (mode == SFmode)
8393 *cost += extra_cost->ldst.storef;
8394 else if (mode == DFmode)
8395 *cost += extra_cost->ldst.stored;
8396
8397 *cost +=
8398 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8399 0, speed));
8400 }
8401
8402 *cost += rtx_cost (op1, mode, SET, 1, speed);
8403 return true;
8404
8405 case SUBREG:
8406 if (! REG_P (SUBREG_REG (op0)))
8407 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8408
8409 /* Fall through. */
8410 case REG:
8411 /* The cost is one per vector-register copied. */
8412 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8413 {
8414 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8415 *cost = COSTS_N_INSNS (nregs);
8416 }
8417 /* const0_rtx is in general free, but we will use an
8418 instruction to set a register to 0. */
8419 else if (REG_P (op1) || op1 == const0_rtx)
8420 {
8421 /* The cost is 1 per register copied. */
8422 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8423 *cost = COSTS_N_INSNS (nregs);
8424 }
8425 else
8426 /* Cost is just the cost of the RHS of the set. */
8427 *cost += rtx_cost (op1, mode, SET, 1, speed);
8428 return true;
8429
8430 case ZERO_EXTRACT:
8431 case SIGN_EXTRACT:
8432 /* Bit-field insertion. Strip any redundant widening of
8433 the RHS to meet the width of the target. */
8434 if (GET_CODE (op1) == SUBREG)
8435 op1 = SUBREG_REG (op1);
8436 if ((GET_CODE (op1) == ZERO_EXTEND
8437 || GET_CODE (op1) == SIGN_EXTEND)
8438 && CONST_INT_P (XEXP (op0, 1))
8439 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8440 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8441 op1 = XEXP (op1, 0);
8442
8443 if (CONST_INT_P (op1))
8444 {
8445 /* MOV immediate is assumed to always be cheap. */
8446 *cost = COSTS_N_INSNS (1);
8447 }
8448 else
8449 {
8450 /* BFM. */
8451 if (speed)
8452 *cost += extra_cost->alu.bfi;
8453 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8454 }
8455
8456 return true;
8457
8458 default:
8459 /* We can't make sense of this, assume default cost. */
8460 *cost = COSTS_N_INSNS (1);
8461 return false;
8462 }
8463 return false;
8464
8465 case CONST_INT:
8466 /* If an instruction can incorporate a constant within the
8467 instruction, the instruction's expression avoids calling
8468 rtx_cost() on the constant. If rtx_cost() is called on a
8469 constant, then it is usually because the constant must be
8470 moved into a register by one or more instructions.
8471
8472 The exception is constant 0, which can be expressed
8473 as XZR/WZR and is therefore free. The exception to this is
8474 if we have (set (reg) (const0_rtx)) in which case we must cost
8475 the move. However, we can catch that when we cost the SET, so
8476 we don't need to consider that here. */
8477 if (x == const0_rtx)
8478 *cost = 0;
8479 else
8480 {
8481 /* To an approximation, building any other constant is
8482 proportionally expensive to the number of instructions
8483 required to build that constant. This is true whether we
8484 are compiling for SPEED or otherwise. */
8485 if (!is_a <scalar_int_mode> (mode, &int_mode))
8486 int_mode = word_mode;
8487 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8488 (NULL_RTX, x, false, int_mode));
8489 }
8490 return true;
8491
8492 case CONST_DOUBLE:
8493
8494 /* First determine number of instructions to do the move
8495 as an integer constant. */
8496 if (!aarch64_float_const_representable_p (x)
8497 && !aarch64_can_const_movi_rtx_p (x, mode)
8498 && aarch64_float_const_rtx_p (x))
8499 {
8500 unsigned HOST_WIDE_INT ival;
8501 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8502 gcc_assert (succeed);
8503
8504 scalar_int_mode imode = (mode == HFmode
8505 ? SImode
8506 : int_mode_for_mode (mode).require ());
8507 int ncost = aarch64_internal_mov_immediate
8508 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8509 *cost += COSTS_N_INSNS (ncost);
8510 return true;
8511 }
8512
8513 if (speed)
8514 {
8515 /* mov[df,sf]_aarch64. */
8516 if (aarch64_float_const_representable_p (x))
8517 /* FMOV (scalar immediate). */
8518 *cost += extra_cost->fp[mode == DFmode].fpconst;
8519 else if (!aarch64_float_const_zero_rtx_p (x))
8520 {
8521 /* This will be a load from memory. */
8522 if (mode == DFmode)
8523 *cost += extra_cost->ldst.loadd;
8524 else
8525 *cost += extra_cost->ldst.loadf;
8526 }
8527 else
8528 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8529 or MOV v0.s[0], wzr - neither of which is modeled by the
8530 cost tables. Just use the default cost. */
8531 {
8532 }
8533 }
8534
8535 return true;
8536
8537 case MEM:
8538 if (speed)
8539 {
8540 /* For loads we want the base cost of a load, plus an
8541 approximation for the additional cost of the addressing
8542 mode. */
8543 rtx address = XEXP (x, 0);
8544 if (VECTOR_MODE_P (mode))
8545 *cost += extra_cost->ldst.loadv;
8546 else if (GET_MODE_CLASS (mode) == MODE_INT)
8547 *cost += extra_cost->ldst.load;
8548 else if (mode == SFmode)
8549 *cost += extra_cost->ldst.loadf;
8550 else if (mode == DFmode)
8551 *cost += extra_cost->ldst.loadd;
8552
8553 *cost +=
8554 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8555 0, speed));
8556 }
8557
8558 return true;
8559
8560 case NEG:
8561 op0 = XEXP (x, 0);
8562
8563 if (VECTOR_MODE_P (mode))
8564 {
8565 if (speed)
8566 {
8567 /* FNEG. */
8568 *cost += extra_cost->vect.alu;
8569 }
8570 return false;
8571 }
8572
8573 if (GET_MODE_CLASS (mode) == MODE_INT)
8574 {
8575 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8576 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8577 {
8578 /* CSETM. */
8579 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8580 return true;
8581 }
8582
8583 /* Cost this as SUB wzr, X. */
8584 op0 = CONST0_RTX (mode);
8585 op1 = XEXP (x, 0);
8586 goto cost_minus;
8587 }
8588
8589 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8590 {
8591 /* Support (neg(fma...)) as a single instruction only if
8592 sign of zeros is unimportant. This matches the decision
8593 making in aarch64.md. */
8594 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8595 {
8596 /* FNMADD. */
8597 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8598 return true;
8599 }
8600 if (GET_CODE (op0) == MULT)
8601 {
8602 /* FNMUL. */
8603 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8604 return true;
8605 }
8606 if (speed)
8607 /* FNEG. */
8608 *cost += extra_cost->fp[mode == DFmode].neg;
8609 return false;
8610 }
8611
8612 return false;
8613
8614 case CLRSB:
8615 case CLZ:
8616 if (speed)
8617 {
8618 if (VECTOR_MODE_P (mode))
8619 *cost += extra_cost->vect.alu;
8620 else
8621 *cost += extra_cost->alu.clz;
8622 }
8623
8624 return false;
8625
8626 case COMPARE:
8627 op0 = XEXP (x, 0);
8628 op1 = XEXP (x, 1);
8629
8630 if (op1 == const0_rtx
8631 && GET_CODE (op0) == AND)
8632 {
8633 x = op0;
8634 mode = GET_MODE (op0);
8635 goto cost_logic;
8636 }
8637
8638 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8639 {
8640 /* TODO: A write to the CC flags possibly costs extra; this
8641 needs encoding in the cost tables. */
8642
8643 mode = GET_MODE (op0);
8644 /* ANDS. */
8645 if (GET_CODE (op0) == AND)
8646 {
8647 x = op0;
8648 goto cost_logic;
8649 }
8650
8651 if (GET_CODE (op0) == PLUS)
8652 {
8653 /* ADDS (and CMN alias). */
8654 x = op0;
8655 goto cost_plus;
8656 }
8657
8658 if (GET_CODE (op0) == MINUS)
8659 {
8660 /* SUBS. */
8661 x = op0;
8662 goto cost_minus;
8663 }
8664
8665 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8666 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8667 && CONST_INT_P (XEXP (op0, 2)))
8668 {
8669 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8670 Handle it here directly rather than going to cost_logic
8671 since we know the immediate generated for the TST is valid
8672 so we can avoid creating an intermediate rtx for it only
8673 for costing purposes. */
8674 if (speed)
8675 *cost += extra_cost->alu.logical;
8676
8677 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8678 ZERO_EXTRACT, 0, speed);
8679 return true;
8680 }
8681
8682 if (GET_CODE (op1) == NEG)
8683 {
8684 /* CMN. */
8685 if (speed)
8686 *cost += extra_cost->alu.arith;
8687
8688 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8689 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8690 return true;
8691 }
8692
8693 /* CMP.
8694
8695 Compare can freely swap the order of operands, and
8696 canonicalization puts the more complex operation first.
8697 But the integer MINUS logic expects the shift/extend
8698 operation in op1. */
8699 if (! (REG_P (op0)
8700 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8701 {
8702 op0 = XEXP (x, 1);
8703 op1 = XEXP (x, 0);
8704 }
8705 goto cost_minus;
8706 }
8707
8708 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8709 {
8710 /* FCMP. */
8711 if (speed)
8712 *cost += extra_cost->fp[mode == DFmode].compare;
8713
8714 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8715 {
8716 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8717 /* FCMP supports constant 0.0 for no extra cost. */
8718 return true;
8719 }
8720 return false;
8721 }
8722
8723 if (VECTOR_MODE_P (mode))
8724 {
8725 /* Vector compare. */
8726 if (speed)
8727 *cost += extra_cost->vect.alu;
8728
8729 if (aarch64_float_const_zero_rtx_p (op1))
8730 {
8731 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8732 cost. */
8733 return true;
8734 }
8735 return false;
8736 }
8737 return false;
8738
8739 case MINUS:
8740 {
8741 op0 = XEXP (x, 0);
8742 op1 = XEXP (x, 1);
8743
8744 cost_minus:
8745 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8746
8747 /* Detect valid immediates. */
8748 if ((GET_MODE_CLASS (mode) == MODE_INT
8749 || (GET_MODE_CLASS (mode) == MODE_CC
8750 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8751 && CONST_INT_P (op1)
8752 && aarch64_uimm12_shift (INTVAL (op1)))
8753 {
8754 if (speed)
8755 /* SUB(S) (immediate). */
8756 *cost += extra_cost->alu.arith;
8757 return true;
8758 }
8759
8760 /* Look for SUB (extended register). */
8761 if (is_a <scalar_int_mode> (mode, &int_mode)
8762 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8763 {
8764 if (speed)
8765 *cost += extra_cost->alu.extend_arith;
8766
8767 op1 = aarch64_strip_extend (op1, true);
8768 *cost += rtx_cost (op1, VOIDmode,
8769 (enum rtx_code) GET_CODE (op1), 0, speed);
8770 return true;
8771 }
8772
8773 rtx new_op1 = aarch64_strip_extend (op1, false);
8774
8775 /* Cost this as an FMA-alike operation. */
8776 if ((GET_CODE (new_op1) == MULT
8777 || aarch64_shift_p (GET_CODE (new_op1)))
8778 && code != COMPARE)
8779 {
8780 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8781 (enum rtx_code) code,
8782 speed);
8783 return true;
8784 }
8785
8786 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8787
8788 if (speed)
8789 {
8790 if (VECTOR_MODE_P (mode))
8791 {
8792 /* Vector SUB. */
8793 *cost += extra_cost->vect.alu;
8794 }
8795 else if (GET_MODE_CLASS (mode) == MODE_INT)
8796 {
8797 /* SUB(S). */
8798 *cost += extra_cost->alu.arith;
8799 }
8800 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8801 {
8802 /* FSUB. */
8803 *cost += extra_cost->fp[mode == DFmode].addsub;
8804 }
8805 }
8806 return true;
8807 }
8808
8809 case PLUS:
8810 {
8811 rtx new_op0;
8812
8813 op0 = XEXP (x, 0);
8814 op1 = XEXP (x, 1);
8815
8816 cost_plus:
8817 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8818 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8819 {
8820 /* CSINC. */
8821 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8822 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8823 return true;
8824 }
8825
8826 if (GET_MODE_CLASS (mode) == MODE_INT
8827 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8828 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8829 {
8830 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8831
8832 if (speed)
8833 /* ADD (immediate). */
8834 *cost += extra_cost->alu.arith;
8835 return true;
8836 }
8837
8838 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8839
8840 /* Look for ADD (extended register). */
8841 if (is_a <scalar_int_mode> (mode, &int_mode)
8842 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8843 {
8844 if (speed)
8845 *cost += extra_cost->alu.extend_arith;
8846
8847 op0 = aarch64_strip_extend (op0, true);
8848 *cost += rtx_cost (op0, VOIDmode,
8849 (enum rtx_code) GET_CODE (op0), 0, speed);
8850 return true;
8851 }
8852
8853 /* Strip any extend, leave shifts behind as we will
8854 cost them through mult_cost. */
8855 new_op0 = aarch64_strip_extend (op0, false);
8856
8857 if (GET_CODE (new_op0) == MULT
8858 || aarch64_shift_p (GET_CODE (new_op0)))
8859 {
8860 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8861 speed);
8862 return true;
8863 }
8864
8865 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8866
8867 if (speed)
8868 {
8869 if (VECTOR_MODE_P (mode))
8870 {
8871 /* Vector ADD. */
8872 *cost += extra_cost->vect.alu;
8873 }
8874 else if (GET_MODE_CLASS (mode) == MODE_INT)
8875 {
8876 /* ADD. */
8877 *cost += extra_cost->alu.arith;
8878 }
8879 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8880 {
8881 /* FADD. */
8882 *cost += extra_cost->fp[mode == DFmode].addsub;
8883 }
8884 }
8885 return true;
8886 }
8887
8888 case BSWAP:
8889 *cost = COSTS_N_INSNS (1);
8890
8891 if (speed)
8892 {
8893 if (VECTOR_MODE_P (mode))
8894 *cost += extra_cost->vect.alu;
8895 else
8896 *cost += extra_cost->alu.rev;
8897 }
8898 return false;
8899
8900 case IOR:
8901 if (aarch_rev16_p (x))
8902 {
8903 *cost = COSTS_N_INSNS (1);
8904
8905 if (speed)
8906 {
8907 if (VECTOR_MODE_P (mode))
8908 *cost += extra_cost->vect.alu;
8909 else
8910 *cost += extra_cost->alu.rev;
8911 }
8912 return true;
8913 }
8914
8915 if (aarch64_extr_rtx_p (x, &op0, &op1))
8916 {
8917 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8918 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8919 if (speed)
8920 *cost += extra_cost->alu.shift;
8921
8922 return true;
8923 }
8924 /* Fall through. */
8925 case XOR:
8926 case AND:
8927 cost_logic:
8928 op0 = XEXP (x, 0);
8929 op1 = XEXP (x, 1);
8930
8931 if (VECTOR_MODE_P (mode))
8932 {
8933 if (speed)
8934 *cost += extra_cost->vect.alu;
8935 return true;
8936 }
8937
8938 if (code == AND
8939 && GET_CODE (op0) == MULT
8940 && CONST_INT_P (XEXP (op0, 1))
8941 && CONST_INT_P (op1)
8942 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8943 INTVAL (op1)) != 0)
8944 {
8945 /* This is a UBFM/SBFM. */
8946 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8947 if (speed)
8948 *cost += extra_cost->alu.bfx;
8949 return true;
8950 }
8951
8952 if (is_int_mode (mode, &int_mode))
8953 {
8954 if (CONST_INT_P (op1))
8955 {
8956 /* We have a mask + shift version of a UBFIZ
8957 i.e. the *andim_ashift<mode>_bfiz pattern. */
8958 if (GET_CODE (op0) == ASHIFT
8959 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8960 XEXP (op0, 1)))
8961 {
8962 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8963 (enum rtx_code) code, 0, speed);
8964 if (speed)
8965 *cost += extra_cost->alu.bfx;
8966
8967 return true;
8968 }
8969 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8970 {
8971 /* We possibly get the immediate for free; this is not
8972 modelled. */
8973 *cost += rtx_cost (op0, int_mode,
8974 (enum rtx_code) code, 0, speed);
8975 if (speed)
8976 *cost += extra_cost->alu.logical;
8977
8978 return true;
8979 }
8980 }
8981 else
8982 {
8983 rtx new_op0 = op0;
8984
8985 /* Handle ORN, EON, or BIC. */
8986 if (GET_CODE (op0) == NOT)
8987 op0 = XEXP (op0, 0);
8988
8989 new_op0 = aarch64_strip_shift (op0);
8990
8991 /* If we had a shift on op0 then this is a logical-shift-
8992 by-register/immediate operation. Otherwise, this is just
8993 a logical operation. */
8994 if (speed)
8995 {
8996 if (new_op0 != op0)
8997 {
8998 /* Shift by immediate. */
8999 if (CONST_INT_P (XEXP (op0, 1)))
9000 *cost += extra_cost->alu.log_shift;
9001 else
9002 *cost += extra_cost->alu.log_shift_reg;
9003 }
9004 else
9005 *cost += extra_cost->alu.logical;
9006 }
9007
9008 /* In both cases we want to cost both operands. */
9009 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9010 0, speed);
9011 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9012 1, speed);
9013
9014 return true;
9015 }
9016 }
9017 return false;
9018
9019 case NOT:
9020 x = XEXP (x, 0);
9021 op0 = aarch64_strip_shift (x);
9022
9023 if (VECTOR_MODE_P (mode))
9024 {
9025 /* Vector NOT. */
9026 *cost += extra_cost->vect.alu;
9027 return false;
9028 }
9029
9030 /* MVN-shifted-reg. */
9031 if (op0 != x)
9032 {
9033 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9034
9035 if (speed)
9036 *cost += extra_cost->alu.log_shift;
9037
9038 return true;
9039 }
9040 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9041 Handle the second form here taking care that 'a' in the above can
9042 be a shift. */
9043 else if (GET_CODE (op0) == XOR)
9044 {
9045 rtx newop0 = XEXP (op0, 0);
9046 rtx newop1 = XEXP (op0, 1);
9047 rtx op0_stripped = aarch64_strip_shift (newop0);
9048
9049 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9050 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9051
9052 if (speed)
9053 {
9054 if (op0_stripped != newop0)
9055 *cost += extra_cost->alu.log_shift;
9056 else
9057 *cost += extra_cost->alu.logical;
9058 }
9059
9060 return true;
9061 }
9062 /* MVN. */
9063 if (speed)
9064 *cost += extra_cost->alu.logical;
9065
9066 return false;
9067
9068 case ZERO_EXTEND:
9069
9070 op0 = XEXP (x, 0);
9071 /* If a value is written in SI mode, then zero extended to DI
9072 mode, the operation will in general be free as a write to
9073 a 'w' register implicitly zeroes the upper bits of an 'x'
9074 register. However, if this is
9075
9076 (set (reg) (zero_extend (reg)))
9077
9078 we must cost the explicit register move. */
9079 if (mode == DImode
9080 && GET_MODE (op0) == SImode
9081 && outer == SET)
9082 {
9083 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9084
9085 /* If OP_COST is non-zero, then the cost of the zero extend
9086 is effectively the cost of the inner operation. Otherwise
9087 we have a MOV instruction and we take the cost from the MOV
9088 itself. This is true independently of whether we are
9089 optimizing for space or time. */
9090 if (op_cost)
9091 *cost = op_cost;
9092
9093 return true;
9094 }
9095 else if (MEM_P (op0))
9096 {
9097 /* All loads can zero extend to any size for free. */
9098 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9099 return true;
9100 }
9101
9102 op0 = aarch64_extend_bitfield_pattern_p (x);
9103 if (op0)
9104 {
9105 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9106 if (speed)
9107 *cost += extra_cost->alu.bfx;
9108 return true;
9109 }
9110
9111 if (speed)
9112 {
9113 if (VECTOR_MODE_P (mode))
9114 {
9115 /* UMOV. */
9116 *cost += extra_cost->vect.alu;
9117 }
9118 else
9119 {
9120 /* We generate an AND instead of UXTB/UXTH. */
9121 *cost += extra_cost->alu.logical;
9122 }
9123 }
9124 return false;
9125
9126 case SIGN_EXTEND:
9127 if (MEM_P (XEXP (x, 0)))
9128 {
9129 /* LDRSH. */
9130 if (speed)
9131 {
9132 rtx address = XEXP (XEXP (x, 0), 0);
9133 *cost += extra_cost->ldst.load_sign_extend;
9134
9135 *cost +=
9136 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9137 0, speed));
9138 }
9139 return true;
9140 }
9141
9142 op0 = aarch64_extend_bitfield_pattern_p (x);
9143 if (op0)
9144 {
9145 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9146 if (speed)
9147 *cost += extra_cost->alu.bfx;
9148 return true;
9149 }
9150
9151 if (speed)
9152 {
9153 if (VECTOR_MODE_P (mode))
9154 *cost += extra_cost->vect.alu;
9155 else
9156 *cost += extra_cost->alu.extend;
9157 }
9158 return false;
9159
9160 case ASHIFT:
9161 op0 = XEXP (x, 0);
9162 op1 = XEXP (x, 1);
9163
9164 if (CONST_INT_P (op1))
9165 {
9166 if (speed)
9167 {
9168 if (VECTOR_MODE_P (mode))
9169 {
9170 /* Vector shift (immediate). */
9171 *cost += extra_cost->vect.alu;
9172 }
9173 else
9174 {
9175                 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
9176                    aliases of UBFM.  */
9177 *cost += extra_cost->alu.shift;
9178 }
9179 }
9180
9181 /* We can incorporate zero/sign extend for free. */
9182 if (GET_CODE (op0) == ZERO_EXTEND
9183 || GET_CODE (op0) == SIGN_EXTEND)
9184 op0 = XEXP (op0, 0);
9185
9186 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9187 return true;
9188 }
9189 else
9190 {
9191 if (VECTOR_MODE_P (mode))
9192 {
9193 if (speed)
9194 /* Vector shift (register). */
9195 *cost += extra_cost->vect.alu;
9196 }
9197 else
9198 {
9199 if (speed)
9200 /* LSLV. */
9201 *cost += extra_cost->alu.shift_reg;
9202
9203 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9204 && CONST_INT_P (XEXP (op1, 1))
9205 && known_eq (INTVAL (XEXP (op1, 1)),
9206 GET_MODE_BITSIZE (mode) - 1))
9207 {
9208 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9209 /* We already demanded XEXP (op1, 0) to be REG_P, so
9210 don't recurse into it. */
9211 return true;
9212 }
9213 }
9214 return false; /* All arguments need to be in registers. */
9215 }
9216
9217 case ROTATE:
9218 case ROTATERT:
9219 case LSHIFTRT:
9220 case ASHIFTRT:
9221 op0 = XEXP (x, 0);
9222 op1 = XEXP (x, 1);
9223
9224 if (CONST_INT_P (op1))
9225 {
9226 /* ASR (immediate) and friends. */
9227 if (speed)
9228 {
9229 if (VECTOR_MODE_P (mode))
9230 *cost += extra_cost->vect.alu;
9231 else
9232 *cost += extra_cost->alu.shift;
9233 }
9234
9235 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9236 return true;
9237 }
9238 else
9239 {
9240 if (VECTOR_MODE_P (mode))
9241 {
9242 if (speed)
9243 /* Vector shift (register). */
9244 *cost += extra_cost->vect.alu;
9245 }
9246 else
9247 {
9248 if (speed)
9249 /* ASR (register) and friends. */
9250 *cost += extra_cost->alu.shift_reg;
9251
9252 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9253 && CONST_INT_P (XEXP (op1, 1))
9254 && known_eq (INTVAL (XEXP (op1, 1)),
9255 GET_MODE_BITSIZE (mode) - 1))
9256 {
9257 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9258 /* We already demanded XEXP (op1, 0) to be REG_P, so
9259 don't recurse into it. */
9260 return true;
9261 }
9262 }
9263 return false; /* All arguments need to be in registers. */
9264 }
9265
9266 case SYMBOL_REF:
9267
9268 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9269 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9270 {
9271 /* LDR. */
9272 if (speed)
9273 *cost += extra_cost->ldst.load;
9274 }
9275 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9276 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9277 {
9278 /* ADRP, followed by ADD. */
9279 *cost += COSTS_N_INSNS (1);
9280 if (speed)
9281 *cost += 2 * extra_cost->alu.arith;
9282 }
9283 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9284 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9285 {
9286 /* ADR. */
9287 if (speed)
9288 *cost += extra_cost->alu.arith;
9289 }
9290
9291 if (flag_pic)
9292 {
9293 /* One extra load instruction, after accessing the GOT. */
9294 *cost += COSTS_N_INSNS (1);
9295 if (speed)
9296 *cost += extra_cost->ldst.load;
9297 }
9298 return true;
9299
9300 case HIGH:
9301 case LO_SUM:
9302 /* ADRP/ADD (immediate). */
9303 if (speed)
9304 *cost += extra_cost->alu.arith;
9305 return true;
9306
9307 case ZERO_EXTRACT:
9308 case SIGN_EXTRACT:
9309 /* UBFX/SBFX. */
9310 if (speed)
9311 {
9312 if (VECTOR_MODE_P (mode))
9313 *cost += extra_cost->vect.alu;
9314 else
9315 *cost += extra_cost->alu.bfx;
9316 }
9317
9318 /* We can trust that the immediates used will be correct (there
9319 are no by-register forms), so we need only cost op0. */
9320 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9321 return true;
9322
9323 case MULT:
9324 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9325 /* aarch64_rtx_mult_cost always handles recursion to its
9326 operands. */
9327 return true;
9328
9329 case MOD:
9330 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9331 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9332 an unconditional negate. This case should only ever be reached through
9333 the set_smod_pow2_cheap check in expmed.c. */
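      /* Purely as an illustration of the expansion described above (a sketch,
	 not something this cost function emits): for SImode "r = x % 8" the
	 expected sequence is roughly

	    negs  w1, w0              // w1 = -x, setting the flags
	    and   w0, w0, #7          // x & 7
	    and   w1, w1, #7          // (-x) & 7
	    csneg w0, w0, w1, mi      // x & 7 if x > 0, else -((-x) & 7)

	 i.e. four instructions, matching the COSTS_N_INSNS (4) baseline set
	 below.  */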
9334 if (CONST_INT_P (XEXP (x, 1))
9335 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9336 && (mode == SImode || mode == DImode))
9337 {
9338 /* We expand to 4 instructions. Reset the baseline. */
9339 *cost = COSTS_N_INSNS (4);
9340
9341 if (speed)
9342 *cost += 2 * extra_cost->alu.logical
9343 + 2 * extra_cost->alu.arith;
9344
9345 return true;
9346 }
9347
9348 /* Fall-through. */
9349 case UMOD:
9350 if (speed)
9351 {
9352 /* Slightly prefer UMOD over SMOD. */
9353 if (VECTOR_MODE_P (mode))
9354 *cost += extra_cost->vect.alu;
9355 else if (GET_MODE_CLASS (mode) == MODE_INT)
9356 *cost += (extra_cost->mult[mode == DImode].add
9357 + extra_cost->mult[mode == DImode].idiv
9358 + (code == MOD ? 1 : 0));
9359 }
9360 return false; /* All arguments need to be in registers. */
9361
9362 case DIV:
9363 case UDIV:
9364 case SQRT:
9365 if (speed)
9366 {
9367 if (VECTOR_MODE_P (mode))
9368 *cost += extra_cost->vect.alu;
9369 else if (GET_MODE_CLASS (mode) == MODE_INT)
9370 /* There is no integer SQRT, so only DIV and UDIV can get
9371 here. */
9372 *cost += (extra_cost->mult[mode == DImode].idiv
9373 /* Slightly prefer UDIV over SDIV. */
9374 + (code == DIV ? 1 : 0));
9375 else
9376 *cost += extra_cost->fp[mode == DFmode].div;
9377 }
9378 return false; /* All arguments need to be in registers. */
9379
9380 case IF_THEN_ELSE:
9381 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9382 XEXP (x, 2), cost, speed);
9383
9384 case EQ:
9385 case NE:
9386 case GT:
9387 case GTU:
9388 case LT:
9389 case LTU:
9390 case GE:
9391 case GEU:
9392 case LE:
9393 case LEU:
9394
9395 return false; /* All arguments must be in registers. */
9396
9397 case FMA:
9398 op0 = XEXP (x, 0);
9399 op1 = XEXP (x, 1);
9400 op2 = XEXP (x, 2);
9401
9402 if (speed)
9403 {
9404 if (VECTOR_MODE_P (mode))
9405 *cost += extra_cost->vect.alu;
9406 else
9407 *cost += extra_cost->fp[mode == DFmode].fma;
9408 }
9409
9410 /* FMSUB, FNMADD, and FNMSUB are free. */
9411 if (GET_CODE (op0) == NEG)
9412 op0 = XEXP (op0, 0);
9413
9414 if (GET_CODE (op2) == NEG)
9415 op2 = XEXP (op2, 0);
9416
9417 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9418 and the by-element operand as operand 0. */
9419 if (GET_CODE (op1) == NEG)
9420 op1 = XEXP (op1, 0);
9421
9422 /* Catch vector-by-element operations. The by-element operand can
9423 either be (vec_duplicate (vec_select (x))) or just
9424 (vec_select (x)), depending on whether we are multiplying by
9425 a vector or a scalar.
9426
9427 Canonicalization is not very good in these cases: FMA4 will put the
9428 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9429 if (GET_CODE (op0) == VEC_DUPLICATE)
9430 op0 = XEXP (op0, 0);
9431 else if (GET_CODE (op1) == VEC_DUPLICATE)
9432 op1 = XEXP (op1, 0);
9433
9434 if (GET_CODE (op0) == VEC_SELECT)
9435 op0 = XEXP (op0, 0);
9436 else if (GET_CODE (op1) == VEC_SELECT)
9437 op1 = XEXP (op1, 0);
9438
9439 /* If the remaining parameters are not registers,
9440 get the cost to put them into registers. */
9441 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9442 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9443 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9444 return true;
9445
9446 case FLOAT:
9447 case UNSIGNED_FLOAT:
9448 if (speed)
9449 *cost += extra_cost->fp[mode == DFmode].fromint;
9450 return false;
9451
9452 case FLOAT_EXTEND:
9453 if (speed)
9454 {
9455 if (VECTOR_MODE_P (mode))
9456 {
9457 /* Vector widening conversion. */
9458 *cost += extra_cost->vect.alu;
9459 }
9460 else
9461 *cost += extra_cost->fp[mode == DFmode].widen;
9462 }
9463 return false;
9464
9465 case FLOAT_TRUNCATE:
9466 if (speed)
9467 {
9468 if (VECTOR_MODE_P (mode))
9469 {
9470 /* Vector narrowing conversion. */
9471 *cost += extra_cost->vect.alu;
9472 }
9473 else
9474 *cost += extra_cost->fp[mode == DFmode].narrow;
9475 }
9476 return false;
9477
9478 case FIX:
9479 case UNSIGNED_FIX:
9480 x = XEXP (x, 0);
9481 /* Strip the rounding part. They will all be implemented
9482 by the fcvt* family of instructions anyway. */
9483 if (GET_CODE (x) == UNSPEC)
9484 {
9485 unsigned int uns_code = XINT (x, 1);
9486
9487 if (uns_code == UNSPEC_FRINTA
9488 || uns_code == UNSPEC_FRINTM
9489 || uns_code == UNSPEC_FRINTN
9490 || uns_code == UNSPEC_FRINTP
9491 || uns_code == UNSPEC_FRINTZ)
9492 x = XVECEXP (x, 0, 0);
9493 }
9494
9495 if (speed)
9496 {
9497 if (VECTOR_MODE_P (mode))
9498 *cost += extra_cost->vect.alu;
9499 else
9500 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9501 }
9502
9503 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9504 fixed-point fcvt. */
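	/* For illustration only (whether the combination is actually performed
	   is decided by the matching md patterns, not here): a conversion such
	   as "(int) (f * 16.0f)" can become a single "fcvtzs w0, s0, #4",
	   since scaling by 2^4 before the convert is precisely the fixed-point
	   form of FCVTZS.  */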
9505 if (GET_CODE (x) == MULT
9506 && ((VECTOR_MODE_P (mode)
9507 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9508 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9509 {
9510 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9511 0, speed);
9512 return true;
9513 }
9514
9515 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9516 return true;
9517
9518 case ABS:
9519 if (VECTOR_MODE_P (mode))
9520 {
9521 /* ABS (vector). */
9522 if (speed)
9523 *cost += extra_cost->vect.alu;
9524 }
9525 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9526 {
9527 op0 = XEXP (x, 0);
9528
9529 /* FABD, which is analogous to FADD. */
9530 if (GET_CODE (op0) == MINUS)
9531 {
9532 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9533 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9534 if (speed)
9535 *cost += extra_cost->fp[mode == DFmode].addsub;
9536
9537 return true;
9538 }
9539 /* Simple FABS is analogous to FNEG. */
9540 if (speed)
9541 *cost += extra_cost->fp[mode == DFmode].neg;
9542 }
9543 else
9544 {
9545 /* Integer ABS will either be split to
9546 two arithmetic instructions, or will be an ABS
9547 (scalar), which we don't model. */
9548 *cost = COSTS_N_INSNS (2);
9549 if (speed)
9550 *cost += 2 * extra_cost->alu.arith;
9551 }
9552 return false;
9553
9554 case SMAX:
9555 case SMIN:
9556 if (speed)
9557 {
9558 if (VECTOR_MODE_P (mode))
9559 *cost += extra_cost->vect.alu;
9560 else
9561 {
9562 /* FMAXNM/FMINNM/FMAX/FMIN.
9563 TODO: This may not be accurate for all implementations, but
9564 we do not model this in the cost tables. */
9565 *cost += extra_cost->fp[mode == DFmode].addsub;
9566 }
9567 }
9568 return false;
9569
9570 case UNSPEC:
9571 /* The floating point round to integer frint* instructions. */
9572 if (aarch64_frint_unspec_p (XINT (x, 1)))
9573 {
9574 if (speed)
9575 *cost += extra_cost->fp[mode == DFmode].roundint;
9576
9577 return false;
9578 }
9579
9580 if (XINT (x, 1) == UNSPEC_RBIT)
9581 {
9582 if (speed)
9583 *cost += extra_cost->alu.rev;
9584
9585 return false;
9586 }
9587 break;
9588
9589 case TRUNCATE:
9590
9591 /* Decompose <su>muldi3_highpart. */
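	/* Illustrative note: the RTL shape matched below is what typically
	   results from C source along the lines of

	     uint64_t mulh (uint64_t a, uint64_t b)
	     {
	       return (uint64_t) (((unsigned __int128) a * b) >> 64);
	     }

	   which this port implements with a single UMULH (SMULH for the
	   sign-extended variant).  */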
9592 if (/* (truncate:DI */
9593 mode == DImode
9594 /* (lshiftrt:TI */
9595 && GET_MODE (XEXP (x, 0)) == TImode
9596 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9597 /* (mult:TI */
9598 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9599 /* (ANY_EXTEND:TI (reg:DI))
9600 (ANY_EXTEND:TI (reg:DI))) */
9601 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9602 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9603 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9604 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9605 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9606 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9607 /* (const_int 64) */
9608 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9609 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9610 {
9611 /* UMULH/SMULH. */
9612 if (speed)
9613 *cost += extra_cost->mult[mode == DImode].extend;
9614 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9615 mode, MULT, 0, speed);
9616 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9617 mode, MULT, 1, speed);
9618 return true;
9619 }
9620
9621 /* Fall through. */
9622 default:
9623 break;
9624 }
9625
9626 if (dump_file
9627 && flag_aarch64_verbose_cost)
9628 fprintf (dump_file,
9629 "\nFailed to cost RTX. Assuming default cost.\n");
9630
9631 return true;
9632 }
9633
9634 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9635 calculated for X. This cost is stored in *COST. Returns true
9636 if the total cost of X was calculated. */
9637 static bool
9638 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9639 int param, int *cost, bool speed)
9640 {
9641 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9642
9643 if (dump_file
9644 && flag_aarch64_verbose_cost)
9645 {
9646 print_rtl_single (dump_file, x);
9647 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9648 speed ? "Hot" : "Cold",
9649 *cost, result ? "final" : "partial");
9650 }
9651
9652 return result;
9653 }
9654
9655 static int
9656 aarch64_register_move_cost (machine_mode mode,
9657 reg_class_t from_i, reg_class_t to_i)
9658 {
9659 enum reg_class from = (enum reg_class) from_i;
9660 enum reg_class to = (enum reg_class) to_i;
9661 const struct cpu_regmove_cost *regmove_cost
9662 = aarch64_tune_params.regmove_cost;
9663
9664 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9665 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9666 to = GENERAL_REGS;
9667
9668 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9669 from = GENERAL_REGS;
9670
9671 /* The cost of moving between a GPR and the stack pointer is the same as GP2GP. */
9672 if ((from == GENERAL_REGS && to == STACK_REG)
9673 || (to == GENERAL_REGS && from == STACK_REG))
9674 return regmove_cost->GP2GP;
9675
9676 /* To/From the stack register, we move via the gprs. */
9677 if (to == STACK_REG || from == STACK_REG)
9678 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9679 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9680
9681 if (known_eq (GET_MODE_SIZE (mode), 16))
9682 {
9683 /* 128-bit operations on general registers require 2 instructions. */
9684 if (from == GENERAL_REGS && to == GENERAL_REGS)
9685 return regmove_cost->GP2GP * 2;
9686 else if (from == GENERAL_REGS)
9687 return regmove_cost->GP2FP * 2;
9688 else if (to == GENERAL_REGS)
9689 return regmove_cost->FP2GP * 2;
9690
9691 /* When AdvSIMD instructions are disabled it is not possible to move
9692 a 128-bit value directly between Q registers. This is handled in
9693 secondary reload. A general register is used as a scratch to move
9694 the upper DI value and the lower DI value is moved directly,
9695 hence the cost is the sum of three moves. */
9696 if (! TARGET_SIMD)
9697 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9698
9699 return regmove_cost->FP2FP;
9700 }
9701
9702 if (from == GENERAL_REGS && to == GENERAL_REGS)
9703 return regmove_cost->GP2GP;
9704 else if (from == GENERAL_REGS)
9705 return regmove_cost->GP2FP;
9706 else if (to == GENERAL_REGS)
9707 return regmove_cost->FP2GP;
9708
9709 return regmove_cost->FP2FP;
9710 }
9711
9712 static int
9713 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9714 reg_class_t rclass ATTRIBUTE_UNUSED,
9715 bool in ATTRIBUTE_UNUSED)
9716 {
9717 return aarch64_tune_params.memmov_cost;
9718 }
9719
9720 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9721 to optimize 1.0/sqrt. */
9722
9723 static bool
9724 use_rsqrt_p (machine_mode mode)
9725 {
9726 return (!flag_trapping_math
9727 && flag_unsafe_math_optimizations
9728 && ((aarch64_tune_params.approx_modes->recip_sqrt
9729 & AARCH64_APPROX_MODE (mode))
9730 || flag_mrecip_low_precision_sqrt));
9731 }
9732
9733 /* Function to decide when to use the approximate reciprocal square root
9734 builtin. */
9735
9736 static tree
9737 aarch64_builtin_reciprocal (tree fndecl)
9738 {
9739 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9740
9741 if (!use_rsqrt_p (mode))
9742 return NULL_TREE;
9743 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9744 }
9745
9746 /* Emit instruction sequence to compute either the approximate square root
9747 or its approximate reciprocal, depending on the flag RECP, and return
9748 whether the sequence was emitted or not. */
9749
9750 bool
9751 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9752 {
9753 machine_mode mode = GET_MODE (dst);
9754
9755 if (GET_MODE_INNER (mode) == HFmode)
9756 {
9757 gcc_assert (!recp);
9758 return false;
9759 }
9760
9761 if (!recp)
9762 {
9763 if (!(flag_mlow_precision_sqrt
9764 || (aarch64_tune_params.approx_modes->sqrt
9765 & AARCH64_APPROX_MODE (mode))))
9766 return false;
9767
9768 if (flag_finite_math_only
9769 || flag_trapping_math
9770 || !flag_unsafe_math_optimizations
9771 || optimize_function_for_size_p (cfun))
9772 return false;
9773 }
9774 else
9775 /* Caller assumes we cannot fail. */
9776 gcc_assert (use_rsqrt_p (mode));
9777
9778 machine_mode mmsk = mode_for_int_vector (mode).require ();
9779 rtx xmsk = gen_reg_rtx (mmsk);
9780 if (!recp)
9781 /* When calculating the approximate square root, compare the
9782 argument with 0.0 and create a mask. */
9783 emit_insn (gen_rtx_SET (xmsk,
9784 gen_rtx_NEG (mmsk,
9785 gen_rtx_EQ (mmsk, src,
9786 CONST0_RTX (mode)))));
9787
9788 /* Estimate the approximate reciprocal square root. */
9789 rtx xdst = gen_reg_rtx (mode);
9790 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
9791
9792 /* Iterate over the series twice for SF and thrice for DF. */
9793 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9794
9795 /* Optionally iterate over the series once less for faster performance,
9796 at the cost of some accuracy. */
9797 if ((recp && flag_mrecip_low_precision_sqrt)
9798 || (!recp && flag_mlow_precision_sqrt))
9799 iterations--;
9800
9801 /* Iterate over the series to calculate the approximate reciprocal square
9802 root. */
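  /* Each FRSQRTS step computes (3 - d * x * x) / 2, so the loop below refines
     the initial FRSQRTE estimate with the Newton-Raphson recurrence
     x' = x * (3 - d * x * x) / 2.  A minimal scalar sketch of one refinement
     step, purely for illustration (D is the input, X the running estimate):

	x = x * (3.0f - d * x * x) * 0.5f;  */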
9803 rtx x1 = gen_reg_rtx (mode);
9804 while (iterations--)
9805 {
9806 rtx x2 = gen_reg_rtx (mode);
9807 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9808
9809 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
9810
9811 if (iterations > 0)
9812 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9813 }
9814
9815 if (!recp)
9816 {
9817 /* Qualify the approximate reciprocal square root when the argument is
9818 0.0 by squashing the intermediary result to 0.0. */
9819 rtx xtmp = gen_reg_rtx (mmsk);
9820 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9821 gen_rtx_SUBREG (mmsk, xdst, 0)));
9822 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9823
9824 /* Calculate the approximate square root. */
9825 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9826 }
9827
9828 /* Finalize the approximation. */
9829 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9830
9831 return true;
9832 }
9833
9834 /* Emit the instruction sequence to compute the approximation for the division
9835 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9836
9837 bool
9838 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9839 {
9840 machine_mode mode = GET_MODE (quo);
9841
9842 if (GET_MODE_INNER (mode) == HFmode)
9843 return false;
9844
9845 bool use_approx_division_p = (flag_mlow_precision_div
9846 || (aarch64_tune_params.approx_modes->division
9847 & AARCH64_APPROX_MODE (mode)));
9848
9849 if (!flag_finite_math_only
9850 || flag_trapping_math
9851 || !flag_unsafe_math_optimizations
9852 || optimize_function_for_size_p (cfun)
9853 || !use_approx_division_p)
9854 return false;
9855
9856 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9857 return false;
9858
9859 /* Estimate the approximate reciprocal. */
9860 rtx xrcp = gen_reg_rtx (mode);
9861 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
9862
9863 /* Iterate over the series twice for SF and thrice for DF. */
9864 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9865
9866 /* Optionally iterate over the series once less for faster performance,
9867 at the cost of some accuracy. */
9868 if (flag_mlow_precision_div)
9869 iterations--;
9870
9871 /* Iterate over the series to calculate the approximate reciprocal. */
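  /* Each FRECPS step computes 2 - d * x, so the loop below refines the
     initial FRECPE estimate of 1/d with the Newton-Raphson recurrence
     x' = x * (2 - d * x).  A minimal scalar sketch of one refinement step,
     purely for illustration (D is the divisor, X the running estimate):

	x = x * (2.0f - d * x);  */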
9872 rtx xtmp = gen_reg_rtx (mode);
9873 while (iterations--)
9874 {
9875 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
9876
9877 if (iterations > 0)
9878 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9879 }
9880
9881 if (num != CONST1_RTX (mode))
9882 {
9883 /* As the approximate reciprocal of DEN is already calculated, only
9884 calculate the approximate division when NUM is not 1.0. */
9885 rtx xnum = force_reg (mode, num);
9886 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9887 }
9888
9889 /* Finalize the approximation. */
9890 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9891 return true;
9892 }
9893
9894 /* Return the number of instructions that can be issued per cycle. */
9895 static int
9896 aarch64_sched_issue_rate (void)
9897 {
9898 return aarch64_tune_params.issue_rate;
9899 }
9900
9901 static int
9902 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9903 {
9904 int issue_rate = aarch64_sched_issue_rate ();
9905
9906 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9907 }
9908
9909
9910 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9911 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9912 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9913
9914 static int
9915 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9916 int ready_index)
9917 {
9918 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9919 }
9920
9921
9922 /* Vectorizer cost model target hooks. */
9923
9924 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9925 static int
9926 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9927 tree vectype,
9928 int misalign ATTRIBUTE_UNUSED)
9929 {
9930 unsigned elements;
9931 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9932 bool fp = false;
9933
9934 if (vectype != NULL)
9935 fp = FLOAT_TYPE_P (vectype);
9936
9937 switch (type_of_cost)
9938 {
9939 case scalar_stmt:
9940 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9941
9942 case scalar_load:
9943 return costs->scalar_load_cost;
9944
9945 case scalar_store:
9946 return costs->scalar_store_cost;
9947
9948 case vector_stmt:
9949 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9950
9951 case vector_load:
9952 return costs->vec_align_load_cost;
9953
9954 case vector_store:
9955 return costs->vec_store_cost;
9956
9957 case vec_to_scalar:
9958 return costs->vec_to_scalar_cost;
9959
9960 case scalar_to_vec:
9961 return costs->scalar_to_vec_cost;
9962
9963 case unaligned_load:
9964 case vector_gather_load:
9965 return costs->vec_unalign_load_cost;
9966
9967 case unaligned_store:
9968 case vector_scatter_store:
9969 return costs->vec_unalign_store_cost;
9970
9971 case cond_branch_taken:
9972 return costs->cond_taken_branch_cost;
9973
9974 case cond_branch_not_taken:
9975 return costs->cond_not_taken_branch_cost;
9976
9977 case vec_perm:
9978 return costs->vec_permute_cost;
9979
9980 case vec_promote_demote:
9981 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9982
9983 case vec_construct:
9984 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
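      /* For example (illustrative arithmetic on the formula below):
	 constructing a V4SI from four scalar elements costs 4 / 2 + 1 = 3,
	 and a V2DI costs 2 / 2 + 1 = 2.  */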
9985 return elements / 2 + 1;
9986
9987 default:
9988 gcc_unreachable ();
9989 }
9990 }
9991
9992 /* Implement targetm.vectorize.add_stmt_cost. */
9993 static unsigned
9994 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
9995 struct _stmt_vec_info *stmt_info, int misalign,
9996 enum vect_cost_model_location where)
9997 {
9998 unsigned *cost = (unsigned *) data;
9999 unsigned retval = 0;
10000
10001 if (flag_vect_cost_model)
10002 {
10003 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10004 int stmt_cost =
10005 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10006
10007 /* Statements in an inner loop relative to the loop being
10008 vectorized are weighted more heavily. The value here is
10009 arbitrary and could potentially be improved with analysis. */
10010 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10011 count *= 50; /* FIXME */
10012
10013 retval = (unsigned) (count * stmt_cost);
10014 cost[where] += retval;
10015 }
10016
10017 return retval;
10018 }
10019
10020 static void initialize_aarch64_code_model (struct gcc_options *);
10021
10022 /* Parse the TO_PARSE string and put the architecture struct that it
10023 selects into RES and the architectural features into ISA_FLAGS.
10024 Return an aarch64_parse_opt_result describing the parse result.
10025 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10026
10027 static enum aarch64_parse_opt_result
10028 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10029 unsigned long *isa_flags)
10030 {
10031 char *ext;
10032 const struct processor *arch;
10033 char *str = (char *) alloca (strlen (to_parse) + 1);
10034 size_t len;
10035
10036 strcpy (str, to_parse);
10037
10038 ext = strchr (str, '+');
10039
10040 if (ext != NULL)
10041 len = ext - str;
10042 else
10043 len = strlen (str);
10044
10045 if (len == 0)
10046 return AARCH64_PARSE_MISSING_ARG;
10047
10048
10049 /* Loop through the list of supported ARCHes to find a match. */
10050 for (arch = all_architectures; arch->name != NULL; arch++)
10051 {
10052 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10053 {
10054 unsigned long isa_temp = arch->flags;
10055
10056 if (ext != NULL)
10057 {
10058 /* TO_PARSE string contains at least one extension. */
10059 enum aarch64_parse_opt_result ext_res
10060 = aarch64_parse_extension (ext, &isa_temp);
10061
10062 if (ext_res != AARCH64_PARSE_OK)
10063 return ext_res;
10064 }
10065 /* Extension parsing was successful. Confirm the result
10066 arch and ISA flags. */
10067 *res = arch;
10068 *isa_flags = isa_temp;
10069 return AARCH64_PARSE_OK;
10070 }
10071 }
10072
10073 /* ARCH name not found in list. */
10074 return AARCH64_PARSE_INVALID_ARG;
10075 }
10076
10077 /* Parse the TO_PARSE string and put the result tuning in RES and the
10078 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10079 describing the parse result. If there is an error parsing, RES and
10080 ISA_FLAGS are left unchanged. */
10081
10082 static enum aarch64_parse_opt_result
10083 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10084 unsigned long *isa_flags)
10085 {
10086 char *ext;
10087 const struct processor *cpu;
10088 char *str = (char *) alloca (strlen (to_parse) + 1);
10089 size_t len;
10090
10091 strcpy (str, to_parse);
10092
10093 ext = strchr (str, '+');
10094
10095 if (ext != NULL)
10096 len = ext - str;
10097 else
10098 len = strlen (str);
10099
10100 if (len == 0)
10101 return AARCH64_PARSE_MISSING_ARG;
10102
10103
10104 /* Loop through the list of supported CPUs to find a match. */
10105 for (cpu = all_cores; cpu->name != NULL; cpu++)
10106 {
10107 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10108 {
10109 unsigned long isa_temp = cpu->flags;
10110
10111
10112 if (ext != NULL)
10113 {
10114 /* TO_PARSE string contains at least one extension. */
10115 enum aarch64_parse_opt_result ext_res
10116 = aarch64_parse_extension (ext, &isa_temp);
10117
10118 if (ext_res != AARCH64_PARSE_OK)
10119 return ext_res;
10120 }
10121 /* Extension parsing was successful. Confirm the result
10122 cpu and ISA flags. */
10123 *res = cpu;
10124 *isa_flags = isa_temp;
10125 return AARCH64_PARSE_OK;
10126 }
10127 }
10128
10129 /* CPU name not found in list. */
10130 return AARCH64_PARSE_INVALID_ARG;
10131 }
10132
10133 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10134 Return an aarch64_parse_opt_result describing the parse result.
10135 If the parsing fails the RES does not change. */
10136
10137 static enum aarch64_parse_opt_result
10138 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10139 {
10140 const struct processor *cpu;
10141 char *str = (char *) alloca (strlen (to_parse) + 1);
10142
10143 strcpy (str, to_parse);
10144
10145 /* Loop through the list of supported CPUs to find a match. */
10146 for (cpu = all_cores; cpu->name != NULL; cpu++)
10147 {
10148 if (strcmp (cpu->name, str) == 0)
10149 {
10150 *res = cpu;
10151 return AARCH64_PARSE_OK;
10152 }
10153 }
10154
10155 /* CPU name not found in list. */
10156 return AARCH64_PARSE_INVALID_ARG;
10157 }
10158
10159 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10160 described in FLAG. If it is, return the index bit for that fusion type.
10161 If not, error (printing OPTION_NAME) and return zero. */
10162
10163 static unsigned int
10164 aarch64_parse_one_option_token (const char *token,
10165 size_t length,
10166 const struct aarch64_flag_desc *flag,
10167 const char *option_name)
10168 {
10169 for (; flag->name != NULL; flag++)
10170 {
10171 if (length == strlen (flag->name)
10172 && !strncmp (flag->name, token, length))
10173 return flag->flag;
10174 }
10175
10176 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10177 return 0;
10178 }
10179
10180 /* Parse OPTION which is a comma-separated list of flags to enable.
10181 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10182 default state we inherit from the CPU tuning structures. OPTION_NAME
10183 gives the top-level option we are parsing in the -moverride string,
10184 for use in error messages. */
10185
10186 static unsigned int
10187 aarch64_parse_boolean_options (const char *option,
10188 const struct aarch64_flag_desc *flags,
10189 unsigned int initial_state,
10190 const char *option_name)
10191 {
10192 const char separator = '.';
10193 const char* specs = option;
10194 const char* ntoken = option;
10195 unsigned int found_flags = initial_state;
10196
10197 while ((ntoken = strchr (specs, separator)))
10198 {
10199 size_t token_length = ntoken - specs;
10200 unsigned token_ops = aarch64_parse_one_option_token (specs,
10201 token_length,
10202 flags,
10203 option_name);
10204 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10205 in the token stream, reset the supported operations. So:
10206
10207 adrp+add.cmp+branch.none.adrp+add
10208
10209 would have the result of turning on only adrp+add fusion. */
10210 if (!token_ops)
10211 found_flags = 0;
10212
10213 found_flags |= token_ops;
10214 specs = ++ntoken;
10215 }
10216
10217 /* The string ended with a trailing separator; that is ill-formed. */
10218 if (!(*specs))
10219 {
10220 error ("%s string ill-formed\n", option_name);
10221 return 0;
10222 }
10223
10224 /* We still have one more token to parse. */
10225 size_t token_length = strlen (specs);
10226 unsigned token_ops = aarch64_parse_one_option_token (specs,
10227 token_length,
10228 flags,
10229 option_name);
10230 if (!token_ops)
10231 found_flags = 0;
10232
10233 found_flags |= token_ops;
10234 return found_flags;
10235 }
10236
10237 /* Support for overriding instruction fusion. */
10238
10239 static void
10240 aarch64_parse_fuse_string (const char *fuse_string,
10241 struct tune_params *tune)
10242 {
10243 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10244 aarch64_fusible_pairs,
10245 tune->fusible_ops,
10246 "fuse=");
10247 }
10248
10249 /* Support for overriding other tuning flags. */
10250
10251 static void
10252 aarch64_parse_tune_string (const char *tune_string,
10253 struct tune_params *tune)
10254 {
10255 tune->extra_tuning_flags
10256 = aarch64_parse_boolean_options (tune_string,
10257 aarch64_tuning_flags,
10258 tune->extra_tuning_flags,
10259 "tune=");
10260 }
10261
10262 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10263 we understand. If it is, extract the option string and hand it off to
10264 the appropriate function. */
10265
10266 void
10267 aarch64_parse_one_override_token (const char* token,
10268 size_t length,
10269 struct tune_params *tune)
10270 {
10271 const struct aarch64_tuning_override_function *fn
10272 = aarch64_tuning_override_functions;
10273
10274 const char *option_part = strchr (token, '=');
10275 if (!option_part)
10276 {
10277 error ("tuning string missing in option (%s)", token);
10278 return;
10279 }
10280
10281 /* Get the length of the option name. */
10282 length = option_part - token;
10283 /* Skip the '=' to get to the option string. */
10284 option_part++;
10285
10286 for (; fn->name != NULL; fn++)
10287 {
10288 if (!strncmp (fn->name, token, length))
10289 {
10290 fn->parse_override (option_part, tune);
10291 return;
10292 }
10293 }
10294
10295 error ("unknown tuning option (%s)", token);
10296 return;
10297 }
10298
10299 /* Validate and clamp the TLS size for the selected code model. */
10300
10301 static void
10302 initialize_aarch64_tls_size (struct gcc_options *opts)
10303 {
10304 if (aarch64_tls_size == 0)
10305 aarch64_tls_size = 24;
10306
10307 switch (opts->x_aarch64_cmodel_var)
10308 {
10309 case AARCH64_CMODEL_TINY:
10310 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10311 needs two instructions to address, so we clamp the size to 24 bits. */
10312 if (aarch64_tls_size > 24)
10313 aarch64_tls_size = 24;
10314 break;
10315 case AARCH64_CMODEL_SMALL:
10316 /* The maximum TLS size allowed under small is 4G. */
10317 if (aarch64_tls_size > 32)
10318 aarch64_tls_size = 32;
10319 break;
10320 case AARCH64_CMODEL_LARGE:
10321 /* The maximum TLS size allowed under large is 16E.
10322 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
10323 if (aarch64_tls_size > 48)
10324 aarch64_tls_size = 48;
10325 break;
10326 default:
10327 gcc_unreachable ();
10328 }
10329
10330 return;
10331 }
10332
10333 /* Parse STRING looking for options in the format:
10334 string :: option:string
10335 option :: name=substring
10336 name :: {a-z}
10337 substring :: defined by option. */
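/* As an illustration (the flag names are only examples; the authoritative
   lists live in aarch64_fusible_pairs and aarch64_tuning_flags), a string
   such as

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is split on ':' below and each name=substring option is handed to
   aarch64_parse_one_override_token.  */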
10338
10339 static void
10340 aarch64_parse_override_string (const char* input_string,
10341 struct tune_params* tune)
10342 {
10343 const char separator = ':';
10344 size_t string_length = strlen (input_string) + 1;
10345 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10346 char *string = string_root;
10347 strncpy (string, input_string, string_length);
10348 string[string_length - 1] = '\0';
10349
10350 char* ntoken = string;
10351
10352 while ((ntoken = strchr (string, separator)))
10353 {
10354 size_t token_length = ntoken - string;
10355 /* Make this substring look like a string. */
10356 *ntoken = '\0';
10357 aarch64_parse_one_override_token (string, token_length, tune);
10358 string = ++ntoken;
10359 }
10360
10361 /* One last option to parse. */
10362 aarch64_parse_one_override_token (string, strlen (string), tune);
10363 free (string_root);
10364 }
10365
10366
10367 static void
10368 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10369 {
10370 /* PR 70044: We have to be careful about being called multiple times for the
10371 same function. This means all changes should be repeatable. */
10372
10373 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10374 Disable the frame pointer flag so the mid-end will not use a frame
10375 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10376 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10377 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10378 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10379 if (opts->x_flag_omit_frame_pointer == 0)
10380 opts->x_flag_omit_frame_pointer = 2;
10381
10382 /* If not optimizing for size, set the default
10383 alignment to what the target wants. */
10384 if (!opts->x_optimize_size)
10385 {
10386 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10387 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10388 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10389 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10390 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10391 opts->x_str_align_functions = aarch64_tune_params.function_align;
10392 }
10393
10394 /* We default to no pc-relative literal loads. */
10395
10396 aarch64_pcrelative_literal_loads = false;
10397
10398 /* If -mpc-relative-literal-loads is set on the command line, this
10399 implies that the user asked for PC relative literal loads. */
10400 if (opts->x_pcrelative_literal_loads == 1)
10401 aarch64_pcrelative_literal_loads = true;
10402
10403 /* In the tiny memory model it makes no sense to disallow PC relative
10404 literal pool loads. */
10405 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10406 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10407 aarch64_pcrelative_literal_loads = true;
10408
10409 /* When enabling the lower precision Newton series for the square root, also
10410 enable it for the reciprocal square root, since the latter is an
10411 intermediary step for the former. */
10412 if (flag_mlow_precision_sqrt)
10413 flag_mrecip_low_precision_sqrt = true;
10414 }
10415
10416 /* 'Unpack' the internal tuning structs and update the options
10417 in OPTS. The caller must have set up selected_tune and selected_arch
10418 as all the other target-specific codegen decisions are
10419 derived from them. */
10420
10421 void
10422 aarch64_override_options_internal (struct gcc_options *opts)
10423 {
10424 aarch64_tune_flags = selected_tune->flags;
10425 aarch64_tune = selected_tune->sched_core;
10426 /* Make a copy of the tuning parameters attached to the core, which
10427 we may later overwrite. */
10428 aarch64_tune_params = *(selected_tune->tune);
10429 aarch64_architecture_version = selected_arch->architecture_version;
10430
10431 if (opts->x_aarch64_override_tune_string)
10432 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10433 &aarch64_tune_params);
10434
10435 /* This target defaults to strict volatile bitfields. */
10436 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10437 opts->x_flag_strict_volatile_bitfields = 1;
10438
10439 initialize_aarch64_code_model (opts);
10440 initialize_aarch64_tls_size (opts);
10441
10442 int queue_depth = 0;
10443 switch (aarch64_tune_params.autoprefetcher_model)
10444 {
10445 case tune_params::AUTOPREFETCHER_OFF:
10446 queue_depth = -1;
10447 break;
10448 case tune_params::AUTOPREFETCHER_WEAK:
10449 queue_depth = 0;
10450 break;
10451 case tune_params::AUTOPREFETCHER_STRONG:
10452 queue_depth = max_insn_queue_index + 1;
10453 break;
10454 default:
10455 gcc_unreachable ();
10456 }
10457
10458 /* We don't mind passing in global_options_set here as we don't use
10459 the *options_set structs anyway. */
10460 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10461 queue_depth,
10462 opts->x_param_values,
10463 global_options_set.x_param_values);
10464
10465 /* Set up parameters to be used in prefetching algorithm. Do not
10466 override the defaults unless we are tuning for a core we have
10467 researched values for. */
10468 if (aarch64_tune_params.prefetch->num_slots > 0)
10469 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10470 aarch64_tune_params.prefetch->num_slots,
10471 opts->x_param_values,
10472 global_options_set.x_param_values);
10473 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10474 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10475 aarch64_tune_params.prefetch->l1_cache_size,
10476 opts->x_param_values,
10477 global_options_set.x_param_values);
10478 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10479 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10480 aarch64_tune_params.prefetch->l1_cache_line_size,
10481 opts->x_param_values,
10482 global_options_set.x_param_values);
10483 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10484 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10485 aarch64_tune_params.prefetch->l2_cache_size,
10486 opts->x_param_values,
10487 global_options_set.x_param_values);
10488 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10489 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10490 0,
10491 opts->x_param_values,
10492 global_options_set.x_param_values);
10493 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10494 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10495 aarch64_tune_params.prefetch->minimum_stride,
10496 opts->x_param_values,
10497 global_options_set.x_param_values);
10498
10499 /* Use the alternative scheduling-pressure algorithm by default. */
10500 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10501 opts->x_param_values,
10502 global_options_set.x_param_values);
10503
10504 /* Enable software prefetching at the specified optimization level for
10505 CPUs that have prefetch support. Lower the optimization level
10506 threshold by 1 when profiling is enabled. */
10507 if (opts->x_flag_prefetch_loop_arrays < 0
10508 && !opts->x_optimize_size
10509 && aarch64_tune_params.prefetch->default_opt_level >= 0
10510 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10511 opts->x_flag_prefetch_loop_arrays = 1;
10512
10513 if (opts->x_aarch64_arch_string == NULL)
10514 opts->x_aarch64_arch_string = selected_arch->name;
10515 if (opts->x_aarch64_cpu_string == NULL)
10516 opts->x_aarch64_cpu_string = selected_cpu->name;
10517 if (opts->x_aarch64_tune_string == NULL)
10518 opts->x_aarch64_tune_string = selected_tune->name;
10519
10520 aarch64_override_options_after_change_1 (opts);
10521 }
10522
10523 /* Print a hint with a suggestion for a core or architecture name that
10524 most closely resembles what the user passed in STR. ARCH is true if
10525 the user is asking for an architecture name. ARCH is false if the user
10526 is asking for a core name. */
10527
10528 static void
10529 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10530 {
10531 auto_vec<const char *> candidates;
10532 const struct processor *entry = arch ? all_architectures : all_cores;
10533 for (; entry->name != NULL; entry++)
10534 candidates.safe_push (entry->name);
10535
10536 #ifdef HAVE_LOCAL_CPU_DETECT
10537 /* Add also "native" as possible value. */
10538 if (arch)
10539 candidates.safe_push ("native");
10540 #endif
10541
10542 char *s;
10543 const char *hint = candidates_list_and_hint (str, s, candidates);
10544 if (hint)
10545 inform (input_location, "valid arguments are: %s;"
10546 " did you mean %qs?", s, hint);
10547 else
10548 inform (input_location, "valid arguments are: %s", s);
10549
10550 XDELETEVEC (s);
10551 }
10552
10553 /* Print a hint with a suggestion for a core name that most closely resembles
10554 what the user passed in STR. */
10555
10556 inline static void
10557 aarch64_print_hint_for_core (const char *str)
10558 {
10559 aarch64_print_hint_for_core_or_arch (str, false);
10560 }
10561
10562 /* Print a hint with a suggestion for an architecture name that most closely
10563 resembles what the user passed in STR. */
10564
10565 inline static void
10566 aarch64_print_hint_for_arch (const char *str)
10567 {
10568 aarch64_print_hint_for_core_or_arch (str, true);
10569 }
10570
10571 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10572 specified in STR and throw errors if appropriate. Put the results, if
10573 they are valid, in RES and ISA_FLAGS. Return whether the option is
10574 valid. */
10575
10576 static bool
10577 aarch64_validate_mcpu (const char *str, const struct processor **res,
10578 unsigned long *isa_flags)
10579 {
10580 enum aarch64_parse_opt_result parse_res
10581 = aarch64_parse_cpu (str, res, isa_flags);
10582
10583 if (parse_res == AARCH64_PARSE_OK)
10584 return true;
10585
10586 switch (parse_res)
10587 {
10588 case AARCH64_PARSE_MISSING_ARG:
10589 error ("missing cpu name in %<-mcpu=%s%>", str);
10590 break;
10591 case AARCH64_PARSE_INVALID_ARG:
10592 error ("unknown value %qs for -mcpu", str);
10593 aarch64_print_hint_for_core (str);
10594 break;
10595 case AARCH64_PARSE_INVALID_FEATURE:
10596 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10597 break;
10598 default:
10599 gcc_unreachable ();
10600 }
10601
10602 return false;
10603 }
10604
10605 /* Validate a command-line -march option. Parse the arch and extensions
10606 (if any) specified in STR and throw errors if appropriate. Put the
10607 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10608 option is valid. */
10609
10610 static bool
10611 aarch64_validate_march (const char *str, const struct processor **res,
10612 unsigned long *isa_flags)
10613 {
10614 enum aarch64_parse_opt_result parse_res
10615 = aarch64_parse_arch (str, res, isa_flags);
10616
10617 if (parse_res == AARCH64_PARSE_OK)
10618 return true;
10619
10620 switch (parse_res)
10621 {
10622 case AARCH64_PARSE_MISSING_ARG:
10623 error ("missing arch name in %<-march=%s%>", str);
10624 break;
10625 case AARCH64_PARSE_INVALID_ARG:
10626 error ("unknown value %qs for -march", str);
10627 aarch64_print_hint_for_arch (str);
10628 break;
10629 case AARCH64_PARSE_INVALID_FEATURE:
10630 error ("invalid feature modifier in %<-march=%s%>", str);
10631 break;
10632 default:
10633 gcc_unreachable ();
10634 }
10635
10636 return false;
10637 }
10638
10639 /* Validate a command-line -mtune option. Parse the cpu
10640 specified in STR and throw errors if appropriate. Put the
10641 result, if it is valid, in RES. Return whether the option is
10642 valid. */
10643
10644 static bool
10645 aarch64_validate_mtune (const char *str, const struct processor **res)
10646 {
10647 enum aarch64_parse_opt_result parse_res
10648 = aarch64_parse_tune (str, res);
10649
10650 if (parse_res == AARCH64_PARSE_OK)
10651 return true;
10652
10653 switch (parse_res)
10654 {
10655 case AARCH64_PARSE_MISSING_ARG:
10656 error ("missing cpu name in %<-mtune=%s%>", str);
10657 break;
10658 case AARCH64_PARSE_INVALID_ARG:
10659 error ("unknown value %qs for -mtune", str);
10660 aarch64_print_hint_for_core (str);
10661 break;
10662 default:
10663 gcc_unreachable ();
10664 }
10665 return false;
10666 }
10667
10668 /* Return the CPU corresponding to the enum CPU.
10669 If it doesn't specify a cpu, return the default. */
10670
10671 static const struct processor *
10672 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10673 {
10674 if (cpu != aarch64_none)
10675 return &all_cores[cpu];
10676
10677 /* The & 0x3f is to extract the bottom 6 bits that encode the
10678 default cpu as selected by the --with-cpu GCC configure option
10679 in config.gcc.
10680 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10681 flags mechanism should be reworked to make it more sane. */
10682 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10683 }
10684
10685 /* Return the architecture corresponding to the enum ARCH.
10686 If it doesn't specify a valid architecture, return the default. */
10687
10688 static const struct processor *
10689 aarch64_get_arch (enum aarch64_arch arch)
10690 {
10691 if (arch != aarch64_no_arch)
10692 return &all_architectures[arch];
10693
10694 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10695
10696 return &all_architectures[cpu->arch];
10697 }
10698
10699 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10700
10701 static poly_uint16
10702 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10703 {
10704 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10705 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10706 deciding which .md file patterns to use and when deciding whether
10707 something is a legitimate address or constant. */
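  /* Illustrative examples: -msve-vector-bits=256 therefore maps to a VG of
     256 / 64 = 4 and -msve-vector-bits=512 to 8, while SVE_SCALABLE and
     SVE_128 both map to the runtime-variable (2, 2) returned below.  */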
10708 if (value == SVE_SCALABLE || value == SVE_128)
10709 return poly_uint16 (2, 2);
10710 else
10711 return (int) value / 64;
10712 }
10713
10714 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10715 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10716 tuning structs. In particular it must set selected_tune and
10717 aarch64_isa_flags that define the available ISA features and tuning
10718 decisions. It must also set selected_arch as this will be used to
10719 output the .arch asm tags for each function. */
10720
10721 static void
10722 aarch64_override_options (void)
10723 {
10724 unsigned long cpu_isa = 0;
10725 unsigned long arch_isa = 0;
10726 aarch64_isa_flags = 0;
10727
10728 bool valid_cpu = true;
10729 bool valid_tune = true;
10730 bool valid_arch = true;
10731
10732 selected_cpu = NULL;
10733 selected_arch = NULL;
10734 selected_tune = NULL;
10735
10736 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10737 If either of -march or -mtune is given, they override their
10738 respective component of -mcpu. */
10739 if (aarch64_cpu_string)
10740 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10741 &cpu_isa);
10742
10743 if (aarch64_arch_string)
10744 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10745 &arch_isa);
10746
10747 if (aarch64_tune_string)
10748 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10749
10750 /* If the user did not specify a processor, choose the default
10751 one for them. This will be the CPU set during configuration using
10752 --with-cpu, otherwise it is "generic". */
10753 if (!selected_cpu)
10754 {
10755 if (selected_arch)
10756 {
10757 selected_cpu = &all_cores[selected_arch->ident];
10758 aarch64_isa_flags = arch_isa;
10759 explicit_arch = selected_arch->arch;
10760 }
10761 else
10762 {
10763 /* Get default configure-time CPU. */
10764 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10765 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10766 }
10767
10768 if (selected_tune)
10769 explicit_tune_core = selected_tune->ident;
10770 }
10771 /* If both -mcpu and -march are specified check that they are architecturally
10772 compatible, warn if they're not and prefer the -march ISA flags. */
10773 else if (selected_arch)
10774 {
10775 if (selected_arch->arch != selected_cpu->arch)
10776 {
10777 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10778 all_architectures[selected_cpu->arch].name,
10779 selected_arch->name);
10780 }
10781 aarch64_isa_flags = arch_isa;
10782 explicit_arch = selected_arch->arch;
10783 explicit_tune_core = selected_tune ? selected_tune->ident
10784 : selected_cpu->ident;
10785 }
10786 else
10787 {
10788 /* -mcpu but no -march. */
10789 aarch64_isa_flags = cpu_isa;
10790 explicit_tune_core = selected_tune ? selected_tune->ident
10791 : selected_cpu->ident;
10792 gcc_assert (selected_cpu);
10793 selected_arch = &all_architectures[selected_cpu->arch];
10794 explicit_arch = selected_arch->arch;
10795 }
10796
10797 /* Set the arch as well, as we will need it when outputting
10798 the .arch directive in assembly. */
10799 if (!selected_arch)
10800 {
10801 gcc_assert (selected_cpu);
10802 selected_arch = &all_architectures[selected_cpu->arch];
10803 }
10804
10805 if (!selected_tune)
10806 selected_tune = selected_cpu;
10807
10808 #ifndef HAVE_AS_MABI_OPTION
10809 /* The compiler may have been configured with 2.23.* binutils, which does
10810 not have support for ILP32. */
10811 if (TARGET_ILP32)
10812 error ("assembler does not support -mabi=ilp32");
10813 #endif
10814
10815 /* Convert -msve-vector-bits to a VG count. */
10816 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10817
10818 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10819 sorry ("return address signing is only supported for -mabi=lp64");
10820
10821 /* Make sure we properly set up the explicit options. */
10822 if ((aarch64_cpu_string && valid_cpu)
10823 || (aarch64_tune_string && valid_tune))
10824 gcc_assert (explicit_tune_core != aarch64_none);
10825
10826 if ((aarch64_cpu_string && valid_cpu)
10827 || (aarch64_arch_string && valid_arch))
10828 gcc_assert (explicit_arch != aarch64_no_arch);
10829
10830 aarch64_override_options_internal (&global_options);
10831
10832 /* Save these options as the default ones in case we push and pop them later
10833 while processing functions with potential target attributes. */
10834 target_option_default_node = target_option_current_node
10835 = build_target_option_node (&global_options);
10836 }
10837
10838 /* Implement targetm.override_options_after_change. */
10839
10840 static void
10841 aarch64_override_options_after_change (void)
10842 {
10843 aarch64_override_options_after_change_1 (&global_options);
10844 }
10845
10846 static struct machine_function *
10847 aarch64_init_machine_status (void)
10848 {
10849 struct machine_function *machine;
10850 machine = ggc_cleared_alloc<machine_function> ();
10851 return machine;
10852 }
10853
10854 void
10855 aarch64_init_expanders (void)
10856 {
10857 init_machine_status = aarch64_init_machine_status;
10858 }
10859
10860 /* Set aarch64_cmodel from the selected code model, adjusting for -fpic/-fPIC. */
10861 static void
10862 initialize_aarch64_code_model (struct gcc_options *opts)
10863 {
10864 if (opts->x_flag_pic)
10865 {
10866 switch (opts->x_aarch64_cmodel_var)
10867 {
10868 case AARCH64_CMODEL_TINY:
10869 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10870 break;
10871 case AARCH64_CMODEL_SMALL:
10872 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10873 aarch64_cmodel = (flag_pic == 2
10874 ? AARCH64_CMODEL_SMALL_PIC
10875 : AARCH64_CMODEL_SMALL_SPIC);
10876 #else
10877 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10878 #endif
10879 break;
10880 case AARCH64_CMODEL_LARGE:
10881 sorry ("code model %qs with -f%s", "large",
10882 opts->x_flag_pic > 1 ? "PIC" : "pic");
10883 break;
10884 default:
10885 gcc_unreachable ();
10886 }
10887 }
10888 else
10889 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10890 }
10891
10892 /* Implement TARGET_OPTION_SAVE. */
10893
10894 static void
10895 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10896 {
10897 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10898 }
10899
10900 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10901 using the information saved in PTR. */
10902
10903 static void
10904 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10905 {
10906 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10907 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10908 opts->x_explicit_arch = ptr->x_explicit_arch;
10909 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10910 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10911
10912 aarch64_override_options_internal (opts);
10913 }
10914
10915 /* Implement TARGET_OPTION_PRINT. */
10916
10917 static void
10918 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10919 {
10920 const struct processor *cpu
10921 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10922 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10923 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10924 std::string extension
10925 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10926
10927 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10928 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10929 arch->name, extension.c_str ());
10930 }
10931
10932 static GTY(()) tree aarch64_previous_fndecl;
10933
10934 void
10935 aarch64_reset_previous_fndecl (void)
10936 {
10937 aarch64_previous_fndecl = NULL;
10938 }
10939
10940 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10941 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10942 make sure optab availability predicates are recomputed when necessary. */
10943
10944 void
10945 aarch64_save_restore_target_globals (tree new_tree)
10946 {
10947 if (TREE_TARGET_GLOBALS (new_tree))
10948 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10949 else if (new_tree == target_option_default_node)
10950 restore_target_globals (&default_target_globals);
10951 else
10952 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10953 }
10954
10955 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10956 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10957 of the function, if such exists. This function may be called multiple
10958 times on a single function so use aarch64_previous_fndecl to avoid
10959 setting up identical state. */
10960
10961 static void
10962 aarch64_set_current_function (tree fndecl)
10963 {
10964 if (!fndecl || fndecl == aarch64_previous_fndecl)
10965 return;
10966
10967 tree old_tree = (aarch64_previous_fndecl
10968 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10969 : NULL_TREE);
10970
10971 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10972
10973 /* If current function has no attributes but the previous one did,
10974 use the default node. */
10975 if (!new_tree && old_tree)
10976 new_tree = target_option_default_node;
10977
10978 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10979 the default have been handled by aarch64_save_restore_target_globals from
10980 aarch64_pragma_target_parse. */
10981 if (old_tree == new_tree)
10982 return;
10983
10984 aarch64_previous_fndecl = fndecl;
10985
10986 /* First set the target options. */
10987 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
10988
10989 aarch64_save_restore_target_globals (new_tree);
10990 }
10991
10992 /* Enum describing the various ways we can handle attributes.
10993 In many cases we can reuse the generic option handling machinery. */
10994
10995 enum aarch64_attr_opt_type
10996 {
10997 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
10998 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
10999 aarch64_attr_enum, /* Attribute sets an enum variable. */
11000 aarch64_attr_custom /* Attribute requires a custom handling function. */
11001 };
11002
11003 /* All the information needed to handle a target attribute.
11004 NAME is the name of the attribute.
11005 ATTR_TYPE specifies the type of behavior of the attribute as described
11006 in the definition of enum aarch64_attr_opt_type.
11007 ALLOW_NEG is true if the attribute supports a "no-" form.
11008 HANDLER is the function that takes the attribute string as an argument.
11009 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11010 OPT_NUM is the enum specifying the option that the attribute modifies.
11011 This is needed for attributes that mirror the behavior of a command-line
11012 option, that is, those whose ATTR_TYPE is aarch64_attr_mask,
11013 aarch64_attr_bool or aarch64_attr_enum. */
11014
11015 struct aarch64_attribute_info
11016 {
11017 const char *name;
11018 enum aarch64_attr_opt_type attr_type;
11019 bool allow_neg;
11020 bool (*handler) (const char *);
11021 enum opt_code opt_num;
11022 };
11023
11024 /* Handle the ARCH_STR argument to the arch= target attribute. */
11025
11026 static bool
11027 aarch64_handle_attr_arch (const char *str)
11028 {
11029 const struct processor *tmp_arch = NULL;
11030 enum aarch64_parse_opt_result parse_res
11031 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11032
11033 if (parse_res == AARCH64_PARSE_OK)
11034 {
11035 gcc_assert (tmp_arch);
11036 selected_arch = tmp_arch;
11037 explicit_arch = selected_arch->arch;
11038 return true;
11039 }
11040
11041 switch (parse_res)
11042 {
11043 case AARCH64_PARSE_MISSING_ARG:
11044 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11045 break;
11046 case AARCH64_PARSE_INVALID_ARG:
11047 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11048 aarch64_print_hint_for_arch (str);
11049 break;
11050 case AARCH64_PARSE_INVALID_FEATURE:
11051 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11052 break;
11053 default:
11054 gcc_unreachable ();
11055 }
11056
11057 return false;
11058 }
11059
11060 /* Handle the argument CPU_STR to the cpu= target attribute. */
11061
11062 static bool
11063 aarch64_handle_attr_cpu (const char *str)
11064 {
11065 const struct processor *tmp_cpu = NULL;
11066 enum aarch64_parse_opt_result parse_res
11067 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11068
11069 if (parse_res == AARCH64_PARSE_OK)
11070 {
11071 gcc_assert (tmp_cpu);
11072 selected_tune = tmp_cpu;
11073 explicit_tune_core = selected_tune->ident;
11074
11075 selected_arch = &all_architectures[tmp_cpu->arch];
11076 explicit_arch = selected_arch->arch;
11077 return true;
11078 }
11079
11080 switch (parse_res)
11081 {
11082 case AARCH64_PARSE_MISSING_ARG:
11083 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11084 break;
11085 case AARCH64_PARSE_INVALID_ARG:
11086 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11087 aarch64_print_hint_for_core (str);
11088 break;
11089 case AARCH64_PARSE_INVALID_FEATURE:
11090 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11091 break;
11092 default:
11093 gcc_unreachable ();
11094 }
11095
11096 return false;
11097 }
11098
11099 /* Handle the argument STR to the tune= target attribute. */
11100
11101 static bool
11102 aarch64_handle_attr_tune (const char *str)
11103 {
11104 const struct processor *tmp_tune = NULL;
11105 enum aarch64_parse_opt_result parse_res
11106 = aarch64_parse_tune (str, &tmp_tune);
11107
11108 if (parse_res == AARCH64_PARSE_OK)
11109 {
11110 gcc_assert (tmp_tune);
11111 selected_tune = tmp_tune;
11112 explicit_tune_core = selected_tune->ident;
11113 return true;
11114 }
11115
11116 switch (parse_res)
11117 {
11118 case AARCH64_PARSE_INVALID_ARG:
11119 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11120 aarch64_print_hint_for_core (str);
11121 break;
11122 default:
11123 gcc_unreachable ();
11124 }
11125
11126 return false;
11127 }
11128
11129 /* Parse an architecture extensions target attribute string specified in STR.
11130 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11131 if successful. Update aarch64_isa_flags to reflect the ISA features
11132 modified. */
11133
11134 static bool
11135 aarch64_handle_attr_isa_flags (char *str)
11136 {
11137 enum aarch64_parse_opt_result parse_res;
11138 unsigned long isa_flags = aarch64_isa_flags;
11139
11140 /* We allow "+nothing" in the beginning to clear out all architectural
11141 features if the user wants to handpick specific features. */
11142 if (strncmp ("+nothing", str, 8) == 0)
11143 {
11144 isa_flags = 0;
11145 str += 8;
11146 }
11147
11148 parse_res = aarch64_parse_extension (str, &isa_flags);
11149
11150 if (parse_res == AARCH64_PARSE_OK)
11151 {
11152 aarch64_isa_flags = isa_flags;
11153 return true;
11154 }
11155
11156 switch (parse_res)
11157 {
11158 case AARCH64_PARSE_MISSING_ARG:
11159 error ("missing value in %<target()%> pragma or attribute");
11160 break;
11161
11162 case AARCH64_PARSE_INVALID_FEATURE:
11163 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11164 break;
11165
11166 default:
11167 gcc_unreachable ();
11168 }
11169
11170 return false;
11171 }
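
/* As an illustration (guarded out of the build; the function name below is
   hypothetical), the kind of string the parser above accepts: "+nothing"
   first clears every architectural feature, then named extensions are
   re-enabled one by one.  */
#if 0
__attribute__ ((target ("+nothing+fp+simd")))
static double
example_fp_simd_only (double x)
{
  /* All ISA extensions cleared, then FP and Advanced SIMD re-enabled.  */
  return x * 2.0;
}
#endif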
11172
11173 /* The target attributes that we support. On top of these we also support just
11174 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11175 handled explicitly in aarch64_process_one_target_attr. */
11176
11177 static const struct aarch64_attribute_info aarch64_attributes[] =
11178 {
11179 { "general-regs-only", aarch64_attr_mask, false, NULL,
11180 OPT_mgeneral_regs_only },
11181 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11182 OPT_mfix_cortex_a53_835769 },
11183 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11184 OPT_mfix_cortex_a53_843419 },
11185 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11186 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11187 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11188 OPT_momit_leaf_frame_pointer },
11189 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11190 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11191 OPT_march_ },
11192 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11193 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11194 OPT_mtune_ },
11195 { "sign-return-address", aarch64_attr_enum, false, NULL,
11196 OPT_msign_return_address_ },
11197 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11198 };
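
/* An illustrative sketch (guarded out of the build; the function name and
   the chosen values are hypothetical examples) of how the entries above
   surface at the source level: "tune=" goes through its custom handler,
   "no-strict-align" is accepted because the "strict-align" entry allows a
   negated form, and "sign-return-address=" is handled as an enum option.  */
#if 0
__attribute__ ((target ("tune=cortex-a57,no-strict-align,sign-return-address=non-leaf")))
static int
example_attribute_forms (int x)
{
  return x + 1;
}
#endif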
11199
11200 /* Parse ARG_STR which contains the definition of one target attribute.
11201 Show appropriate errors if any or return true if the attribute is valid. */
11202
11203 static bool
11204 aarch64_process_one_target_attr (char *arg_str)
11205 {
11206 bool invert = false;
11207
11208 size_t len = strlen (arg_str);
11209
11210 if (len == 0)
11211 {
11212 error ("malformed %<target()%> pragma or attribute");
11213 return false;
11214 }
11215
11216 char *str_to_check = (char *) alloca (len + 1);
11217 strcpy (str_to_check, arg_str);
11218
11219 /* Skip leading whitespace. */
11220 while (*str_to_check == ' ' || *str_to_check == '\t')
11221 str_to_check++;
11222
11223 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11224 It is easier to detect and handle it explicitly here rather than going
11225 through the machinery for the rest of the target attributes in this
11226 function. */
11227 if (*str_to_check == '+')
11228 return aarch64_handle_attr_isa_flags (str_to_check);
11229
11230 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11231 {
11232 invert = true;
11233 str_to_check += 3;
11234 }
11235 char *arg = strchr (str_to_check, '=');
11236
11237 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11238 and point ARG to "foo". */
11239 if (arg)
11240 {
11241 *arg = '\0';
11242 arg++;
11243 }
11244 const struct aarch64_attribute_info *p_attr;
11245 bool found = false;
11246 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11247 {
11248 /* If the names don't match up, or the user has given an argument
11249 to an attribute that doesn't accept one, or didn't give an argument
11250 to an attribute that expects one, fail to match. */
11251 if (strcmp (str_to_check, p_attr->name) != 0)
11252 continue;
11253
11254 found = true;
11255 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11256 || p_attr->attr_type == aarch64_attr_enum;
11257
11258 if (attr_need_arg_p ^ (arg != NULL))
11259 {
11260 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11261 return false;
11262 }
11263
11264 /* If the name matches but the attribute does not allow "no-" versions
11265 then we can't match. */
11266 if (invert && !p_attr->allow_neg)
11267 {
11268 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11269 return false;
11270 }
11271
11272 switch (p_attr->attr_type)
11273 {
11274 /* Has a custom handler registered.
11275 For example, cpu=, arch=, tune=. */
11276 case aarch64_attr_custom:
11277 gcc_assert (p_attr->handler);
11278 if (!p_attr->handler (arg))
11279 return false;
11280 break;
11281
11282 /* Either set or unset a boolean option. */
11283 case aarch64_attr_bool:
11284 {
11285 struct cl_decoded_option decoded;
11286
11287 generate_option (p_attr->opt_num, NULL, !invert,
11288 CL_TARGET, &decoded);
11289 aarch64_handle_option (&global_options, &global_options_set,
11290 &decoded, input_location);
11291 break;
11292 }
11293 /* Set or unset a bit in the target_flags. aarch64_handle_option
11294 should know what mask to apply given the option number. */
11295 case aarch64_attr_mask:
11296 {
11297 struct cl_decoded_option decoded;
11298 /* We only need to specify the option number.
11299 aarch64_handle_option will know which mask to apply. */
11300 decoded.opt_index = p_attr->opt_num;
11301 decoded.value = !invert;
11302 aarch64_handle_option (&global_options, &global_options_set,
11303 &decoded, input_location);
11304 break;
11305 }
11306 /* Use the option setting machinery to set an option to an enum. */
11307 case aarch64_attr_enum:
11308 {
11309 gcc_assert (arg);
11310 bool valid;
11311 int value;
11312 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11313 &value, CL_TARGET);
11314 if (valid)
11315 {
11316 set_option (&global_options, NULL, p_attr->opt_num, value,
11317 NULL, DK_UNSPECIFIED, input_location,
11318 global_dc);
11319 }
11320 else
11321 {
11322 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11323 }
11324 break;
11325 }
11326 default:
11327 gcc_unreachable ();
11328 }
11329 }
11330
11331 /* If we reached here we either have found an attribute and validated
11332 it or didn't match any. If we matched an attribute but its arguments
11333 were malformed we will have returned false already. */
11334 return found;
11335 }
11336
11337 /* Count how many times the character C appears in
11338 NULL-terminated string STR. */
11339
11340 static unsigned int
11341 num_occurences_in_str (char c, char *str)
11342 {
11343 unsigned int res = 0;
11344 while (*str != '\0')
11345 {
11346 if (*str == c)
11347 res++;
11348
11349 str++;
11350 }
11351
11352 return res;
11353 }
11354
11355 /* Parse the tree in ARGS that contains the target attribute information
11356 and update the global target options space. */
11357
11358 bool
11359 aarch64_process_target_attr (tree args)
11360 {
11361 if (TREE_CODE (args) == TREE_LIST)
11362 {
11363 do
11364 {
11365 tree head = TREE_VALUE (args);
11366 if (head)
11367 {
11368 if (!aarch64_process_target_attr (head))
11369 return false;
11370 }
11371 args = TREE_CHAIN (args);
11372 } while (args);
11373
11374 return true;
11375 }
11376
11377 if (TREE_CODE (args) != STRING_CST)
11378 {
11379 error ("attribute %<target%> argument not a string");
11380 return false;
11381 }
11382
11383 size_t len = strlen (TREE_STRING_POINTER (args));
11384 char *str_to_check = (char *) alloca (len + 1);
11385 strcpy (str_to_check, TREE_STRING_POINTER (args));
11386
11387 if (len == 0)
11388 {
11389 error ("malformed %<target()%> pragma or attribute");
11390 return false;
11391 }
11392
11393 /* Used to catch empty spaces between commas, i.e.
11394 attribute ((target ("attr1,,attr2"))). */
11395 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11396
11397 /* Handle multiple target attributes separated by ','. */
11398 char *token = strtok (str_to_check, ",");
11399
11400 unsigned int num_attrs = 0;
11401 while (token)
11402 {
11403 num_attrs++;
11404 if (!aarch64_process_one_target_attr (token))
11405 {
11406 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11407 return false;
11408 }
11409
11410 token = strtok (NULL, ",");
11411 }
11412
11413 if (num_attrs != num_commas + 1)
11414 {
11415 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11416 return false;
11417 }
11418
11419 return true;
11420 }
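
/* An illustrative sketch (guarded out of the build; names are hypothetical)
   of the two argument shapes handled above: a single comma-separated string
   and a TREE_LIST of separate strings.  An empty entry, as in "attr1,,attr2",
   is what the num_commas check rejects.  */
#if 0
__attribute__ ((target ("cpu=cortex-a57,+crc")))           /* one string */
static int example_one_string (int x) { return x; }

__attribute__ ((target ("arch=armv8-a", "strict-align")))  /* TREE_LIST  */
static int example_string_list (int x) { return x; }
#endif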
11421
11422 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11423 process attribute ((target ("..."))). */
11424
11425 static bool
11426 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11427 {
11428 struct cl_target_option cur_target;
11429 bool ret;
11430 tree old_optimize;
11431 tree new_target, new_optimize;
11432 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11433
11434 /* If what we're processing is the current pragma string then the
11435 target option node is already stored in target_option_current_node
11436 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11437 having to re-parse the string. This is especially useful to keep
11438 arm_neon.h compile times down since that header contains a lot
11439 of intrinsics enclosed in pragmas. */
11440 if (!existing_target && args == current_target_pragma)
11441 {
11442 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11443 return true;
11444 }
11445 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11446
11447 old_optimize = build_optimization_node (&global_options);
11448 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11449
11450 /* If the function changed the optimization levels as well as setting
11451 target options, start with the optimizations specified. */
11452 if (func_optimize && func_optimize != old_optimize)
11453 cl_optimization_restore (&global_options,
11454 TREE_OPTIMIZATION (func_optimize));
11455
11456 /* Save the current target options to restore at the end. */
11457 cl_target_option_save (&cur_target, &global_options);
11458
11459 /* If fndecl already has some target attributes applied to it, unpack
11460 them so that we add this attribute on top of them, rather than
11461 overwriting them. */
11462 if (existing_target)
11463 {
11464 struct cl_target_option *existing_options
11465 = TREE_TARGET_OPTION (existing_target);
11466
11467 if (existing_options)
11468 cl_target_option_restore (&global_options, existing_options);
11469 }
11470 else
11471 cl_target_option_restore (&global_options,
11472 TREE_TARGET_OPTION (target_option_current_node));
11473
11474 ret = aarch64_process_target_attr (args);
11475
11476 /* Set up any additional state. */
11477 if (ret)
11478 {
11479 aarch64_override_options_internal (&global_options);
11480 /* Initialize SIMD builtins if we haven't already.
11481 Set current_target_pragma to NULL for the duration so that
11482 the builtin initialization code doesn't try to tag the functions
11483 being built with the attributes specified by any current pragma, thus
11484 going into an infinite recursion. */
11485 if (TARGET_SIMD)
11486 {
11487 tree saved_current_target_pragma = current_target_pragma;
11488 current_target_pragma = NULL;
11489 aarch64_init_simd_builtins ();
11490 current_target_pragma = saved_current_target_pragma;
11491 }
11492 new_target = build_target_option_node (&global_options);
11493 }
11494 else
11495 new_target = NULL;
11496
11497 new_optimize = build_optimization_node (&global_options);
11498
11499 if (fndecl && ret)
11500 {
11501 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11502
11503 if (old_optimize != new_optimize)
11504 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11505 }
11506
11507 cl_target_option_restore (&global_options, &cur_target);
11508
11509 if (old_optimize != new_optimize)
11510 cl_optimization_restore (&global_options,
11511 TREE_OPTIMIZATION (old_optimize));
11512 return ret;
11513 }
11514
11515 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11516 tri-bool options (yes, no, don't care) and the default value is
11517 DEF, determine whether to reject inlining. */
11518
11519 static bool
11520 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11521 int dont_care, int def)
11522 {
11523 /* If the callee doesn't care, always allow inlining. */
11524 if (callee == dont_care)
11525 return true;
11526
11527 /* If the caller doesn't care, always allow inlining. */
11528 if (caller == dont_care)
11529 return true;
11530
11531 /* Otherwise, allow inlining if either the callee and caller values
11532 agree, or if the callee is using the default value. */
11533 return (callee == caller || callee == def);
11534 }
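
/* A worked sketch of the helper above, using the convention of the errata
   checks below (0 = no, 1 = yes, 2 = don't care):

      caller  callee  def   result
        2       0      1    inline (caller doesn't care)
        1       2      1    inline (callee doesn't care)
        0       1      0    reject (explicit values disagree and the callee
                                    is not at the default)
        0       1      1    inline (callee matches the default)  */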
11535
11536 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11537 to inline CALLEE into CALLER based on target-specific info.
11538 Make sure that the caller and callee have compatible architectural
11539 features. Then go through the other possible target attributes
11540 and see if they can block inlining. Try not to reject always_inline
11541 callees unless they are incompatible architecturally. */
11542
11543 static bool
11544 aarch64_can_inline_p (tree caller, tree callee)
11545 {
11546 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11547 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11548
11549 struct cl_target_option *caller_opts
11550 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11551 : target_option_default_node);
11552
11553 struct cl_target_option *callee_opts
11554 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11555 : target_option_default_node);
11556
11557 /* Callee's ISA flags should be a subset of the caller's. */
11558 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11559 != callee_opts->x_aarch64_isa_flags)
11560 return false;
11561
11562 /* Allow non-strict aligned functions inlining into strict
11563 aligned ones. */
11564 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11565 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11566 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11567 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11568 return false;
11569
11570 bool always_inline = lookup_attribute ("always_inline",
11571 DECL_ATTRIBUTES (callee));
11572
11573 /* If the architectural features match up and the callee is always_inline
11574 then the other attributes don't matter. */
11575 if (always_inline)
11576 return true;
11577
11578 if (caller_opts->x_aarch64_cmodel_var
11579 != callee_opts->x_aarch64_cmodel_var)
11580 return false;
11581
11582 if (caller_opts->x_aarch64_tls_dialect
11583 != callee_opts->x_aarch64_tls_dialect)
11584 return false;
11585
11586 /* Honour explicit requests to work around errata. */
11587 if (!aarch64_tribools_ok_for_inlining_p (
11588 caller_opts->x_aarch64_fix_a53_err835769,
11589 callee_opts->x_aarch64_fix_a53_err835769,
11590 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11591 return false;
11592
11593 if (!aarch64_tribools_ok_for_inlining_p (
11594 caller_opts->x_aarch64_fix_a53_err843419,
11595 callee_opts->x_aarch64_fix_a53_err843419,
11596 2, TARGET_FIX_ERR_A53_843419))
11597 return false;
11598
11599 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11600 caller and callee and they don't match up, reject inlining. */
11601 if (!aarch64_tribools_ok_for_inlining_p (
11602 caller_opts->x_flag_omit_leaf_frame_pointer,
11603 callee_opts->x_flag_omit_leaf_frame_pointer,
11604 2, 1))
11605 return false;
11606
11607 /* If the callee has specific tuning overrides, respect them. */
11608 if (callee_opts->x_aarch64_override_tune_string != NULL
11609 && caller_opts->x_aarch64_override_tune_string == NULL)
11610 return false;
11611
11612 /* If the user specified tuning override strings for the
11613 caller and callee and they don't match up, reject inlining.
11614 We just do a string compare here, we don't analyze the meaning
11615 of the string, as it would be too costly for little gain. */
11616 if (callee_opts->x_aarch64_override_tune_string
11617 && caller_opts->x_aarch64_override_tune_string
11618 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11619 caller_opts->x_aarch64_override_tune_string) != 0))
11620 return false;
11621
11622 return true;
11623 }
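
/* An illustrative sketch (guarded out of the build; the names and the chosen
   extension are hypothetical) of the ISA-subset rule enforced above: a callee
   that enables an extra extension cannot be inlined into a caller that lacks
   it, while a caller with the same flags can inline it freely.  */
#if 0
__attribute__ ((target ("+crc")))
static inline unsigned callee_uses_crc (unsigned x) { return x; }

static unsigned
plain_caller (unsigned x)
{
  return callee_uses_crc (x);	/* Rejected: callee flags not a subset.  */
}

__attribute__ ((target ("+crc")))
static unsigned
crc_caller (unsigned x)
{
  return callee_uses_crc (x);	/* Allowed: the ISA flags match.  */
}
#endif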
11624
11625 /* Return true if SYMBOL_REF X binds locally. */
11626
11627 static bool
11628 aarch64_symbol_binds_local_p (const_rtx x)
11629 {
11630 return (SYMBOL_REF_DECL (x)
11631 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11632 : SYMBOL_REF_LOCAL_P (x));
11633 }
11634
11635 /* Return true if SYMBOL_REF X is thread local */
11636 static bool
11637 aarch64_tls_symbol_p (rtx x)
11638 {
11639 if (! TARGET_HAVE_TLS)
11640 return false;
11641
11642 if (GET_CODE (x) != SYMBOL_REF)
11643 return false;
11644
11645 return SYMBOL_REF_TLS_MODEL (x) != 0;
11646 }
11647
11648 /* Classify a TLS symbol into one of the TLS kinds. */
11649 enum aarch64_symbol_type
11650 aarch64_classify_tls_symbol (rtx x)
11651 {
11652 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11653
11654 switch (tls_kind)
11655 {
11656 case TLS_MODEL_GLOBAL_DYNAMIC:
11657 case TLS_MODEL_LOCAL_DYNAMIC:
11658 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11659
11660 case TLS_MODEL_INITIAL_EXEC:
11661 switch (aarch64_cmodel)
11662 {
11663 case AARCH64_CMODEL_TINY:
11664 case AARCH64_CMODEL_TINY_PIC:
11665 return SYMBOL_TINY_TLSIE;
11666 default:
11667 return SYMBOL_SMALL_TLSIE;
11668 }
11669
11670 case TLS_MODEL_LOCAL_EXEC:
11671 if (aarch64_tls_size == 12)
11672 return SYMBOL_TLSLE12;
11673 else if (aarch64_tls_size == 24)
11674 return SYMBOL_TLSLE24;
11675 else if (aarch64_tls_size == 32)
11676 return SYMBOL_TLSLE32;
11677 else if (aarch64_tls_size == 48)
11678 return SYMBOL_TLSLE48;
11679 else
11680 gcc_unreachable ();
11681
11682 case TLS_MODEL_EMULATED:
11683 case TLS_MODEL_NONE:
11684 return SYMBOL_FORCE_TO_MEM;
11685
11686 default:
11687 gcc_unreachable ();
11688 }
11689 }
11690
11691 /* Return the correct method for accessing X + OFFSET, where X is either
11692 a SYMBOL_REF or LABEL_REF. */
11693
11694 enum aarch64_symbol_type
11695 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11696 {
11697 if (GET_CODE (x) == LABEL_REF)
11698 {
11699 switch (aarch64_cmodel)
11700 {
11701 case AARCH64_CMODEL_LARGE:
11702 return SYMBOL_FORCE_TO_MEM;
11703
11704 case AARCH64_CMODEL_TINY_PIC:
11705 case AARCH64_CMODEL_TINY:
11706 return SYMBOL_TINY_ABSOLUTE;
11707
11708 case AARCH64_CMODEL_SMALL_SPIC:
11709 case AARCH64_CMODEL_SMALL_PIC:
11710 case AARCH64_CMODEL_SMALL:
11711 return SYMBOL_SMALL_ABSOLUTE;
11712
11713 default:
11714 gcc_unreachable ();
11715 }
11716 }
11717
11718 if (GET_CODE (x) == SYMBOL_REF)
11719 {
11720 if (aarch64_tls_symbol_p (x))
11721 return aarch64_classify_tls_symbol (x);
11722
11723 switch (aarch64_cmodel)
11724 {
11725 case AARCH64_CMODEL_TINY:
11726 /* When we retrieve symbol + offset address, we have to make sure
11727 the offset does not cause overflow of the final address. But
11728 we have no way of knowing the address of symbol at compile time
11729 so we can't accurately say if the distance between the PC and
11730 symbol + offset is outside the addressable range of +/-1M in the
11731 TINY code model. So we rely on images not being greater than
11732 1M and cap the offset at 1M; anything beyond 1M will have to
11733 be loaded using an alternative mechanism. Furthermore, if the
11734 symbol is a weak reference to something that isn't known to
11735 resolve to a symbol in this module, then force to memory. */
11736 if ((SYMBOL_REF_WEAK (x)
11737 && !aarch64_symbol_binds_local_p (x))
11738 || !IN_RANGE (offset, -1048575, 1048575))
11739 return SYMBOL_FORCE_TO_MEM;
11740 return SYMBOL_TINY_ABSOLUTE;
11741
11742 case AARCH64_CMODEL_SMALL:
11743 /* Same reasoning as the tiny code model, but the offset cap here is
11744 4G. */
11745 if ((SYMBOL_REF_WEAK (x)
11746 && !aarch64_symbol_binds_local_p (x))
11747 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11748 HOST_WIDE_INT_C (4294967264)))
11749 return SYMBOL_FORCE_TO_MEM;
11750 return SYMBOL_SMALL_ABSOLUTE;
11751
11752 case AARCH64_CMODEL_TINY_PIC:
11753 if (!aarch64_symbol_binds_local_p (x))
11754 return SYMBOL_TINY_GOT;
11755 return SYMBOL_TINY_ABSOLUTE;
11756
11757 case AARCH64_CMODEL_SMALL_SPIC:
11758 case AARCH64_CMODEL_SMALL_PIC:
11759 if (!aarch64_symbol_binds_local_p (x))
11760 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11761 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11762 return SYMBOL_SMALL_ABSOLUTE;
11763
11764 case AARCH64_CMODEL_LARGE:
11765 /* This is alright even in PIC code as the constant
11766 pool reference is always PC relative and within
11767 the same translation unit. */
11768 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11769 return SYMBOL_SMALL_ABSOLUTE;
11770 else
11771 return SYMBOL_FORCE_TO_MEM;
11772
11773 default:
11774 gcc_unreachable ();
11775 }
11776 }
11777
11778 /* By default push everything into the constant pool. */
11779 return SYMBOL_FORCE_TO_MEM;
11780 }
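
/* A brief worked example of the offset handling above, under the small
   code model: for "extern char arr[];" the address "arr + 100" keeps
   SYMBOL_SMALL_ABSOLUTE, whereas an offset outside roughly +/-4G, or any
   offset from a weak symbol that may not bind locally, is classified as
   SYMBOL_FORCE_TO_MEM and materialised via the constant pool.  */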
11781
11782 bool
11783 aarch64_constant_address_p (rtx x)
11784 {
11785 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11786 }
11787
11788 bool
11789 aarch64_legitimate_pic_operand_p (rtx x)
11790 {
11791 if (GET_CODE (x) == SYMBOL_REF
11792 || (GET_CODE (x) == CONST
11793 && GET_CODE (XEXP (x, 0)) == PLUS
11794 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11795 return false;
11796
11797 return true;
11798 }
11799
11800 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11801 that should be rematerialized rather than spilled. */
11802
11803 static bool
11804 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11805 {
11806 /* Support CSE and rematerialization of common constants. */
11807 if (CONST_INT_P (x)
11808 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11809 || GET_CODE (x) == CONST_VECTOR)
11810 return true;
11811
11812 /* Do not allow vector struct mode constants for Advanced SIMD.
11813 We could support 0 and -1 easily, but they need support in
11814 aarch64-simd.md. */
11815 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11816 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11817 return false;
11818
11819 /* Only accept variable-length vector constants if they can be
11820 handled directly.
11821
11822 ??? It would be possible to handle rematerialization of other
11823 constants via secondary reloads. */
11824 if (vec_flags & VEC_ANY_SVE)
11825 return aarch64_simd_valid_immediate (x, NULL);
11826
11827 if (GET_CODE (x) == HIGH)
11828 x = XEXP (x, 0);
11829
11830 /* Accept polynomial constants that can be calculated by using the
11831 destination of a move as the sole temporary. Constants that
11832 require a second temporary cannot be rematerialized (they can't be
11833 forced to memory and also aren't legitimate constants). */
11834 poly_int64 offset;
11835 if (poly_int_rtx_p (x, &offset))
11836 return aarch64_offset_temporaries (false, offset) <= 1;
11837
11838 /* If an offset is being added to something else, we need to allow the
11839 base to be moved into the destination register, meaning that there
11840 are no free temporaries for the offset. */
11841 x = strip_offset (x, &offset);
11842 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11843 return false;
11844
11845 /* Do not allow const (plus (anchor_symbol, const_int)). */
11846 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11847 return false;
11848
11849 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11850 so spilling them is better than rematerialization. */
11851 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11852 return true;
11853
11854 /* Label references are always constant. */
11855 if (GET_CODE (x) == LABEL_REF)
11856 return true;
11857
11858 return false;
11859 }
11860
11861 rtx
11862 aarch64_load_tp (rtx target)
11863 {
11864 if (!target
11865 || GET_MODE (target) != Pmode
11866 || !register_operand (target, Pmode))
11867 target = gen_reg_rtx (Pmode);
11868
11869 /* Can return in any reg. */
11870 emit_insn (gen_aarch64_load_tp_hard (target));
11871 return target;
11872 }
11873
11874 /* On AAPCS systems, this is the "struct __va_list". */
11875 static GTY(()) tree va_list_type;
11876
11877 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11878 Return the type to use as __builtin_va_list.
11879
11880 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11881
11882 struct __va_list
11883 {
11884 void *__stack;
11885 void *__gr_top;
11886 void *__vr_top;
11887 int __gr_offs;
11888 int __vr_offs;
11889 }; */
11890
11891 static tree
11892 aarch64_build_builtin_va_list (void)
11893 {
11894 tree va_list_name;
11895 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11896
11897 /* Create the type. */
11898 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11899 /* Give it the required name. */
11900 va_list_name = build_decl (BUILTINS_LOCATION,
11901 TYPE_DECL,
11902 get_identifier ("__va_list"),
11903 va_list_type);
11904 DECL_ARTIFICIAL (va_list_name) = 1;
11905 TYPE_NAME (va_list_type) = va_list_name;
11906 TYPE_STUB_DECL (va_list_type) = va_list_name;
11907
11908 /* Create the fields. */
11909 f_stack = build_decl (BUILTINS_LOCATION,
11910 FIELD_DECL, get_identifier ("__stack"),
11911 ptr_type_node);
11912 f_grtop = build_decl (BUILTINS_LOCATION,
11913 FIELD_DECL, get_identifier ("__gr_top"),
11914 ptr_type_node);
11915 f_vrtop = build_decl (BUILTINS_LOCATION,
11916 FIELD_DECL, get_identifier ("__vr_top"),
11917 ptr_type_node);
11918 f_groff = build_decl (BUILTINS_LOCATION,
11919 FIELD_DECL, get_identifier ("__gr_offs"),
11920 integer_type_node);
11921 f_vroff = build_decl (BUILTINS_LOCATION,
11922 FIELD_DECL, get_identifier ("__vr_offs"),
11923 integer_type_node);
11924
11925 /* Tell tree-stdarg pass about our internal offset fields.
11926 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11927 purposes, to identify whether the code is updating the va_list internal
11928 offset fields in an irregular way. */
11929 va_list_gpr_counter_field = f_groff;
11930 va_list_fpr_counter_field = f_vroff;
11931
11932 DECL_ARTIFICIAL (f_stack) = 1;
11933 DECL_ARTIFICIAL (f_grtop) = 1;
11934 DECL_ARTIFICIAL (f_vrtop) = 1;
11935 DECL_ARTIFICIAL (f_groff) = 1;
11936 DECL_ARTIFICIAL (f_vroff) = 1;
11937
11938 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11939 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11940 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11941 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11942 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11943
11944 TYPE_FIELDS (va_list_type) = f_stack;
11945 DECL_CHAIN (f_stack) = f_grtop;
11946 DECL_CHAIN (f_grtop) = f_vrtop;
11947 DECL_CHAIN (f_vrtop) = f_groff;
11948 DECL_CHAIN (f_groff) = f_vroff;
11949
11950 /* Compute its layout. */
11951 layout_type (va_list_type);
11952
11953 return va_list_type;
11954 }
11955
11956 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11957 static void
11958 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11959 {
11960 const CUMULATIVE_ARGS *cum;
11961 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11962 tree stack, grtop, vrtop, groff, vroff;
11963 tree t;
11964 int gr_save_area_size = cfun->va_list_gpr_size;
11965 int vr_save_area_size = cfun->va_list_fpr_size;
11966 int vr_offset;
11967
11968 cum = &crtl->args.info;
11969 if (cfun->va_list_gpr_size)
11970 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11971 cfun->va_list_gpr_size);
11972 if (cfun->va_list_fpr_size)
11973 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11974 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11975
11976 if (!TARGET_FLOAT)
11977 {
11978 gcc_assert (cum->aapcs_nvrn == 0);
11979 vr_save_area_size = 0;
11980 }
11981
11982 f_stack = TYPE_FIELDS (va_list_type_node);
11983 f_grtop = DECL_CHAIN (f_stack);
11984 f_vrtop = DECL_CHAIN (f_grtop);
11985 f_groff = DECL_CHAIN (f_vrtop);
11986 f_vroff = DECL_CHAIN (f_groff);
11987
11988 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
11989 NULL_TREE);
11990 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
11991 NULL_TREE);
11992 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
11993 NULL_TREE);
11994 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
11995 NULL_TREE);
11996 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
11997 NULL_TREE);
11998
11999 /* Emit code to initialize STACK, which points to the next varargs stack
12000 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12001 by named arguments. STACK is 8-byte aligned. */
12002 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12003 if (cum->aapcs_stack_size > 0)
12004 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12005 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12006 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12007
12008 /* Emit code to initialize GRTOP, the top of the GR save area.
12009 virtual_incoming_args_rtx should have been 16-byte aligned. */
12010 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12011 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12012 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12013
12014 /* Emit code to initialize VRTOP, the top of the VR save area.
12015 This address is gr_save_area_bytes below GRTOP, rounded
12016 down to the next 16-byte boundary. */
12017 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12018 vr_offset = ROUND_UP (gr_save_area_size,
12019 STACK_BOUNDARY / BITS_PER_UNIT);
12020
12021 if (vr_offset)
12022 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12023 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12024 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12025
12026 /* Emit code to initialize GROFF, the offset from GRTOP of the
12027 next GPR argument. */
12028 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12029 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12030 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12031
12032 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12033 of the next VR argument. */
12034 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12035 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12036 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12037 }
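
/* A sketch of the va_list contents established above, with higher addresses
   towards the top (sizes are the rounded save-area sizes computed in this
   function):

       __stack  -> first anonymous argument passed on the stack
       __gr_top =  virtual_incoming_args_rtx
                   (the GR save area of gr_save_area_size bytes ends here)
       __vr_top =  __gr_top - ROUND_UP (gr_save_area_size,
                                        STACK_BOUNDARY / BITS_PER_UNIT)
                   (the VR save area of vr_save_area_size bytes ends here)

   __gr_offs and __vr_offs start at minus the respective save-area sizes and
   count up towards zero as anonymous register arguments are consumed.  */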
12038
12039 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12040
12041 static tree
12042 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12043 gimple_seq *post_p ATTRIBUTE_UNUSED)
12044 {
12045 tree addr;
12046 bool indirect_p;
12047 bool is_ha; /* is HFA or HVA. */
12048 bool dw_align; /* double-word align. */
12049 machine_mode ag_mode = VOIDmode;
12050 int nregs;
12051 machine_mode mode;
12052
12053 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12054 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12055 HOST_WIDE_INT size, rsize, adjust, align;
12056 tree t, u, cond1, cond2;
12057
12058 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12059 if (indirect_p)
12060 type = build_pointer_type (type);
12061
12062 mode = TYPE_MODE (type);
12063
12064 f_stack = TYPE_FIELDS (va_list_type_node);
12065 f_grtop = DECL_CHAIN (f_stack);
12066 f_vrtop = DECL_CHAIN (f_grtop);
12067 f_groff = DECL_CHAIN (f_vrtop);
12068 f_vroff = DECL_CHAIN (f_groff);
12069
12070 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12071 f_stack, NULL_TREE);
12072 size = int_size_in_bytes (type);
12073 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12074
12075 dw_align = false;
12076 adjust = 0;
12077 if (aarch64_vfp_is_call_or_return_candidate (mode,
12078 type,
12079 &ag_mode,
12080 &nregs,
12081 &is_ha))
12082 {
12083 /* No frontends can create types with variable-sized modes, so we
12084 shouldn't be asked to pass or return them. */
12085 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12086
12087 /* TYPE passed in fp/simd registers. */
12088 if (!TARGET_FLOAT)
12089 aarch64_err_no_fpadvsimd (mode);
12090
12091 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12092 unshare_expr (valist), f_vrtop, NULL_TREE);
12093 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12094 unshare_expr (valist), f_vroff, NULL_TREE);
12095
12096 rsize = nregs * UNITS_PER_VREG;
12097
12098 if (is_ha)
12099 {
12100 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12101 adjust = UNITS_PER_VREG - ag_size;
12102 }
12103 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12104 && size < UNITS_PER_VREG)
12105 {
12106 adjust = UNITS_PER_VREG - size;
12107 }
12108 }
12109 else
12110 {
12111 /* TYPE passed in general registers. */
12112 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12113 unshare_expr (valist), f_grtop, NULL_TREE);
12114 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12115 unshare_expr (valist), f_groff, NULL_TREE);
12116 rsize = ROUND_UP (size, UNITS_PER_WORD);
12117 nregs = rsize / UNITS_PER_WORD;
12118
12119 if (align > 8)
12120 dw_align = true;
12121
12122 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12123 && size < UNITS_PER_WORD)
12124 {
12125 adjust = UNITS_PER_WORD - size;
12126 }
12127 }
12128
12129 /* Get a local temporary for the field value. */
12130 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12131
12132 /* Emit code to branch if off >= 0. */
12133 t = build2 (GE_EXPR, boolean_type_node, off,
12134 build_int_cst (TREE_TYPE (off), 0));
12135 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12136
12137 if (dw_align)
12138 {
12139 /* Emit: offs = (offs + 15) & -16. */
12140 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12141 build_int_cst (TREE_TYPE (off), 15));
12142 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12143 build_int_cst (TREE_TYPE (off), -16));
12144 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12145 }
12146 else
12147 roundup = NULL;
12148
12149 /* Update ap.__[g|v]r_offs */
12150 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12151 build_int_cst (TREE_TYPE (off), rsize));
12152 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12153
12154 /* String up. */
12155 if (roundup)
12156 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12157
12158 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12159 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12160 build_int_cst (TREE_TYPE (f_off), 0));
12161 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12162
12163 /* String up: make sure the assignment happens before the use. */
12164 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12165 COND_EXPR_ELSE (cond1) = t;
12166
12167 /* Prepare the trees handling the argument that is passed on the stack;
12168 the top-level node will be stored in ON_STACK. */
12169 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12170 if (align > 8)
12171 {
12172 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12173 t = fold_build_pointer_plus_hwi (arg, 15);
12174 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12175 build_int_cst (TREE_TYPE (t), -16));
12176 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12177 }
12178 else
12179 roundup = NULL;
12180 /* Advance ap.__stack */
12181 t = fold_build_pointer_plus_hwi (arg, size + 7);
12182 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12183 build_int_cst (TREE_TYPE (t), -8));
12184 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12185 /* String up roundup and advance. */
12186 if (roundup)
12187 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12188 /* String up with arg */
12189 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12190 /* Big-endianness related address adjustment. */
12191 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12192 && size < UNITS_PER_WORD)
12193 {
12194 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12195 size_int (UNITS_PER_WORD - size));
12196 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12197 }
12198
12199 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12200 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12201
12202 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12203 t = off;
12204 if (adjust)
12205 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12206 build_int_cst (TREE_TYPE (off), adjust));
12207
12208 t = fold_convert (sizetype, t);
12209 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12210
12211 if (is_ha)
12212 {
12213 /* type ha; // treat as "struct {ftype field[n];}"
12214 ... [computing offs]
12215 for (i = 0; i < nregs; ++i, offs += 16)
12216 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12217 return ha; */
12218 int i;
12219 tree tmp_ha, field_t, field_ptr_t;
12220
12221 /* Declare a local variable. */
12222 tmp_ha = create_tmp_var_raw (type, "ha");
12223 gimple_add_tmp_var (tmp_ha);
12224
12225 /* Establish the base type. */
12226 switch (ag_mode)
12227 {
12228 case E_SFmode:
12229 field_t = float_type_node;
12230 field_ptr_t = float_ptr_type_node;
12231 break;
12232 case E_DFmode:
12233 field_t = double_type_node;
12234 field_ptr_t = double_ptr_type_node;
12235 break;
12236 case E_TFmode:
12237 field_t = long_double_type_node;
12238 field_ptr_t = long_double_ptr_type_node;
12239 break;
12240 case E_HFmode:
12241 field_t = aarch64_fp16_type_node;
12242 field_ptr_t = aarch64_fp16_ptr_type_node;
12243 break;
12244 case E_V2SImode:
12245 case E_V4SImode:
12246 {
12247 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12248 field_t = build_vector_type_for_mode (innertype, ag_mode);
12249 field_ptr_t = build_pointer_type (field_t);
12250 }
12251 break;
12252 default:
12253 gcc_assert (0);
12254 }
12255
12256 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12257 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12258 addr = t;
12259 t = fold_convert (field_ptr_t, addr);
12260 t = build2 (MODIFY_EXPR, field_t,
12261 build1 (INDIRECT_REF, field_t, tmp_ha),
12262 build1 (INDIRECT_REF, field_t, t));
12263
12264 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12265 for (i = 1; i < nregs; ++i)
12266 {
12267 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12268 u = fold_convert (field_ptr_t, addr);
12269 u = build2 (MODIFY_EXPR, field_t,
12270 build2 (MEM_REF, field_t, tmp_ha,
12271 build_int_cst (field_ptr_t,
12272 (i *
12273 int_size_in_bytes (field_t)))),
12274 build1 (INDIRECT_REF, field_t, u));
12275 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12276 }
12277
12278 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12279 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12280 }
12281
12282 COND_EXPR_ELSE (cond2) = t;
12283 addr = fold_convert (build_pointer_type (type), cond1);
12284 addr = build_va_arg_indirect_ref (addr);
12285
12286 if (indirect_p)
12287 addr = build_va_arg_indirect_ref (addr);
12288
12289 return addr;
12290 }
12291
12292 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12293
12294 static void
12295 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12296 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12297 int no_rtl)
12298 {
12299 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12300 CUMULATIVE_ARGS local_cum;
12301 int gr_saved = cfun->va_list_gpr_size;
12302 int vr_saved = cfun->va_list_fpr_size;
12303
12304 /* The caller has advanced CUM up to, but not beyond, the last named
12305 argument. Advance a local copy of CUM past the last "real" named
12306 argument, to find out how many registers are left over. */
12307 local_cum = *cum;
12308 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12309
12310 /* Find out how many registers we need to save.
12311 Honour the tree-stdarg analysis results. */
12312 if (cfun->va_list_gpr_size)
12313 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12314 cfun->va_list_gpr_size / UNITS_PER_WORD);
12315 if (cfun->va_list_fpr_size)
12316 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12317 cfun->va_list_fpr_size / UNITS_PER_VREG);
12318
12319 if (!TARGET_FLOAT)
12320 {
12321 gcc_assert (local_cum.aapcs_nvrn == 0);
12322 vr_saved = 0;
12323 }
12324
12325 if (!no_rtl)
12326 {
12327 if (gr_saved > 0)
12328 {
12329 rtx ptr, mem;
12330
12331 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12332 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12333 - gr_saved * UNITS_PER_WORD);
12334 mem = gen_frame_mem (BLKmode, ptr);
12335 set_mem_alias_set (mem, get_varargs_alias_set ());
12336
12337 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12338 mem, gr_saved);
12339 }
12340 if (vr_saved > 0)
12341 {
12342 /* We can't use move_block_from_reg, because it will use
12343 the wrong mode, storing D regs only. */
12344 machine_mode mode = TImode;
12345 int off, i, vr_start;
12346
12347 /* Set OFF to the offset from virtual_incoming_args_rtx of
12348 the first vector register. The VR save area lies below
12349 the GR one, and is aligned to 16 bytes. */
12350 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12351 STACK_BOUNDARY / BITS_PER_UNIT);
12352 off -= vr_saved * UNITS_PER_VREG;
12353
12354 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12355 for (i = 0; i < vr_saved; ++i)
12356 {
12357 rtx ptr, mem;
12358
12359 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12360 mem = gen_frame_mem (mode, ptr);
12361 set_mem_alias_set (mem, get_varargs_alias_set ());
12362 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12363 off += UNITS_PER_VREG;
12364 }
12365 }
12366 }
12367
12368 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12369 any complication of having crtl->args.pretend_args_size changed. */
12370 cfun->machine->frame.saved_varargs_size
12371 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12372 STACK_BOUNDARY / BITS_PER_UNIT)
12373 + vr_saved * UNITS_PER_VREG);
12374 }
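
/* A worked example of the sizing above: for a hypothetical
   "int f (int a, double b, ...)" the named arguments consume one core
   register and one vector register, so at most gr_saved = 7 and
   vr_saved = 7 registers are dumped below virtual_incoming_args_rtx
   (the tree-stdarg results honoured above may shrink either number).  */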
12375
12376 static void
12377 aarch64_conditional_register_usage (void)
12378 {
12379 int i;
12380 if (!TARGET_FLOAT)
12381 {
12382 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12383 {
12384 fixed_regs[i] = 1;
12385 call_used_regs[i] = 1;
12386 }
12387 }
12388 if (!TARGET_SVE)
12389 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12390 {
12391 fixed_regs[i] = 1;
12392 call_used_regs[i] = 1;
12393 }
12394
12395 /* When tracking speculation, we need a couple of call-clobbered registers
12396 to track the speculation state. It would be nice to just use
12397 IP0 and IP1, but currently there are numerous places that just
12398 assume these registers are free for other uses (eg pointer
12399 authentication). */
12400 if (aarch64_track_speculation)
12401 {
12402 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
12403 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
12404 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12405 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12406 }
12407 }
12408
12409 /* Walk down the type tree of TYPE counting consecutive base elements.
12410 If *MODEP is VOIDmode, then set it to the first valid floating point
12411 type. If a non-floating point type is found, or if a floating point
12412 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12413 otherwise return the count in the sub-tree. */
12414 static int
12415 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12416 {
12417 machine_mode mode;
12418 HOST_WIDE_INT size;
12419
12420 switch (TREE_CODE (type))
12421 {
12422 case REAL_TYPE:
12423 mode = TYPE_MODE (type);
12424 if (mode != DFmode && mode != SFmode
12425 && mode != TFmode && mode != HFmode)
12426 return -1;
12427
12428 if (*modep == VOIDmode)
12429 *modep = mode;
12430
12431 if (*modep == mode)
12432 return 1;
12433
12434 break;
12435
12436 case COMPLEX_TYPE:
12437 mode = TYPE_MODE (TREE_TYPE (type));
12438 if (mode != DFmode && mode != SFmode
12439 && mode != TFmode && mode != HFmode)
12440 return -1;
12441
12442 if (*modep == VOIDmode)
12443 *modep = mode;
12444
12445 if (*modep == mode)
12446 return 2;
12447
12448 break;
12449
12450 case VECTOR_TYPE:
12451 /* Use V2SImode and V4SImode as representatives of all 64-bit
12452 and 128-bit vector types. */
12453 size = int_size_in_bytes (type);
12454 switch (size)
12455 {
12456 case 8:
12457 mode = V2SImode;
12458 break;
12459 case 16:
12460 mode = V4SImode;
12461 break;
12462 default:
12463 return -1;
12464 }
12465
12466 if (*modep == VOIDmode)
12467 *modep = mode;
12468
12469 /* Vector modes are considered to be opaque: two vectors are
12470 equivalent for the purposes of being homogeneous aggregates
12471 if they are the same size. */
12472 if (*modep == mode)
12473 return 1;
12474
12475 break;
12476
12477 case ARRAY_TYPE:
12478 {
12479 int count;
12480 tree index = TYPE_DOMAIN (type);
12481
12482 /* Can't handle incomplete types or sizes that are not
12483 fixed. */
12484 if (!COMPLETE_TYPE_P (type)
12485 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12486 return -1;
12487
12488 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12489 if (count == -1
12490 || !index
12491 || !TYPE_MAX_VALUE (index)
12492 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12493 || !TYPE_MIN_VALUE (index)
12494 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12495 || count < 0)
12496 return -1;
12497
12498 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12499 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12500
12501 /* There must be no padding. */
12502 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12503 count * GET_MODE_BITSIZE (*modep)))
12504 return -1;
12505
12506 return count;
12507 }
12508
12509 case RECORD_TYPE:
12510 {
12511 int count = 0;
12512 int sub_count;
12513 tree field;
12514
12515 /* Can't handle incomplete types or sizes that are not
12516 fixed. */
12517 if (!COMPLETE_TYPE_P (type)
12518 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12519 return -1;
12520
12521 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12522 {
12523 if (TREE_CODE (field) != FIELD_DECL)
12524 continue;
12525
12526 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12527 if (sub_count < 0)
12528 return -1;
12529 count += sub_count;
12530 }
12531
12532 /* There must be no padding. */
12533 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12534 count * GET_MODE_BITSIZE (*modep)))
12535 return -1;
12536
12537 return count;
12538 }
12539
12540 case UNION_TYPE:
12541 case QUAL_UNION_TYPE:
12542 {
12543 /* These aren't very interesting except in a degenerate case. */
12544 int count = 0;
12545 int sub_count;
12546 tree field;
12547
12548 /* Can't handle incomplete types or sizes that are not
12549 fixed. */
12550 if (!COMPLETE_TYPE_P (type)
12551 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12552 return -1;
12553
12554 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12555 {
12556 if (TREE_CODE (field) != FIELD_DECL)
12557 continue;
12558
12559 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12560 if (sub_count < 0)
12561 return -1;
12562 count = count > sub_count ? count : sub_count;
12563 }
12564
12565 /* There must be no padding. */
12566 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12567 count * GET_MODE_BITSIZE (*modep)))
12568 return -1;
12569
12570 return count;
12571 }
12572
12573 default:
12574 break;
12575 }
12576
12577 return -1;
12578 }
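
/* Two worked examples of the walk above (the C types are hypothetical):

     struct { double x, y, z; }       -> *modep = DFmode, returns 3
     struct { float re; double im; }  -> returns -1 (element modes differ)

   A positive count no larger than HA_MAX_NUM_FLDS, with a single
   floating-point or vector element mode, is what lets the caller below
   treat the type as a homogeneous aggregate.  */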
12579
12580 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12581 type as described in AAPCS64 \S 4.1.2.
12582
12583 See the comment above aarch64_composite_type_p for the notes on MODE. */
12584
12585 static bool
12586 aarch64_short_vector_p (const_tree type,
12587 machine_mode mode)
12588 {
12589 poly_int64 size = -1;
12590
12591 if (type && TREE_CODE (type) == VECTOR_TYPE)
12592 size = int_size_in_bytes (type);
12593 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12594 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12595 size = GET_MODE_SIZE (mode);
12596
12597 return known_eq (size, 8) || known_eq (size, 16);
12598 }
12599
12600 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12601 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12602 array types. The C99 floating-point complex types are also considered
12603 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12604 types, which are GCC extensions and out of the scope of AAPCS64, are
12605 treated as composite types here as well.
12606
12607 Note that MODE itself is not sufficient in determining whether a type
12608 is such a composite type or not. This is because
12609 stor-layout.c:compute_record_mode may have already changed the MODE
12610 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12611 structure with only one field may have its MODE set to the mode of the
12612 field. Also an integer mode whose size matches the size of the
12613 RECORD_TYPE type may be used to substitute the original mode
12614 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12615 solely relied on. */
12616
12617 static bool
12618 aarch64_composite_type_p (const_tree type,
12619 machine_mode mode)
12620 {
12621 if (aarch64_short_vector_p (type, mode))
12622 return false;
12623
12624 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12625 return true;
12626
12627 if (mode == BLKmode
12628 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12629 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12630 return true;
12631
12632 return false;
12633 }
12634
12635 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12636 shall be passed or returned in simd/fp register(s) (providing these
12637 parameter passing registers are available).
12638
12639 Upon successful return, *COUNT returns the number of needed registers,
12640 *BASE_MODE returns the mode of the individual register and when IS_HAF
12641 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12642 floating-point aggregate or a homogeneous short-vector aggregate. */
12643
12644 static bool
12645 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12646 const_tree type,
12647 machine_mode *base_mode,
12648 int *count,
12649 bool *is_ha)
12650 {
12651 machine_mode new_mode = VOIDmode;
12652 bool composite_p = aarch64_composite_type_p (type, mode);
12653
12654 if (is_ha != NULL) *is_ha = false;
12655
12656 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12657 || aarch64_short_vector_p (type, mode))
12658 {
12659 *count = 1;
12660 new_mode = mode;
12661 }
12662 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12663 {
12664 if (is_ha != NULL) *is_ha = true;
12665 *count = 2;
12666 new_mode = GET_MODE_INNER (mode);
12667 }
12668 else if (type && composite_p)
12669 {
12670 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12671
12672 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12673 {
12674 if (is_ha != NULL) *is_ha = true;
12675 *count = ag_count;
12676 }
12677 else
12678 return false;
12679 }
12680 else
12681 return false;
12682
12683 *base_mode = new_mode;
12684 return true;
12685 }
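/* Worked example (illustrative): for a C struct such as
     struct rgb { float r, g, b; };
   the struct is a homogeneous floating-point aggregate, so on success
   *BASE_MODE is SFmode, *COUNT is 3 and, if requested, *IS_HA is true.  */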
12686
12687 /* Implement TARGET_STRUCT_VALUE_RTX. */
12688
12689 static rtx
12690 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12691 int incoming ATTRIBUTE_UNUSED)
12692 {
12693 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12694 }
12695
12696 /* Implements target hook vector_mode_supported_p. */
12697 static bool
12698 aarch64_vector_mode_supported_p (machine_mode mode)
12699 {
12700 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12701 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12702 }
12703
12704 /* Return appropriate SIMD container
12705 for MODE within a vector of WIDTH bits. */
12706 static machine_mode
12707 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12708 {
12709 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12710 switch (mode)
12711 {
12712 case E_DFmode:
12713 return VNx2DFmode;
12714 case E_SFmode:
12715 return VNx4SFmode;
12716 case E_HFmode:
12717 return VNx8HFmode;
12718 case E_DImode:
12719 return VNx2DImode;
12720 case E_SImode:
12721 return VNx4SImode;
12722 case E_HImode:
12723 return VNx8HImode;
12724 case E_QImode:
12725 return VNx16QImode;
12726 default:
12727 return word_mode;
12728 }
12729
12730 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12731 if (TARGET_SIMD)
12732 {
12733 if (known_eq (width, 128))
12734 switch (mode)
12735 {
12736 case E_DFmode:
12737 return V2DFmode;
12738 case E_SFmode:
12739 return V4SFmode;
12740 case E_HFmode:
12741 return V8HFmode;
12742 case E_SImode:
12743 return V4SImode;
12744 case E_HImode:
12745 return V8HImode;
12746 case E_QImode:
12747 return V16QImode;
12748 case E_DImode:
12749 return V2DImode;
12750 default:
12751 break;
12752 }
12753 else
12754 switch (mode)
12755 {
12756 case E_SFmode:
12757 return V2SFmode;
12758 case E_HFmode:
12759 return V4HFmode;
12760 case E_SImode:
12761 return V2SImode;
12762 case E_HImode:
12763 return V4HImode;
12764 case E_QImode:
12765 return V8QImode;
12766 default:
12767 break;
12768 }
12769 }
12770 return word_mode;
12771 }
12772
12773 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12774 static machine_mode
12775 aarch64_preferred_simd_mode (scalar_mode mode)
12776 {
12777 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12778 return aarch64_simd_container_mode (mode, bits);
12779 }
12780
12781 /* Return a list of possible vector sizes for the vectorizer
12782 to iterate over. */
12783 static void
12784 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12785 {
12786 if (TARGET_SVE)
12787 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12788 sizes->safe_push (16);
12789 sizes->safe_push (8);
12790 }
12791
12792 /* Implement TARGET_MANGLE_TYPE. */
12793
12794 static const char *
12795 aarch64_mangle_type (const_tree type)
12796 {
12797 /* The AArch64 ABI documents say that "__va_list" has to be
12798      mangled as if it is in the "std" namespace. */
12799 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12800 return "St9__va_list";
12801
12802 /* Half-precision float. */
12803 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12804 return "Dh";
12805
12806 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12807 builtin types. */
12808 if (TYPE_NAME (type) != NULL)
12809 return aarch64_mangle_builtin_type (type);
12810
12811 /* Use the default mangling. */
12812 return NULL;
12813 }
12814
12815 /* Find the first rtx_insn before insn that will generate an assembly
12816 instruction. */
12817
12818 static rtx_insn *
12819 aarch64_prev_real_insn (rtx_insn *insn)
12820 {
12821 if (!insn)
12822 return NULL;
12823
12824 do
12825 {
12826 insn = prev_real_insn (insn);
12827 }
12828 while (insn && recog_memoized (insn) < 0);
12829
12830 return insn;
12831 }
12832
12833 static bool
12834 is_madd_op (enum attr_type t1)
12835 {
12836 unsigned int i;
12837 /* A number of these may be AArch32 only. */
12838 enum attr_type mlatypes[] = {
12839 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12840 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12841      TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12842 };
12843
12844 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12845 {
12846 if (t1 == mlatypes[i])
12847 return true;
12848 }
12849
12850 return false;
12851 }
12852
12853 /* Check if there is a register dependency between a load and the insn
12854 for which we hold recog_data. */
12855
12856 static bool
12857 dep_between_memop_and_curr (rtx memop)
12858 {
12859 rtx load_reg;
12860 int opno;
12861
12862 gcc_assert (GET_CODE (memop) == SET);
12863
12864 if (!REG_P (SET_DEST (memop)))
12865 return false;
12866
12867 load_reg = SET_DEST (memop);
12868 for (opno = 1; opno < recog_data.n_operands; opno++)
12869 {
12870 rtx operand = recog_data.operand[opno];
12871 if (REG_P (operand)
12872 && reg_overlap_mentioned_p (load_reg, operand))
12873 return true;
12874
12875 }
12876 return false;
12877 }
12878
12879
12880 /* When working around the Cortex-A53 erratum 835769,
12881 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12882 instruction and has a preceding memory instruction such that a NOP
12883 should be inserted between them. */
12884
12885 bool
12886 aarch64_madd_needs_nop (rtx_insn* insn)
12887 {
12888 enum attr_type attr_type;
12889 rtx_insn *prev;
12890 rtx body;
12891
12892 if (!TARGET_FIX_ERR_A53_835769)
12893 return false;
12894
12895 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12896 return false;
12897
12898 attr_type = get_attr_type (insn);
12899 if (!is_madd_op (attr_type))
12900 return false;
12901
12902 prev = aarch64_prev_real_insn (insn);
12903 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12904 Restore recog state to INSN to avoid state corruption. */
12905 extract_constrain_insn_cached (insn);
12906
12907 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12908 return false;
12909
12910 body = single_set (prev);
12911
12912 /* If the previous insn is a memory op and there is no dependency between
12913 it and the DImode madd, emit a NOP between them. If body is NULL then we
12914 have a complex memory operation, probably a load/store pair.
12915 Be conservative for now and emit a NOP. */
12916 if (GET_MODE (recog_data.operand[0]) == DImode
12917 && (!body || !dep_between_memop_and_curr (body)))
12918 return true;
12919
12920 return false;
12921
12922 }
12923
12924
12925 /* Implement FINAL_PRESCAN_INSN. */
12926
12927 void
12928 aarch64_final_prescan_insn (rtx_insn *insn)
12929 {
12930 if (aarch64_madd_needs_nop (insn))
12931 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
12932 }
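/* Illustrative effect of the erratum workaround above: for code that loads
   a value and then performs a 64-bit multiply-accumulate, the emitted
   assembly might look roughly like this (registers chosen arbitrarily):

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5
*/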
12933
12934
12935 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12936 instruction. */
12937
12938 bool
12939 aarch64_sve_index_immediate_p (rtx base_or_step)
12940 {
12941 return (CONST_INT_P (base_or_step)
12942 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12943 }
12944
12945 /* Return true if X is a valid immediate for the SVE ADD and SUB
12946 instructions. Negate X first if NEGATE_P is true. */
12947
12948 bool
12949 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12950 {
12951 rtx elt;
12952
12953 if (!const_vec_duplicate_p (x, &elt)
12954 || !CONST_INT_P (elt))
12955 return false;
12956
12957 HOST_WIDE_INT val = INTVAL (elt);
12958 if (negate_p)
12959 val = -val;
12960 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12961
12962 if (val & 0xff)
12963 return IN_RANGE (val, 0, 0xff);
12964 return IN_RANGE (val, 0, 0xff00);
12965 }
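/* For example: a vector duplicate of 3, or of 0x1200 (low byte clear and
   no greater than 0xff00), is accepted above, whereas 0x101 is rejected
   because its low byte is nonzero and the value exceeds 0xff.  */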
12966
12967 /* Return true if X is a valid immediate operand for an SVE logical
12968 instruction such as AND. */
12969
12970 bool
12971 aarch64_sve_bitmask_immediate_p (rtx x)
12972 {
12973 rtx elt;
12974
12975 return (const_vec_duplicate_p (x, &elt)
12976 && CONST_INT_P (elt)
12977 && aarch64_bitmask_imm (INTVAL (elt),
12978 GET_MODE_INNER (GET_MODE (x))));
12979 }
12980
12981 /* Return true if X is a valid immediate for the SVE DUP and CPY
12982 instructions. */
12983
12984 bool
12985 aarch64_sve_dup_immediate_p (rtx x)
12986 {
12987 rtx elt;
12988
12989 if (!const_vec_duplicate_p (x, &elt)
12990 || !CONST_INT_P (elt))
12991 return false;
12992
12993 HOST_WIDE_INT val = INTVAL (elt);
12994 if (val & 0xff)
12995 return IN_RANGE (val, -0x80, 0x7f);
12996 return IN_RANGE (val, -0x8000, 0x7f00);
12997 }
12998
12999 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13000 SIGNED_P says whether the operand is signed rather than unsigned. */
13001
13002 bool
13003 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13004 {
13005 rtx elt;
13006
13007 return (const_vec_duplicate_p (x, &elt)
13008 && CONST_INT_P (elt)
13009 && (signed_p
13010 ? IN_RANGE (INTVAL (elt), -16, 15)
13011 : IN_RANGE (INTVAL (elt), 0, 127)));
13012 }
13013
13014 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13015 instruction. Negate X first if NEGATE_P is true. */
13016
13017 bool
13018 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13019 {
13020 rtx elt;
13021 REAL_VALUE_TYPE r;
13022
13023 if (!const_vec_duplicate_p (x, &elt)
13024 || GET_CODE (elt) != CONST_DOUBLE)
13025 return false;
13026
13027 r = *CONST_DOUBLE_REAL_VALUE (elt);
13028
13029 if (negate_p)
13030 r = real_value_negate (&r);
13031
13032 if (real_equal (&r, &dconst1))
13033 return true;
13034 if (real_equal (&r, &dconsthalf))
13035 return true;
13036 return false;
13037 }
13038
13039 /* Return true if X is a valid immediate operand for an SVE FMUL
13040 instruction. */
13041
13042 bool
13043 aarch64_sve_float_mul_immediate_p (rtx x)
13044 {
13045 rtx elt;
13046
13047 /* GCC will never generate a multiply with an immediate of 2, so there is no
13048 point testing for it (even though it is a valid constant). */
13049 return (const_vec_duplicate_p (x, &elt)
13050 && GET_CODE (elt) == CONST_DOUBLE
13051 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13052 }
13053
13054 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13055 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13056 is nonnull, use it to describe valid immediates. */
13057 static bool
13058 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13059 simd_immediate_info *info,
13060 enum simd_immediate_check which,
13061 simd_immediate_info::insn_type insn)
13062 {
13063 /* Try a 4-byte immediate with LSL. */
13064 for (unsigned int shift = 0; shift < 32; shift += 8)
13065 if ((val32 & (0xff << shift)) == val32)
13066 {
13067 if (info)
13068 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13069 simd_immediate_info::LSL, shift);
13070 return true;
13071 }
13072
13073 /* Try a 2-byte immediate with LSL. */
13074 unsigned int imm16 = val32 & 0xffff;
13075 if (imm16 == (val32 >> 16))
13076 for (unsigned int shift = 0; shift < 16; shift += 8)
13077 if ((imm16 & (0xff << shift)) == imm16)
13078 {
13079 if (info)
13080 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13081 simd_immediate_info::LSL, shift);
13082 return true;
13083 }
13084
13085 /* Try a 4-byte immediate with MSL, except for cases that MVN
13086 can handle. */
13087 if (which == AARCH64_CHECK_MOV)
13088 for (unsigned int shift = 8; shift < 24; shift += 8)
13089 {
13090 unsigned int low = (1 << shift) - 1;
13091 if (((val32 & (0xff << shift)) | low) == val32)
13092 {
13093 if (info)
13094 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13095 simd_immediate_info::MSL, shift);
13096 return true;
13097 }
13098 }
13099
13100 return false;
13101 }
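/* Worked examples (illustrative): VAL32 == 0x00ab0000 matches the 4-byte
   LSL case as byte 0xab shifted left by 16; for the MOV check,
   VAL32 == 0x0000abff matches the MSL case with shift 8, since
   (0xab << 8) | 0xff reproduces the value.  */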
13102
13103 /* Return true if replicating VAL64 is a valid immediate for the
13104 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13105 use it to describe valid immediates. */
13106 static bool
13107 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13108 simd_immediate_info *info,
13109 enum simd_immediate_check which)
13110 {
13111 unsigned int val32 = val64 & 0xffffffff;
13112 unsigned int val16 = val64 & 0xffff;
13113 unsigned int val8 = val64 & 0xff;
13114
13115 if (val32 == (val64 >> 32))
13116 {
13117 if ((which & AARCH64_CHECK_ORR) != 0
13118 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13119 simd_immediate_info::MOV))
13120 return true;
13121
13122 if ((which & AARCH64_CHECK_BIC) != 0
13123 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13124 simd_immediate_info::MVN))
13125 return true;
13126
13127 /* Try using a replicated byte. */
13128 if (which == AARCH64_CHECK_MOV
13129 && val16 == (val32 >> 16)
13130 && val8 == (val16 >> 8))
13131 {
13132 if (info)
13133 *info = simd_immediate_info (QImode, val8);
13134 return true;
13135 }
13136 }
13137
13138 /* Try using a bit-to-bytemask. */
13139 if (which == AARCH64_CHECK_MOV)
13140 {
13141 unsigned int i;
13142 for (i = 0; i < 64; i += 8)
13143 {
13144 unsigned char byte = (val64 >> i) & 0xff;
13145 if (byte != 0 && byte != 0xff)
13146 break;
13147 }
13148 if (i == 64)
13149 {
13150 if (info)
13151 *info = simd_immediate_info (DImode, val64);
13152 return true;
13153 }
13154 }
13155 return false;
13156 }
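/* For example, the bit-to-bytemask case above accepts
   VAL64 == 0xff00ffff0000ff00, because every byte is either 0x00 or 0xff;
   a value containing any other byte, such as 0x12, falls through.  */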
13157
13158 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13159 instruction. If INFO is nonnull, use it to describe valid immediates. */
13160
13161 static bool
13162 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13163 simd_immediate_info *info)
13164 {
13165 scalar_int_mode mode = DImode;
13166 unsigned int val32 = val64 & 0xffffffff;
13167 if (val32 == (val64 >> 32))
13168 {
13169 mode = SImode;
13170 unsigned int val16 = val32 & 0xffff;
13171 if (val16 == (val32 >> 16))
13172 {
13173 mode = HImode;
13174 unsigned int val8 = val16 & 0xff;
13175 if (val8 == (val16 >> 8))
13176 mode = QImode;
13177 }
13178 }
13179 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13180 if (IN_RANGE (val, -0x80, 0x7f))
13181 {
13182 /* DUP with no shift. */
13183 if (info)
13184 *info = simd_immediate_info (mode, val);
13185 return true;
13186 }
13187 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13188 {
13189 /* DUP with LSL #8. */
13190 if (info)
13191 *info = simd_immediate_info (mode, val);
13192 return true;
13193 }
13194 if (aarch64_bitmask_imm (val64, mode))
13195 {
13196 /* DUPM. */
13197 if (info)
13198 *info = simd_immediate_info (mode, val);
13199 return true;
13200 }
13201 return false;
13202 }
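/* For example, VAL64 == 0x2020202020202020 reduces to a QImode value of
   0x20, which lies in [-0x80, 0x7f] and so can be loaded with an unshifted
   DUP; VAL64 == 0x1200120012001200 reduces to HImode 0x1200 and is loaded
   (roughly) as DUP ..., #0x12, LSL #8.  */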
13203
13204 /* Return true if OP is a valid SIMD immediate for the operation
13205 described by WHICH. If INFO is nonnull, use it to describe valid
13206 immediates. */
13207 bool
13208 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13209 enum simd_immediate_check which)
13210 {
13211 machine_mode mode = GET_MODE (op);
13212 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13213 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13214 return false;
13215
13216 scalar_mode elt_mode = GET_MODE_INNER (mode);
13217 rtx base, step;
13218 unsigned int n_elts;
13219 if (GET_CODE (op) == CONST_VECTOR
13220 && CONST_VECTOR_DUPLICATE_P (op))
13221 n_elts = CONST_VECTOR_NPATTERNS (op);
13222 else if ((vec_flags & VEC_SVE_DATA)
13223 && const_vec_series_p (op, &base, &step))
13224 {
13225 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13226 if (!aarch64_sve_index_immediate_p (base)
13227 || !aarch64_sve_index_immediate_p (step))
13228 return false;
13229
13230 if (info)
13231 *info = simd_immediate_info (elt_mode, base, step);
13232 return true;
13233 }
13234 else if (GET_CODE (op) == CONST_VECTOR
13235 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13236 /* N_ELTS set above. */;
13237 else
13238 return false;
13239
13240 /* Handle PFALSE and PTRUE. */
13241 if (vec_flags & VEC_SVE_PRED)
13242 return (op == CONST0_RTX (mode)
13243 || op == CONSTM1_RTX (mode));
13244
13245 scalar_float_mode elt_float_mode;
13246 if (n_elts == 1
13247 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13248 {
13249 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13250 if (aarch64_float_const_zero_rtx_p (elt)
13251 || aarch64_float_const_representable_p (elt))
13252 {
13253 if (info)
13254 *info = simd_immediate_info (elt_float_mode, elt);
13255 return true;
13256 }
13257 }
13258
13259 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13260 if (elt_size > 8)
13261 return false;
13262
13263 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13264
13265 /* Expand the vector constant out into a byte vector, with the least
13266 significant byte of the register first. */
13267 auto_vec<unsigned char, 16> bytes;
13268 bytes.reserve (n_elts * elt_size);
13269 for (unsigned int i = 0; i < n_elts; i++)
13270 {
13271 /* The vector is provided in gcc endian-neutral fashion.
13272 For aarch64_be Advanced SIMD, it must be laid out in the vector
13273 register in reverse order. */
13274 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13275 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13276
13277 if (elt_mode != elt_int_mode)
13278 elt = gen_lowpart (elt_int_mode, elt);
13279
13280 if (!CONST_INT_P (elt))
13281 return false;
13282
13283 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13284 for (unsigned int byte = 0; byte < elt_size; byte++)
13285 {
13286 bytes.quick_push (elt_val & 0xff);
13287 elt_val >>= BITS_PER_UNIT;
13288 }
13289 }
13290
13291 /* The immediate must repeat every eight bytes. */
13292 unsigned int nbytes = bytes.length ();
13293 for (unsigned i = 8; i < nbytes; ++i)
13294 if (bytes[i] != bytes[i - 8])
13295 return false;
13296
13297 /* Get the repeating 8-byte value as an integer. No endian correction
13298 is needed here because bytes is already in lsb-first order. */
13299 unsigned HOST_WIDE_INT val64 = 0;
13300 for (unsigned int i = 0; i < 8; i++)
13301 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13302 << (i * BITS_PER_UNIT));
13303
13304 if (vec_flags & VEC_SVE_DATA)
13305 return aarch64_sve_valid_immediate (val64, info);
13306 else
13307 return aarch64_advsimd_valid_immediate (val64, info, which);
13308 }
13309
13310 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13311    has a step in the range of an SVE INDEX immediate.  Return the step if so,
13312 otherwise return null. */
13313 rtx
13314 aarch64_check_zero_based_sve_index_immediate (rtx x)
13315 {
13316 rtx base, step;
13317 if (const_vec_series_p (x, &base, &step)
13318 && base == const0_rtx
13319 && aarch64_sve_index_immediate_p (step))
13320 return step;
13321 return NULL_RTX;
13322 }
13323
13324 /* Check whether immediate shift constants are within range. */
13325 bool
13326 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13327 {
13328 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13329 if (left)
13330 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13331 else
13332 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13333 }
13334
13335 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13336 operation of width WIDTH at bit position POS. */
13337
13338 rtx
13339 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13340 {
13341 gcc_assert (CONST_INT_P (width));
13342 gcc_assert (CONST_INT_P (pos));
13343
13344 unsigned HOST_WIDE_INT mask
13345 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13346 return GEN_INT (mask << UINTVAL (pos));
13347 }
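/* For example, WIDTH == 8 and POS == 16 yield the mask 0x00ff0000,
   i.e. ((1 << 8) - 1) << 16.  */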
13348
13349 bool
13350 aarch64_mov_operand_p (rtx x, machine_mode mode)
13351 {
13352 if (GET_CODE (x) == HIGH
13353 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13354 return true;
13355
13356 if (CONST_INT_P (x))
13357 return true;
13358
13359 if (VECTOR_MODE_P (GET_MODE (x)))
13360 return aarch64_simd_valid_immediate (x, NULL);
13361
13362 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13363 return true;
13364
13365 if (aarch64_sve_cnt_immediate_p (x))
13366 return true;
13367
13368 return aarch64_classify_symbolic_expression (x)
13369 == SYMBOL_TINY_ABSOLUTE;
13370 }
13371
13372 /* Return a const_int vector of VAL. */
13373 rtx
13374 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13375 {
13376 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13377 return gen_const_vec_duplicate (mode, c);
13378 }
13379
13380 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13381
13382 bool
13383 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13384 {
13385 machine_mode vmode;
13386
13387 vmode = aarch64_simd_container_mode (mode, 64);
13388 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13389 return aarch64_simd_valid_immediate (op_v, NULL);
13390 }
13391
13392 /* Construct and return a PARALLEL RTX vector with elements numbering the
13393 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13394 the vector - from the perspective of the architecture. This does not
13395 line up with GCC's perspective on lane numbers, so we end up with
13396 different masks depending on our target endian-ness. The diagram
13397 below may help. We must draw the distinction when building masks
13398 which select one half of the vector. An instruction selecting
13399    architectural low-lanes for a big-endian target must be described using
13400 a mask selecting GCC high-lanes.
13401
13402 Big-Endian Little-Endian
13403
13404 GCC 0 1 2 3 3 2 1 0
13405 | x | x | x | x | | x | x | x | x |
13406 Architecture 3 2 1 0 3 2 1 0
13407
13408 Low Mask: { 2, 3 } { 0, 1 }
13409 High Mask: { 0, 1 } { 2, 3 }
13410
13411 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13412
13413 rtx
13414 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13415 {
13416 rtvec v = rtvec_alloc (nunits / 2);
13417 int high_base = nunits / 2;
13418 int low_base = 0;
13419 int base;
13420 rtx t1;
13421 int i;
13422
13423 if (BYTES_BIG_ENDIAN)
13424 base = high ? low_base : high_base;
13425 else
13426 base = high ? high_base : low_base;
13427
13428 for (i = 0; i < nunits / 2; i++)
13429 RTVEC_ELT (v, i) = GEN_INT (base + i);
13430
13431 t1 = gen_rtx_PARALLEL (mode, v);
13432 return t1;
13433 }
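/* For example, for V4SImode with HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   diagram above.  */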
13434
13435 /* Check OP for validity as a PARALLEL RTX vector with elements
13436 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13437 from the perspective of the architecture. See the diagram above
13438 aarch64_simd_vect_par_cnst_half for more details. */
13439
13440 bool
13441 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13442 bool high)
13443 {
13444 int nelts;
13445 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13446 return false;
13447
13448 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13449 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13450 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13451 int i = 0;
13452
13453 if (count_op != count_ideal)
13454 return false;
13455
13456 for (i = 0; i < count_ideal; i++)
13457 {
13458 rtx elt_op = XVECEXP (op, 0, i);
13459 rtx elt_ideal = XVECEXP (ideal, 0, i);
13460
13461 if (!CONST_INT_P (elt_op)
13462 || INTVAL (elt_ideal) != INTVAL (elt_op))
13463 return false;
13464 }
13465 return true;
13466 }
13467
13468 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13469 HIGH (exclusive). */
13470 void
13471 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13472 const_tree exp)
13473 {
13474 HOST_WIDE_INT lane;
13475 gcc_assert (CONST_INT_P (operand));
13476 lane = INTVAL (operand);
13477
13478 if (lane < low || lane >= high)
13479 {
13480 if (exp)
13481 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13482 else
13483 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13484 }
13485 }
13486
13487 /* Perform endian correction on lane number N, which indexes a vector
13488 of mode MODE, and return the result as an SImode rtx. */
13489
13490 rtx
13491 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13492 {
13493 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13494 }
13495
13496 /* Return TRUE if OP is a valid vector addressing mode. */
13497
13498 bool
13499 aarch64_simd_mem_operand_p (rtx op)
13500 {
13501 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13502 || REG_P (XEXP (op, 0)));
13503 }
13504
13505 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13506
13507 bool
13508 aarch64_sve_ld1r_operand_p (rtx op)
13509 {
13510 struct aarch64_address_info addr;
13511 scalar_mode mode;
13512
13513 return (MEM_P (op)
13514 && is_a <scalar_mode> (GET_MODE (op), &mode)
13515 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13516 && addr.type == ADDRESS_REG_IMM
13517 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13518 }
13519
13520 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13521 The conditions for STR are the same. */
13522 bool
13523 aarch64_sve_ldr_operand_p (rtx op)
13524 {
13525 struct aarch64_address_info addr;
13526
13527 return (MEM_P (op)
13528 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13529 false, ADDR_QUERY_ANY)
13530 && addr.type == ADDRESS_REG_IMM);
13531 }
13532
13533 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13534 We need to be able to access the individual pieces, so the range
13535 is different from LD[234] and ST[234]. */
13536 bool
13537 aarch64_sve_struct_memory_operand_p (rtx op)
13538 {
13539 if (!MEM_P (op))
13540 return false;
13541
13542 machine_mode mode = GET_MODE (op);
13543 struct aarch64_address_info addr;
13544 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13545 ADDR_QUERY_ANY)
13546 || addr.type != ADDRESS_REG_IMM)
13547 return false;
13548
13549 poly_int64 first = addr.const_offset;
13550 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13551 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13552 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13553 }
13554
13555 /* Emit a register copy from operand to operand, taking care not to
13556 early-clobber source registers in the process.
13557
13558 COUNT is the number of components into which the copy needs to be
13559 decomposed. */
13560 void
13561 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13562 unsigned int count)
13563 {
13564 unsigned int i;
13565 int rdest = REGNO (operands[0]);
13566 int rsrc = REGNO (operands[1]);
13567
13568 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13569 || rdest < rsrc)
13570 for (i = 0; i < count; i++)
13571 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13572 gen_rtx_REG (mode, rsrc + i));
13573 else
13574 for (i = 0; i < count; i++)
13575 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13576 gen_rtx_REG (mode, rsrc + count - i - 1));
13577 }
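/* For illustration: copying the two-register group V1-V2 into V2-V3
   overlaps with RDEST > RSRC, so the loop above copies the components in
   reverse order (V3 <- V2 first, then V2 <- V1) to avoid clobbering V2
   before it has been read.  */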
13578
13579 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13580    one of the VSTRUCT modes: OI, CI, or XI. */
13581 int
13582 aarch64_simd_attr_length_rglist (machine_mode mode)
13583 {
13584 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13585 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13586 }
13587
13588 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13589 alignment of a vector to 128 bits. SVE predicates have an alignment of
13590 16 bits. */
13591 static HOST_WIDE_INT
13592 aarch64_simd_vector_alignment (const_tree type)
13593 {
13594 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13595 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13596 be set for non-predicate vectors of booleans. Modes are the most
13597 direct way we have of identifying real SVE predicate types. */
13598 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13599 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13600 return MIN (align, 128);
13601 }
13602
13603 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13604 static HOST_WIDE_INT
13605 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13606 {
13607 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13608 {
13609 /* If the length of the vector is fixed, try to align to that length,
13610 otherwise don't try to align at all. */
13611 HOST_WIDE_INT result;
13612 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13613 result = TYPE_ALIGN (TREE_TYPE (type));
13614 return result;
13615 }
13616 return TYPE_ALIGN (type);
13617 }
13618
13619 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13620 static bool
13621 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13622 {
13623 if (is_packed)
13624 return false;
13625
13626 /* For fixed-length vectors, check that the vectorizer will aim for
13627 full-vector alignment. This isn't true for generic GCC vectors
13628 that are wider than the ABI maximum of 128 bits. */
13629 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13630 && (wi::to_widest (TYPE_SIZE (type))
13631 != aarch64_vectorize_preferred_vector_alignment (type)))
13632 return false;
13633
13634 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13635 return true;
13636 }
13637
13638 /* Return true if the vector misalignment factor is supported by the
13639 target. */
13640 static bool
13641 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13642 const_tree type, int misalignment,
13643 bool is_packed)
13644 {
13645 if (TARGET_SIMD && STRICT_ALIGNMENT)
13646 {
13647 /* Return if movmisalign pattern is not supported for this mode. */
13648 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13649 return false;
13650
13651 /* Misalignment factor is unknown at compile time. */
13652 if (misalignment == -1)
13653 return false;
13654 }
13655 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13656 is_packed);
13657 }
13658
13659 /* If VALS is a vector constant that can be loaded into a register
13660 using DUP, generate instructions to do so and return an RTX to
13661 assign to the register. Otherwise return NULL_RTX. */
13662 static rtx
13663 aarch64_simd_dup_constant (rtx vals)
13664 {
13665 machine_mode mode = GET_MODE (vals);
13666 machine_mode inner_mode = GET_MODE_INNER (mode);
13667 rtx x;
13668
13669 if (!const_vec_duplicate_p (vals, &x))
13670 return NULL_RTX;
13671
13672 /* We can load this constant by using DUP and a constant in a
13673 single ARM register. This will be cheaper than a vector
13674 load. */
13675 x = copy_to_mode_reg (inner_mode, x);
13676 return gen_vec_duplicate (mode, x);
13677 }
13678
13679
13680 /* Generate code to load VALS, which is a PARALLEL containing only
13681 constants (for vec_init) or CONST_VECTOR, efficiently into a
13682 register. Returns an RTX to copy into the register, or NULL_RTX
13683 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13684 static rtx
13685 aarch64_simd_make_constant (rtx vals)
13686 {
13687 machine_mode mode = GET_MODE (vals);
13688 rtx const_dup;
13689 rtx const_vec = NULL_RTX;
13690 int n_const = 0;
13691 int i;
13692
13693 if (GET_CODE (vals) == CONST_VECTOR)
13694 const_vec = vals;
13695 else if (GET_CODE (vals) == PARALLEL)
13696 {
13697 /* A CONST_VECTOR must contain only CONST_INTs and
13698 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13699 Only store valid constants in a CONST_VECTOR. */
13700 int n_elts = XVECLEN (vals, 0);
13701 for (i = 0; i < n_elts; ++i)
13702 {
13703 rtx x = XVECEXP (vals, 0, i);
13704 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13705 n_const++;
13706 }
13707 if (n_const == n_elts)
13708 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13709 }
13710 else
13711 gcc_unreachable ();
13712
13713 if (const_vec != NULL_RTX
13714 && aarch64_simd_valid_immediate (const_vec, NULL))
13715 /* Load using MOVI/MVNI. */
13716 return const_vec;
13717 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13718 /* Loaded using DUP. */
13719 return const_dup;
13720 else if (const_vec != NULL_RTX)
13721 /* Load from constant pool. We can not take advantage of single-cycle
13722 LD1 because we need a PC-relative addressing mode. */
13723 return const_vec;
13724 else
13725 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13726 We can not construct an initializer. */
13727 return NULL_RTX;
13728 }
13729
13730 /* Expand a vector initialisation sequence, such that TARGET is
13731 initialised to contain VALS. */
13732
13733 void
13734 aarch64_expand_vector_init (rtx target, rtx vals)
13735 {
13736 machine_mode mode = GET_MODE (target);
13737 scalar_mode inner_mode = GET_MODE_INNER (mode);
13738 /* The number of vector elements. */
13739 int n_elts = XVECLEN (vals, 0);
13740 /* The number of vector elements which are not constant. */
13741 int n_var = 0;
13742 rtx any_const = NULL_RTX;
13743 /* The first element of vals. */
13744 rtx v0 = XVECEXP (vals, 0, 0);
13745 bool all_same = true;
13746
13747 /* Count the number of variable elements to initialise. */
13748 for (int i = 0; i < n_elts; ++i)
13749 {
13750 rtx x = XVECEXP (vals, 0, i);
13751 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13752 ++n_var;
13753 else
13754 any_const = x;
13755
13756 all_same &= rtx_equal_p (x, v0);
13757 }
13758
13759 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13760 how best to handle this. */
13761 if (n_var == 0)
13762 {
13763 rtx constant = aarch64_simd_make_constant (vals);
13764 if (constant != NULL_RTX)
13765 {
13766 emit_move_insn (target, constant);
13767 return;
13768 }
13769 }
13770
13771 /* Splat a single non-constant element if we can. */
13772 if (all_same)
13773 {
13774 rtx x = copy_to_mode_reg (inner_mode, v0);
13775 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13776 return;
13777 }
13778
13779 enum insn_code icode = optab_handler (vec_set_optab, mode);
13780 gcc_assert (icode != CODE_FOR_nothing);
13781
13782 /* If there are only variable elements, try to optimize
13783 the insertion using dup for the most common element
13784 followed by insertions. */
13785
13786 /* The algorithm will fill matches[*][0] with the earliest matching element,
13787 and matches[X][1] with the count of duplicate elements (if X is the
13788    earliest element which has duplicates).  */
13789
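  /* For example, for VALS == { a, b, a, a } the loop below records
     matches[0][1] == 3 (elements 0, 2 and 3 all match element 0), so the
     code duplicates A across the vector and then inserts B into lane 1.  */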
13790 if (n_var == n_elts && n_elts <= 16)
13791 {
13792 int matches[16][2] = {0};
13793 for (int i = 0; i < n_elts; i++)
13794 {
13795 for (int j = 0; j <= i; j++)
13796 {
13797 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13798 {
13799 matches[i][0] = j;
13800 matches[j][1]++;
13801 break;
13802 }
13803 }
13804 }
13805 int maxelement = 0;
13806 int maxv = 0;
13807 for (int i = 0; i < n_elts; i++)
13808 if (matches[i][1] > maxv)
13809 {
13810 maxelement = i;
13811 maxv = matches[i][1];
13812 }
13813
13814 /* Create a duplicate of the most common element, unless all elements
13815 are equally useless to us, in which case just immediately set the
13816 vector register using the first element. */
13817
13818 if (maxv == 1)
13819 {
13820 /* For vectors of two 64-bit elements, we can do even better. */
13821 if (n_elts == 2
13822 && (inner_mode == E_DImode
13823 || inner_mode == E_DFmode))
13824
13825 {
13826 rtx x0 = XVECEXP (vals, 0, 0);
13827 rtx x1 = XVECEXP (vals, 0, 1);
13828 /* Combine can pick up this case, but handling it directly
13829 here leaves clearer RTL.
13830
13831 This is load_pair_lanes<mode>, and also gives us a clean-up
13832 for store_pair_lanes<mode>. */
13833 if (memory_operand (x0, inner_mode)
13834 && memory_operand (x1, inner_mode)
13835 && !STRICT_ALIGNMENT
13836 && rtx_equal_p (XEXP (x1, 0),
13837 plus_constant (Pmode,
13838 XEXP (x0, 0),
13839 GET_MODE_SIZE (inner_mode))))
13840 {
13841 rtx t;
13842 if (inner_mode == DFmode)
13843 t = gen_load_pair_lanesdf (target, x0, x1);
13844 else
13845 t = gen_load_pair_lanesdi (target, x0, x1);
13846 emit_insn (t);
13847 return;
13848 }
13849 }
13850 /* The subreg-move sequence below will move into lane zero of the
13851 vector register. For big-endian we want that position to hold
13852 the last element of VALS. */
13853 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13854 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13855 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13856 }
13857 else
13858 {
13859 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13860 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13861 }
13862
13863 /* Insert the rest. */
13864 for (int i = 0; i < n_elts; i++)
13865 {
13866 rtx x = XVECEXP (vals, 0, i);
13867 if (matches[i][0] == maxelement)
13868 continue;
13869 x = copy_to_mode_reg (inner_mode, x);
13870 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13871 }
13872 return;
13873 }
13874
13875 /* Initialise a vector which is part-variable. We want to first try
13876 to build those lanes which are constant in the most efficient way we
13877 can. */
13878 if (n_var != n_elts)
13879 {
13880 rtx copy = copy_rtx (vals);
13881
13882 /* Load constant part of vector. We really don't care what goes into the
13883 parts we will overwrite, but we're more likely to be able to load the
13884 constant efficiently if it has fewer, larger, repeating parts
13885 (see aarch64_simd_valid_immediate). */
13886 for (int i = 0; i < n_elts; i++)
13887 {
13888 rtx x = XVECEXP (vals, 0, i);
13889 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13890 continue;
13891 rtx subst = any_const;
13892 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13893 {
13894 /* Look in the copied vector, as more elements are const. */
13895 rtx test = XVECEXP (copy, 0, i ^ bit);
13896 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13897 {
13898 subst = test;
13899 break;
13900 }
13901 }
13902 XVECEXP (copy, 0, i) = subst;
13903 }
13904 aarch64_expand_vector_init (target, copy);
13905 }
13906
13907 /* Insert the variable lanes directly. */
13908 for (int i = 0; i < n_elts; i++)
13909 {
13910 rtx x = XVECEXP (vals, 0, i);
13911 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13912 continue;
13913 x = copy_to_mode_reg (inner_mode, x);
13914 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13915 }
13916 }
13917
13918 static unsigned HOST_WIDE_INT
13919 aarch64_shift_truncation_mask (machine_mode mode)
13920 {
13921 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13922 return 0;
13923 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13924 }
13925
13926 /* Select a format to encode pointers in exception handling data. */
13927 int
13928 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13929 {
13930 int type;
13931 switch (aarch64_cmodel)
13932 {
13933 case AARCH64_CMODEL_TINY:
13934 case AARCH64_CMODEL_TINY_PIC:
13935 case AARCH64_CMODEL_SMALL:
13936 case AARCH64_CMODEL_SMALL_PIC:
13937 case AARCH64_CMODEL_SMALL_SPIC:
13938 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13939 for everything. */
13940 type = DW_EH_PE_sdata4;
13941 break;
13942 default:
13943 /* No assumptions here. 8-byte relocs required. */
13944 type = DW_EH_PE_sdata8;
13945 break;
13946 }
13947 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13948 }
13949
13950 /* The last .arch and .tune assembly strings that we printed. */
13951 static std::string aarch64_last_printed_arch_string;
13952 static std::string aarch64_last_printed_tune_string;
13953
13954 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13955 by the function fndecl. */
13956
13957 void
13958 aarch64_declare_function_name (FILE *stream, const char* name,
13959 tree fndecl)
13960 {
13961 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13962
13963 struct cl_target_option *targ_options;
13964 if (target_parts)
13965 targ_options = TREE_TARGET_OPTION (target_parts);
13966 else
13967 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13968 gcc_assert (targ_options);
13969
13970 const struct processor *this_arch
13971 = aarch64_get_arch (targ_options->x_explicit_arch);
13972
13973 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13974 std::string extension
13975 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13976 this_arch->flags);
13977 /* Only update the assembler .arch string if it is distinct from the last
13978 such string we printed. */
13979 std::string to_print = this_arch->name + extension;
13980 if (to_print != aarch64_last_printed_arch_string)
13981 {
13982 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
13983 aarch64_last_printed_arch_string = to_print;
13984 }
13985
13986   /* Print the cpu name we're tuning for in the comments; this might be
13987 useful to readers of the generated asm. Do it only when it changes
13988 from function to function and verbose assembly is requested. */
13989 const struct processor *this_tune
13990 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
13991
13992 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
13993 {
13994 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
13995 this_tune->name);
13996 aarch64_last_printed_tune_string = this_tune->name;
13997 }
13998
13999 /* Don't forget the type directive for ELF. */
14000 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14001 ASM_OUTPUT_LABEL (stream, name);
14002 }
14003
14004 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14005
14006 static void
14007 aarch64_start_file (void)
14008 {
14009 struct cl_target_option *default_options
14010 = TREE_TARGET_OPTION (target_option_default_node);
14011
14012 const struct processor *default_arch
14013 = aarch64_get_arch (default_options->x_explicit_arch);
14014 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14015 std::string extension
14016 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14017 default_arch->flags);
14018
14019 aarch64_last_printed_arch_string = default_arch->name + extension;
14020 aarch64_last_printed_tune_string = "";
14021 asm_fprintf (asm_out_file, "\t.arch %s\n",
14022 aarch64_last_printed_arch_string.c_str ());
14023
14024 default_file_start ();
14025 }
14026
14027 /* Emit load exclusive. */
14028
14029 static void
14030 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14031 rtx mem, rtx model_rtx)
14032 {
14033 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
14034 }
14035
14036 /* Emit store exclusive. */
14037
14038 static void
14039 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14040 rtx rval, rtx mem, rtx model_rtx)
14041 {
14042 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
14043 }
14044
14045 /* Mark the previous jump instruction as unlikely. */
14046
14047 static void
14048 aarch64_emit_unlikely_jump (rtx insn)
14049 {
14050 rtx_insn *jump = emit_jump_insn (insn);
14051 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14052 }
14053
14054 /* Expand a compare and swap pattern. */
14055
14056 void
14057 aarch64_expand_compare_and_swap (rtx operands[])
14058 {
14059 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14060 machine_mode mode, cmp_mode;
14061
14062 bval = operands[0];
14063 rval = operands[1];
14064 mem = operands[2];
14065 oldval = operands[3];
14066 newval = operands[4];
14067 is_weak = operands[5];
14068 mod_s = operands[6];
14069 mod_f = operands[7];
14070 mode = GET_MODE (mem);
14071 cmp_mode = mode;
14072
14073 /* Normally the succ memory model must be stronger than fail, but in the
14074 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14075 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14076
14077 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14078 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14079 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14080
14081 switch (mode)
14082 {
14083 case E_QImode:
14084 case E_HImode:
14085 /* For short modes, we're going to perform the comparison in SImode,
14086 so do the zero-extension now. */
14087 cmp_mode = SImode;
14088 rval = gen_reg_rtx (SImode);
14089 oldval = convert_modes (SImode, mode, oldval, true);
14090 /* Fall through. */
14091
14092 case E_SImode:
14093 case E_DImode:
14094 /* Force the value into a register if needed. */
14095 if (!aarch64_plus_operand (oldval, mode))
14096 oldval = force_reg (cmp_mode, oldval);
14097 break;
14098
14099 default:
14100 gcc_unreachable ();
14101 }
14102
14103 if (TARGET_LSE)
14104 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
14105 newval, is_weak, mod_s,
14106 mod_f));
14107 else
14108 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
14109 is_weak, mod_s, mod_f));
14110
14111
14112 if (mode == QImode || mode == HImode)
14113 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14114
14115 x = gen_rtx_REG (CCmode, CC_REGNUM);
14116 x = gen_rtx_EQ (SImode, x, const0_rtx);
14117 emit_insn (gen_rtx_SET (bval, x));
14118 }
14119
14120 /* Test whether the target supports using an atomic load-operate instruction
14121    for operation CODE.  (Whether the caller wants the value of memory before
14122    or after the operation does not affect this check; see
14123    aarch64_gen_atomic_ldop.)  Returns FALSE if the operation isn't supported
14124    by the architecture. */
14125
14126 bool
14127 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14128 {
14129 if (!TARGET_LSE)
14130 return false;
14131
14132 switch (code)
14133 {
14134 case SET:
14135 case AND:
14136 case IOR:
14137 case XOR:
14138 case MINUS:
14139 case PLUS:
14140 return true;
14141 default:
14142 return false;
14143 }
14144 }
14145
14146 /* Emit a barrier appropriate for memory model MODEL at the end of a
14147 sequence implementing an atomic operation. */
14148
14149 static void
14150 aarch64_emit_post_barrier (enum memmodel model)
14151 {
14152 const enum memmodel base_model = memmodel_base (model);
14153
14154 if (is_mm_sync (model)
14155 && (base_model == MEMMODEL_ACQUIRE
14156 || base_model == MEMMODEL_ACQ_REL
14157 || base_model == MEMMODEL_SEQ_CST))
14158 {
14159 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14160 }
14161 }
14162
14163 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14164 for the data in memory. EXPECTED is the value expected to be in memory.
14165 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14166 is the memory ordering to use. */
14167
14168 void
14169 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14170 rtx expected, rtx desired,
14171 rtx model)
14172 {
14173 machine_mode mode;
14174
14175 mode = GET_MODE (mem);
14176
14177 /* Move the expected value into the CAS destination register. */
14178 emit_insn (gen_rtx_SET (rval, expected));
14179
14180 /* Emit the CAS. */
14181 emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
14182
14183 /* Compare the expected value with the value loaded by the CAS, to establish
14184 whether the swap was made. */
14185 aarch64_gen_compare_reg (EQ, rval, expected);
14186 }
14187
14188 /* Split a compare and swap pattern. */
14189
14190 void
14191 aarch64_split_compare_and_swap (rtx operands[])
14192 {
14193 rtx rval, mem, oldval, newval, scratch;
14194 machine_mode mode;
14195 bool is_weak;
14196 rtx_code_label *label1, *label2;
14197 rtx x, cond;
14198 enum memmodel model;
14199 rtx model_rtx;
14200
14201 rval = operands[0];
14202 mem = operands[1];
14203 oldval = operands[2];
14204 newval = operands[3];
14205 is_weak = (operands[4] != const0_rtx);
14206 model_rtx = operands[5];
14207 scratch = operands[7];
14208 mode = GET_MODE (mem);
14209 model = memmodel_from_int (INTVAL (model_rtx));
14210
14211 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14212 loop:
14213 .label1:
14214 LD[A]XR rval, [mem]
14215 CBNZ rval, .label2
14216 ST[L]XR scratch, newval, [mem]
14217 CBNZ scratch, .label1
14218 .label2:
14219 CMP rval, 0. */
14220 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14221
14222 label1 = NULL;
14223 if (!is_weak)
14224 {
14225 label1 = gen_label_rtx ();
14226 emit_label (label1);
14227 }
14228 label2 = gen_label_rtx ();
14229
14230 /* The initial load can be relaxed for a __sync operation since a final
14231 barrier will be emitted to stop code hoisting. */
14232 if (is_mm_sync (model))
14233 aarch64_emit_load_exclusive (mode, rval, mem,
14234 GEN_INT (MEMMODEL_RELAXED));
14235 else
14236 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14237
14238 if (strong_zero_p)
14239 {
14240 if (aarch64_track_speculation)
14241 {
14242 /* Emit an explicit compare instruction, so that we can correctly
14243 track the condition codes. */
14244 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
14245 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14246 }
14247 else
14248 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14249
14250 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14251 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14252 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14253 }
14254 else
14255 {
14256 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14257 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14258 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14259 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14260 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14261 }
14262
14263 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14264
14265 if (!is_weak)
14266 {
14267 if (aarch64_track_speculation)
14268 {
14269 /* Emit an explicit compare instruction, so that we can correctly
14270 track the condition codes. */
14271 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
14272 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14273 }
14274 else
14275 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14276
14277 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14278 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14279 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14280 }
14281 else
14282 {
14283 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14284 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14285 emit_insn (gen_rtx_SET (cond, x));
14286 }
14287
14288 emit_label (label2);
14289 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14290 to set the condition flags. If this is not used it will be removed by
14291 later passes. */
14292 if (strong_zero_p)
14293 {
14294 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14295 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14296 emit_insn (gen_rtx_SET (cond, x));
14297 }
14298 /* Emit any final barrier needed for a __sync operation. */
14299 if (is_mm_sync (model))
14300 aarch64_emit_post_barrier (model);
14301 }
14302
14303 /* Emit a BIC instruction. */
14304
14305 static void
14306 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14307 {
14308 rtx shift_rtx = GEN_INT (shift);
14309 rtx (*gen) (rtx, rtx, rtx, rtx);
14310
14311 switch (mode)
14312 {
14313 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14314 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14315 default:
14316 gcc_unreachable ();
14317 }
14318
14319 emit_insn (gen (dst, s2, shift_rtx, s1));
14320 }
14321
14322 /* Emit an atomic swap. */
14323
14324 static void
14325 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14326 rtx mem, rtx model)
14327 {
14328 emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
14329 }
14330
14331 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14332 location to store the data read from memory. OUT_RESULT is the location to
14333 store the result of the operation. MEM is the memory location to read and
14334 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14335 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14336 be NULL. */
14337
14338 void
14339 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14340 rtx mem, rtx value, rtx model_rtx)
14341 {
14342 machine_mode mode = GET_MODE (mem);
14343 machine_mode wmode = (mode == DImode ? DImode : SImode);
14344 const bool short_mode = (mode < SImode);
14345 int ldop_code;
14346 rtx src;
14347 rtx x;
14348
14349 if (out_data)
14350 out_data = gen_lowpart (mode, out_data);
14351
14352 if (out_result)
14353 out_result = gen_lowpart (mode, out_result);
14354
14355 /* Make sure the value is in a register, putting it into a destination
14356 register if it needs to be manipulated. */
14357 if (!register_operand (value, mode)
14358 || code == AND || code == MINUS)
14359 {
14360 src = out_result ? out_result : out_data;
14361 emit_move_insn (src, gen_lowpart (mode, value));
14362 }
14363 else
14364 src = value;
14365 gcc_assert (register_operand (src, mode));
14366
14367 /* Preprocess the data for the operation as necessary. If the operation is
14368 a SET then emit a swap instruction and finish. */
14369 switch (code)
14370 {
14371 case SET:
14372 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14373 return;
14374
14375 case MINUS:
14376 /* Negate the value and treat it as a PLUS. */
14377 {
14378 rtx neg_src;
14379
14380 /* Resize the value if necessary. */
14381 if (short_mode)
14382 src = gen_lowpart (wmode, src);
14383
14384 neg_src = gen_rtx_NEG (wmode, src);
14385 emit_insn (gen_rtx_SET (src, neg_src));
14386
14387 if (short_mode)
14388 src = gen_lowpart (mode, src);
14389 }
14390 /* Fall-through. */
14391 case PLUS:
14392 ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
14393 break;
14394
14395 case IOR:
14396 ldop_code = UNSPECV_ATOMIC_LDOP_OR;
14397 break;
14398
14399 case XOR:
14400 ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
14401 break;
14402
14403 case AND:
14404 {
14405 rtx not_src;
14406
14407 /* Resize the value if necessary. */
14408 if (short_mode)
14409 src = gen_lowpart (wmode, src);
14410
14411 not_src = gen_rtx_NOT (wmode, src);
14412 emit_insn (gen_rtx_SET (src, not_src));
14413
14414 if (short_mode)
14415 src = gen_lowpart (mode, src);
14416 }
14417 ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
14418 break;
14419
14420 default:
14421 /* The operation can't be done with atomic instructions. */
14422 gcc_unreachable ();
14423 }
14424
14425 emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
14426 out_data, mem, src, model_rtx));
14427
14428 /* If necessary, calculate the data in memory after the update by redoing the
14429 operation from values in registers. */
14430 if (!out_result)
14431 return;
14432
14433 if (short_mode)
14434 {
14435 src = gen_lowpart (wmode, src);
14436 out_data = gen_lowpart (wmode, out_data);
14437 out_result = gen_lowpart (wmode, out_result);
14438 }
14439
14440 x = NULL_RTX;
14441
14442 switch (code)
14443 {
14444 case MINUS:
14445 case PLUS:
14446 x = gen_rtx_PLUS (wmode, out_data, src);
14447 break;
14448 case IOR:
14449 x = gen_rtx_IOR (wmode, out_data, src);
14450 break;
14451 case XOR:
14452 x = gen_rtx_XOR (wmode, out_data, src);
14453 break;
14454 case AND:
14455 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14456 return;
14457 default:
14458 gcc_unreachable ();
14459 }
14460
14461 emit_set_insn (out_result, x);
14462
14463 return;
14464 }
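/* Note (illustrative): because LSE provides a bit-clear form rather than a
   direct AND, an atomic fetch-and with value V is handled above by first
   complementing V and then issuing the BIC-style load-operate, so the memory
   update computed is DATA & ~(~V), i.e. DATA & V.  */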
14465
14466 /* Split an atomic operation. */
14467
14468 void
14469 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14470 rtx value, rtx model_rtx, rtx cond)
14471 {
14472 machine_mode mode = GET_MODE (mem);
14473 machine_mode wmode = (mode == DImode ? DImode : SImode);
14474 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14475 const bool is_sync = is_mm_sync (model);
14476 rtx_code_label *label;
14477 rtx x;
14478
14479 /* Split the atomic operation into a sequence. */
14480 label = gen_label_rtx ();
14481 emit_label (label);
14482
14483 if (new_out)
14484 new_out = gen_lowpart (wmode, new_out);
14485 if (old_out)
14486 old_out = gen_lowpart (wmode, old_out);
14487 else
14488 old_out = new_out;
14489 value = simplify_gen_subreg (wmode, value, mode, 0);
14490
14491 /* The initial load can be relaxed for a __sync operation since a final
14492 barrier will be emitted to stop code hoisting. */
14493 if (is_sync)
14494 aarch64_emit_load_exclusive (mode, old_out, mem,
14495 GEN_INT (MEMMODEL_RELAXED));
14496 else
14497 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14498
14499 switch (code)
14500 {
14501 case SET:
14502 new_out = value;
14503 break;
14504
14505 case NOT:
14506 x = gen_rtx_AND (wmode, old_out, value);
14507 emit_insn (gen_rtx_SET (new_out, x));
14508 x = gen_rtx_NOT (wmode, new_out);
14509 emit_insn (gen_rtx_SET (new_out, x));
14510 break;
14511
14512 case MINUS:
14513 if (CONST_INT_P (value))
14514 {
14515 value = GEN_INT (-INTVAL (value));
14516 code = PLUS;
14517 }
14518 /* Fall through. */
14519
14520 default:
14521 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14522 emit_insn (gen_rtx_SET (new_out, x));
14523 break;
14524 }
14525
14526 aarch64_emit_store_exclusive (mode, cond, mem,
14527 gen_lowpart (mode, new_out), model_rtx);
14528
14529 if (aarch64_track_speculation)
14530 {
14531 /* Emit an explicit compare instruction, so that we can correctly
14532 track the condition codes. */
14533 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
14534 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14535 }
14536 else
14537 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14538
14539 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14540 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14541 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14542
14543 /* Emit any final barrier needed for a __sync operation. */
14544 if (is_sync)
14545 aarch64_emit_post_barrier (model);
14546 }
14547
14548 static void
14549 aarch64_init_libfuncs (void)
14550 {
14551 /* Half-precision float operations. The compiler handles all operations
14552 with NULL libfuncs by converting to SFmode. */
14553
14554 /* Conversions. */
14555 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14556 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14557
14558 /* Arithmetic. */
14559 set_optab_libfunc (add_optab, HFmode, NULL);
14560 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14561 set_optab_libfunc (smul_optab, HFmode, NULL);
14562 set_optab_libfunc (neg_optab, HFmode, NULL);
14563 set_optab_libfunc (sub_optab, HFmode, NULL);
14564
14565 /* Comparisons. */
14566 set_optab_libfunc (eq_optab, HFmode, NULL);
14567 set_optab_libfunc (ne_optab, HFmode, NULL);
14568 set_optab_libfunc (lt_optab, HFmode, NULL);
14569 set_optab_libfunc (le_optab, HFmode, NULL);
14570 set_optab_libfunc (ge_optab, HFmode, NULL);
14571 set_optab_libfunc (gt_optab, HFmode, NULL);
14572 set_optab_libfunc (unord_optab, HFmode, NULL);
14573 }
14574
14575 /* Target hook for c_mode_for_suffix. */
14576 static machine_mode
14577 aarch64_c_mode_for_suffix (char suffix)
14578 {
14579 if (suffix == 'q')
14580 return TFmode;
14581
14582 return VOIDmode;
14583 }
14584
14585 /* We can only represent floating point constants which will fit in
14586 "quarter-precision" values. These values are characterised by
14587 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14588 by:
14589
14590 (-1)^s * (n/16) * 2^r
14591
14592 Where:
14593 's' is the sign bit.
14594 'n' is an integer in the range 16 <= n <= 31.
14595 'r' is an integer in the range -3 <= r <= 4. */
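/* For example, reading values off the formula above:
      0.125 = (16/16) * 2^-3	(smallest positive representable value)
      1.0   = (16/16) * 2^0
      31.0  = (31/16) * 2^4	(largest representable value).  */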
14596
14597 /* Return true iff X can be represented by a quarter-precision
14598 floating point immediate operand. Note, we cannot represent 0.0. */
14599 bool
14600 aarch64_float_const_representable_p (rtx x)
14601 {
14602 /* This represents our current view of how many bits
14603 make up the mantissa. */
14604 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14605 int exponent;
14606 unsigned HOST_WIDE_INT mantissa, mask;
14607 REAL_VALUE_TYPE r, m;
14608 bool fail;
14609
14610 if (!CONST_DOUBLE_P (x))
14611 return false;
14612
14613 if (GET_MODE (x) == VOIDmode
14614 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
14615 return false;
14616
14617 r = *CONST_DOUBLE_REAL_VALUE (x);
14618
14619 /* We cannot represent infinities, NaNs or +/-zero. We won't
14620 know if we have +zero until we analyse the mantissa, but we
14621 can reject the other invalid values. */
14622 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14623 || REAL_VALUE_MINUS_ZERO (r))
14624 return false;
14625
14626 /* Extract exponent. */
14627 r = real_value_abs (&r);
14628 exponent = REAL_EXP (&r);
14629
14630 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
14631 highest (sign) bit, with a fixed binary point at bit point_pos.
14632 The low element of W holds the low part of the mantissa and the high element the high part.
14633 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14634 bits for the mantissa, this can fail (low bits will be lost). */
14635 real_ldexp (&m, &r, point_pos - exponent);
14636 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14637
14638 /* If the low part of the mantissa has bits set we cannot represent
14639 the value. */
14640 if (w.ulow () != 0)
14641 return false;
14642 /* We have rejected the lower HOST_WIDE_INT, so update our
14643 understanding of how many bits lie in the mantissa and
14644 look only at the high HOST_WIDE_INT. */
14645 mantissa = w.elt (1);
14646 point_pos -= HOST_BITS_PER_WIDE_INT;
14647
14648 /* We can only represent values with a mantissa of the form 1.xxxx. */
14649 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14650 if ((mantissa & mask) != 0)
14651 return false;
14652
14653 /* Having filtered unrepresentable values, we may now remove all
14654 but the highest 5 bits. */
14655 mantissa >>= point_pos - 5;
14656
14657 /* We cannot represent the value 0.0, so reject it. This is handled
14658 elsewhere. */
14659 if (mantissa == 0)
14660 return false;
14661
14662 /* Then, as bit 4 is always set, we can mask it off, leaving
14663 the mantissa in the range [0, 15]. */
14664 mantissa &= ~(1 << 4);
14665 gcc_assert (mantissa <= 15);
14666
14667 /* GCC internally does not use IEEE754-like encoding (where normalized
14668 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
14669 Our mantissa values are shifted 4 places to the left relative to
14670 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14671 by 5 places to correct for GCC's representation. */
14672 exponent = 5 - exponent;
14673
14674 return (exponent >= 0 && exponent <= 7);
14675 }
14676
14677 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14678 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14679 output MOVI/MVNI, ORR or BIC immediate. */
14680 char*
14681 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14682 enum simd_immediate_check which)
14683 {
14684 bool is_valid;
14685 static char templ[40];
14686 const char *mnemonic;
14687 const char *shift_op;
14688 unsigned int lane_count = 0;
14689 char element_char;
14690
14691 struct simd_immediate_info info;
14692
14693 /* This will return true to show const_vector is legal for use as either
14694 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14695 It will also update INFO to show how the immediate should be generated.
14696 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14697 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14698 gcc_assert (is_valid);
14699
14700 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14701 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14702
14703 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14704 {
14705 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14706 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14707 move immediate path. */
14708 if (aarch64_float_const_zero_rtx_p (info.value))
14709 info.value = GEN_INT (0);
14710 else
14711 {
14712 const unsigned int buf_size = 20;
14713 char float_buf[buf_size] = {'\0'};
14714 real_to_decimal_for_mode (float_buf,
14715 CONST_DOUBLE_REAL_VALUE (info.value),
14716 buf_size, buf_size, 1, info.elt_mode);
14717
14718 if (lane_count == 1)
14719 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14720 else
14721 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14722 lane_count, element_char, float_buf);
14723 return templ;
14724 }
14725 }
14726
14727 gcc_assert (CONST_INT_P (info.value));
14728
14729 if (which == AARCH64_CHECK_MOV)
14730 {
14731 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14732 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14733 if (lane_count == 1)
14734 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14735 mnemonic, UINTVAL (info.value));
14736 else if (info.shift)
14737 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14738 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14739 element_char, UINTVAL (info.value), shift_op, info.shift);
14740 else
14741 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14742 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14743 element_char, UINTVAL (info.value));
14744 }
14745 else
14746 {
14747 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14748 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14749 if (info.shift)
14750 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14751 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14752 element_char, UINTVAL (info.value), "lsl", info.shift);
14753 else
14754 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14755 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14756 element_char, UINTVAL (info.value));
14757 }
14758 return templ;
14759 }
14760
14761 char*
14762 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14763 {
14764
14765 /* If a floating point number was passed and we desire to use it in an
14766 integer mode, do the conversion to integer. */
14767 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14768 {
14769 unsigned HOST_WIDE_INT ival;
14770 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14771 gcc_unreachable ();
14772 immediate = gen_int_mode (ival, mode);
14773 }
14774
14775 machine_mode vmode;
14776 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14777 a 128-bit vector mode. */
14778 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14779
14780 vmode = aarch64_simd_container_mode (mode, width);
14781 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14782 return aarch64_output_simd_mov_immediate (v_op, width);
14783 }
14784
14785 /* Return the output string to use for moving immediate CONST_VECTOR
14786 into an SVE register. */
14787
14788 char *
14789 aarch64_output_sve_mov_immediate (rtx const_vector)
14790 {
14791 static char templ[40];
14792 struct simd_immediate_info info;
14793 char element_char;
14794
14795 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14796 gcc_assert (is_valid);
14797
14798 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14799
14800 if (info.step)
14801 {
14802 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14803 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14804 element_char, INTVAL (info.value), INTVAL (info.step));
14805 return templ;
14806 }
14807
14808 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14809 {
14810 if (aarch64_float_const_zero_rtx_p (info.value))
14811 info.value = GEN_INT (0);
14812 else
14813 {
14814 const int buf_size = 20;
14815 char float_buf[buf_size] = {};
14816 real_to_decimal_for_mode (float_buf,
14817 CONST_DOUBLE_REAL_VALUE (info.value),
14818 buf_size, buf_size, 1, info.elt_mode);
14819
14820 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14821 element_char, float_buf);
14822 return templ;
14823 }
14824 }
14825
14826 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14827 element_char, INTVAL (info.value));
14828 return templ;
14829 }
14830
14831 /* Return the asm format for a PTRUE instruction whose destination has
14832 mode MODE. SUFFIX is the element size suffix. */
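/* For example (illustrative only, the suffix is chosen by the caller):
   a VNx4BImode destination with a known 128-bit vector length gives
   "ptrue\t%0.s, vl4", while a variable-length destination gives
   "ptrue\t%0.s, all".  */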
14833
14834 char *
14835 aarch64_output_ptrue (machine_mode mode, char suffix)
14836 {
14837 unsigned int nunits;
14838 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14839 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14840 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14841 else
14842 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14843 return buf;
14844 }
14845
14846 /* Split operands into moves from op[1] + op[2] into op[0]. */
14847
14848 void
14849 aarch64_split_combinev16qi (rtx operands[3])
14850 {
14851 unsigned int dest = REGNO (operands[0]);
14852 unsigned int src1 = REGNO (operands[1]);
14853 unsigned int src2 = REGNO (operands[2]);
14854 machine_mode halfmode = GET_MODE (operands[1]);
14855 unsigned int halfregs = REG_NREGS (operands[1]);
14856 rtx destlo, desthi;
14857
14858 gcc_assert (halfmode == V16QImode);
14859
14860 if (src1 == dest && src2 == dest + halfregs)
14861 {
14862 /* No-op move. Can't split to nothing; emit something. */
14863 emit_note (NOTE_INSN_DELETED);
14864 return;
14865 }
14866
14867 /* Preserve register attributes for variable tracking. */
14868 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14869 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14870 GET_MODE_SIZE (halfmode));
14871
14872 /* Special case of reversed high/low parts: swap them with three XORs, avoiding a scratch register. */
14873 if (reg_overlap_mentioned_p (operands[2], destlo)
14874 && reg_overlap_mentioned_p (operands[1], desthi))
14875 {
14876 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14877 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14878 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14879 }
14880 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14881 {
14882 /* Try to avoid unnecessary moves if part of the result
14883 is in the right place already. */
14884 if (src1 != dest)
14885 emit_move_insn (destlo, operands[1]);
14886 if (src2 != dest + halfregs)
14887 emit_move_insn (desthi, operands[2]);
14888 }
14889 else
14890 {
14891 if (src2 != dest + halfregs)
14892 emit_move_insn (desthi, operands[2]);
14893 if (src1 != dest)
14894 emit_move_insn (destlo, operands[1]);
14895 }
14896 }
14897
14898 /* vec_perm support. */
14899
14900 struct expand_vec_perm_d
14901 {
14902 rtx target, op0, op1;
14903 vec_perm_indices perm;
14904 machine_mode vmode;
14905 unsigned int vec_flags;
14906 bool one_vector_p;
14907 bool testing_p;
14908 };
14909
14910 /* Generate a variable permutation. */
14911
14912 static void
14913 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14914 {
14915 machine_mode vmode = GET_MODE (target);
14916 bool one_vector_p = rtx_equal_p (op0, op1);
14917
14918 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14919 gcc_checking_assert (GET_MODE (op0) == vmode);
14920 gcc_checking_assert (GET_MODE (op1) == vmode);
14921 gcc_checking_assert (GET_MODE (sel) == vmode);
14922 gcc_checking_assert (TARGET_SIMD);
14923
14924 if (one_vector_p)
14925 {
14926 if (vmode == V8QImode)
14927 {
14928 /* Expand the argument to a V16QI mode by duplicating it. */
14929 rtx pair = gen_reg_rtx (V16QImode);
14930 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14931 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14932 }
14933 else
14934 {
14935 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14936 }
14937 }
14938 else
14939 {
14940 rtx pair;
14941
14942 if (vmode == V8QImode)
14943 {
14944 pair = gen_reg_rtx (V16QImode);
14945 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14946 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14947 }
14948 else
14949 {
14950 pair = gen_reg_rtx (OImode);
14951 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14952 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14953 }
14954 }
14955 }
14956
14957 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14958 NELT is the number of elements in the vector. */
14959
14960 void
14961 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14962 unsigned int nelt)
14963 {
14964 machine_mode vmode = GET_MODE (target);
14965 bool one_vector_p = rtx_equal_p (op0, op1);
14966 rtx mask;
14967
14968 /* The TBL instruction does not use a modulo index, so we must take care
14969 of that ourselves. */
14970 mask = aarch64_simd_gen_const_vector_dup (vmode,
14971 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14972 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14973
14974 /* For big-endian, we also need to reverse the index within the vector
14975 (but not which vector). */
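/* For example, with NELT == 8 and two input vectors, XORing with 7 maps
   index 3 to 4 and index 10 to 13: the position within each input is
   mirrored while bit 3, which selects between the inputs, is preserved.  */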
14976 if (BYTES_BIG_ENDIAN)
14977 {
14978 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14979 if (!one_vector_p)
14980 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
14981 sel = expand_simple_binop (vmode, XOR, sel, mask,
14982 NULL, 0, OPTAB_LIB_WIDEN);
14983 }
14984 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
14985 }
14986
14987 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
14988
14989 static void
14990 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
14991 {
14992 emit_insn (gen_rtx_SET (target,
14993 gen_rtx_UNSPEC (GET_MODE (target),
14994 gen_rtvec (2, op0, op1), code)));
14995 }
14996
14997 /* Expand an SVE vec_perm with the given operands. */
14998
14999 void
15000 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15001 {
15002 machine_mode data_mode = GET_MODE (target);
15003 machine_mode sel_mode = GET_MODE (sel);
15004 /* Enforced by the pattern condition. */
15005 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15006
15007 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15008 size of the two value vectors, i.e. the upper bits of the indices
15009 are effectively ignored. SVE TBL instead produces 0 for any
15010 out-of-range indices, so we need to modulo all the vec_perm indices
15011 to ensure they are all in range. */
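/* As a worked example, assume NUNITS == 4 and that index 5 (element 1
   of OP1) is requested.  The first TBL sees 5, which is out of range
   for OP0, and yields 0; the second TBL sees 5 - 4 == 1 and yields
   OP1's element 1; ORing the two partial results gives the element we
   want.  An index such as 2 works the other way around: the first TBL
   supplies OP0's element 2 and the second one yields 0.  */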
15012 rtx sel_reg = force_reg (sel_mode, sel);
15013
15014 /* Check if the sel only references the first values vector. */
15015 if (GET_CODE (sel) == CONST_VECTOR
15016 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15017 {
15018 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15019 return;
15020 }
15021
15022 /* Check if the two values vectors are the same. */
15023 if (rtx_equal_p (op0, op1))
15024 {
15025 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15026 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15027 NULL, 0, OPTAB_DIRECT);
15028 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15029 return;
15030 }
15031
15032 /* Run TBL for each value vector and combine the results. */
15033
15034 rtx res0 = gen_reg_rtx (data_mode);
15035 rtx res1 = gen_reg_rtx (data_mode);
15036 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15037 if (GET_CODE (sel) != CONST_VECTOR
15038 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15039 {
15040 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15041 2 * nunits - 1);
15042 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15043 NULL, 0, OPTAB_DIRECT);
15044 }
15045 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15046 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15047 NULL, 0, OPTAB_DIRECT);
15048 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15049 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15050 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15051 else
15052 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15053 }
15054
15055 /* Recognize patterns suitable for the TRN instructions. */
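/* For example, for V4SImode on little-endian the index vector
   { 0, 4, 2, 6 } matches TRN1 and { 1, 5, 3, 7 } matches TRN2.  */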
15056 static bool
15057 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15058 {
15059 HOST_WIDE_INT odd;
15060 poly_uint64 nelt = d->perm.length ();
15061 rtx out, in0, in1, x;
15062 machine_mode vmode = d->vmode;
15063
15064 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15065 return false;
15066
15067 /* Note that these are little-endian tests.
15068 We correct for big-endian later. */
15069 if (!d->perm[0].is_constant (&odd)
15070 || (odd != 0 && odd != 1)
15071 || !d->perm.series_p (0, 2, odd, 2)
15072 || !d->perm.series_p (1, 2, nelt + odd, 2))
15073 return false;
15074
15075 /* Success! */
15076 if (d->testing_p)
15077 return true;
15078
15079 in0 = d->op0;
15080 in1 = d->op1;
15081 /* We don't need a big-endian lane correction for SVE; see the comment
15082 at the head of aarch64-sve.md for details. */
15083 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15084 {
15085 x = in0, in0 = in1, in1 = x;
15086 odd = !odd;
15087 }
15088 out = d->target;
15089
15090 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15091 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15092 return true;
15093 }
15094
15095 /* Recognize patterns suitable for the UZP instructions. */
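/* For example, for V4SImode on little-endian the index vector
   { 0, 2, 4, 6 } matches UZP1 and { 1, 3, 5, 7 } matches UZP2.  */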
15096 static bool
15097 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15098 {
15099 HOST_WIDE_INT odd;
15100 rtx out, in0, in1, x;
15101 machine_mode vmode = d->vmode;
15102
15103 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15104 return false;
15105
15106 /* Note that these are little-endian tests.
15107 We correct for big-endian later. */
15108 if (!d->perm[0].is_constant (&odd)
15109 || (odd != 0 && odd != 1)
15110 || !d->perm.series_p (0, 1, odd, 2))
15111 return false;
15112
15113 /* Success! */
15114 if (d->testing_p)
15115 return true;
15116
15117 in0 = d->op0;
15118 in1 = d->op1;
15119 /* We don't need a big-endian lane correction for SVE; see the comment
15120 at the head of aarch64-sve.md for details. */
15121 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15122 {
15123 x = in0, in0 = in1, in1 = x;
15124 odd = !odd;
15125 }
15126 out = d->target;
15127
15128 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15129 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15130 return true;
15131 }
15132
15133 /* Recognize patterns suitable for the ZIP instructions. */
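/* For example, for V4SImode on little-endian the index vector
   { 0, 4, 1, 5 } matches ZIP1 and { 2, 6, 3, 7 } matches ZIP2.  */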
15134 static bool
15135 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15136 {
15137 unsigned int high;
15138 poly_uint64 nelt = d->perm.length ();
15139 rtx out, in0, in1, x;
15140 machine_mode vmode = d->vmode;
15141
15142 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15143 return false;
15144
15145 /* Note that these are little-endian tests.
15146 We correct for big-endian later. */
15147 poly_uint64 first = d->perm[0];
15148 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15149 || !d->perm.series_p (0, 2, first, 1)
15150 || !d->perm.series_p (1, 2, first + nelt, 1))
15151 return false;
15152 high = maybe_ne (first, 0U);
15153
15154 /* Success! */
15155 if (d->testing_p)
15156 return true;
15157
15158 in0 = d->op0;
15159 in1 = d->op1;
15160 /* We don't need a big-endian lane correction for SVE; see the comment
15161 at the head of aarch64-sve.md for details. */
15162 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15163 {
15164 x = in0, in0 = in1, in1 = x;
15165 high = !high;
15166 }
15167 out = d->target;
15168
15169 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15170 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15171 return true;
15172 }
15173
15174 /* Recognize patterns for the EXT insn. */
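/* For example, for V4SImode the index vector { 1, 2, 3, 4 } selects the
   last three elements of the first input followed by the first element
   of the second, which is what EXT with an element offset of 1 does.  */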
15175
15176 static bool
15177 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15178 {
15179 HOST_WIDE_INT location;
15180 rtx offset;
15181
15182 /* The first element always refers to the first vector.
15183 Check if the extracted indices are increasing by one. */
15184 if (d->vec_flags == VEC_SVE_PRED
15185 || !d->perm[0].is_constant (&location)
15186 || !d->perm.series_p (0, 1, location, 1))
15187 return false;
15188
15189 /* Success! */
15190 if (d->testing_p)
15191 return true;
15192
15193 /* The case where (location == 0) is a no-op for both big- and little-endian,
15194 and is removed by the mid-end at optimization levels -O1 and higher.
15195
15196 We don't need a big-endian lane correction for SVE; see the comment
15197 at the head of aarch64-sve.md for details. */
15198 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15199 {
15200 /* After setup, we want the high elements of the first vector (stored
15201 at the LSB end of the register), and the low elements of the second
15202 vector (stored at the MSB end of the register). So swap. */
15203 std::swap (d->op0, d->op1);
15204 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15205 to_constant () is safe since this is restricted to Advanced SIMD
15206 vectors. */
15207 location = d->perm.length ().to_constant () - location;
15208 }
15209
15210 offset = GEN_INT (location);
15211 emit_set_insn (d->target,
15212 gen_rtx_UNSPEC (d->vmode,
15213 gen_rtvec (3, d->op0, d->op1, offset),
15214 UNSPEC_EXT));
15215 return true;
15216 }
15217
15218 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15219 within each 64-bit, 32-bit or 16-bit granule. */
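/* For example, for V8HImode the index vector { 3, 2, 1, 0, 7, 6, 5, 4 }
   reverses the four 16-bit elements within each 64-bit granule and so
   matches REV64.  */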
15220
15221 static bool
15222 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15223 {
15224 HOST_WIDE_INT diff;
15225 unsigned int i, size, unspec;
15226 machine_mode pred_mode;
15227
15228 if (d->vec_flags == VEC_SVE_PRED
15229 || !d->one_vector_p
15230 || !d->perm[0].is_constant (&diff))
15231 return false;
15232
15233 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15234 if (size == 8)
15235 {
15236 unspec = UNSPEC_REV64;
15237 pred_mode = VNx2BImode;
15238 }
15239 else if (size == 4)
15240 {
15241 unspec = UNSPEC_REV32;
15242 pred_mode = VNx4BImode;
15243 }
15244 else if (size == 2)
15245 {
15246 unspec = UNSPEC_REV16;
15247 pred_mode = VNx8BImode;
15248 }
15249 else
15250 return false;
15251
15252 unsigned int step = diff + 1;
15253 for (i = 0; i < step; ++i)
15254 if (!d->perm.series_p (i, step, diff - i, step))
15255 return false;
15256
15257 /* Success! */
15258 if (d->testing_p)
15259 return true;
15260
15261 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15262 if (d->vec_flags == VEC_SVE_DATA)
15263 {
15264 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15265 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15266 UNSPEC_MERGE_PTRUE);
15267 }
15268 emit_set_insn (d->target, src);
15269 return true;
15270 }
15271
15272 /* Recognize patterns for the REV insn, which reverses elements within
15273 a full vector. */
15274
15275 static bool
15276 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15277 {
15278 poly_uint64 nelt = d->perm.length ();
15279
15280 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15281 return false;
15282
15283 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15284 return false;
15285
15286 /* Success! */
15287 if (d->testing_p)
15288 return true;
15289
15290 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15291 emit_set_insn (d->target, src);
15292 return true;
15293 }
15294
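/* Recognize patterns suitable for the DUP (element) instruction, e.g.
   { 2, 2, 2, 2 } for V4SImode, which broadcasts a single element of the
   first input.  */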
15295 static bool
15296 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15297 {
15298 rtx out = d->target;
15299 rtx in0;
15300 HOST_WIDE_INT elt;
15301 machine_mode vmode = d->vmode;
15302 rtx lane;
15303
15304 if (d->vec_flags == VEC_SVE_PRED
15305 || d->perm.encoding ().encoded_nelts () != 1
15306 || !d->perm[0].is_constant (&elt))
15307 return false;
15308
15309 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15310 return false;
15311
15312 /* Success! */
15313 if (d->testing_p)
15314 return true;
15315
15316 /* The generic preparation in aarch64_expand_vec_perm_const_1
15317 swaps the operand order and the permute indices if it finds
15318 d->perm[0] to be in the second operand. Thus, we can always
15319 use d->op0 and need not do any extra arithmetic to get the
15320 correct lane number. */
15321 in0 = d->op0;
15322 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15323
15324 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15325 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15326 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15327 return true;
15328 }
15329
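/* Fall back to a TBL-based permutation for Advanced SIMD vectors when the
   indices are compile-time constants but no single-instruction pattern
   above applies.  */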
15330 static bool
15331 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15332 {
15333 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15334 machine_mode vmode = d->vmode;
15335
15336 /* Make sure that the indices are constant. */
15337 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15338 for (unsigned int i = 0; i < encoded_nelts; ++i)
15339 if (!d->perm[i].is_constant ())
15340 return false;
15341
15342 if (d->testing_p)
15343 return true;
15344
15345 /* Generic code will try constant permutation twice: once with the
15346 original mode and again with the elements lowered to QImode.
15347 So wait and don't do the selector expansion ourselves. */
15348 if (vmode != V8QImode && vmode != V16QImode)
15349 return false;
15350
15351 /* to_constant is safe since this routine is specific to Advanced SIMD
15352 vectors. */
15353 unsigned int nelt = d->perm.length ().to_constant ();
15354 for (unsigned int i = 0; i < nelt; ++i)
15355 /* If big-endian and two vectors we end up with a weird mixed-endian
15356 mode on NEON. Reverse the index within each word but not the word
15357 itself. to_constant is safe because we checked is_constant above. */
15358 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15359 ? d->perm[i].to_constant () ^ (nelt - 1)
15360 : d->perm[i].to_constant ());
15361
15362 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15363 sel = force_reg (vmode, sel);
15364
15365 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15366 return true;
15367 }
15368
15369 /* Try to implement D using an SVE TBL instruction. */
15370
15371 static bool
15372 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15373 {
15374 unsigned HOST_WIDE_INT nelt;
15375
15376 /* Permuting two variable-length vectors could overflow the
15377 index range. */
15378 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15379 return false;
15380
15381 if (d->testing_p)
15382 return true;
15383
15384 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15385 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15386 if (d->one_vector_p)
15387 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
15388 else
15389 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15390 return true;
15391 }
15392
15393 static bool
15394 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15395 {
15396 /* The pattern matching functions above are written to look for a small
15397 number to begin the sequence (0, 1, N/2). If we begin with an index
15398 from the second operand, we can swap the operands. */
15399 poly_int64 nelt = d->perm.length ();
15400 if (known_ge (d->perm[0], nelt))
15401 {
15402 d->perm.rotate_inputs (1);
15403 std::swap (d->op0, d->op1);
15404 }
15405
15406 if ((d->vec_flags == VEC_ADVSIMD
15407 || d->vec_flags == VEC_SVE_DATA
15408 || d->vec_flags == VEC_SVE_PRED)
15409 && known_gt (nelt, 1))
15410 {
15411 if (aarch64_evpc_rev_local (d))
15412 return true;
15413 else if (aarch64_evpc_rev_global (d))
15414 return true;
15415 else if (aarch64_evpc_ext (d))
15416 return true;
15417 else if (aarch64_evpc_dup (d))
15418 return true;
15419 else if (aarch64_evpc_zip (d))
15420 return true;
15421 else if (aarch64_evpc_uzp (d))
15422 return true;
15423 else if (aarch64_evpc_trn (d))
15424 return true;
15425 if (d->vec_flags == VEC_SVE_DATA)
15426 return aarch64_evpc_sve_tbl (d);
15427 else if (d->vec_flags == VEC_ADVSIMD)
15428 return aarch64_evpc_tbl (d);
15429 }
15430 return false;
15431 }
15432
15433 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15434
15435 static bool
15436 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15437 rtx op1, const vec_perm_indices &sel)
15438 {
15439 struct expand_vec_perm_d d;
15440
15441 /* Check whether the mask can be applied to a single vector. */
15442 if (sel.ninputs () == 1
15443 || (op0 && rtx_equal_p (op0, op1)))
15444 d.one_vector_p = true;
15445 else if (sel.all_from_input_p (0))
15446 {
15447 d.one_vector_p = true;
15448 op1 = op0;
15449 }
15450 else if (sel.all_from_input_p (1))
15451 {
15452 d.one_vector_p = true;
15453 op0 = op1;
15454 }
15455 else
15456 d.one_vector_p = false;
15457
15458 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15459 sel.nelts_per_input ());
15460 d.vmode = vmode;
15461 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15462 d.target = target;
15463 d.op0 = op0;
15464 d.op1 = op1;
15465 d.testing_p = !target;
15466
15467 if (!d.testing_p)
15468 return aarch64_expand_vec_perm_const_1 (&d);
15469
15470 rtx_insn *last = get_last_insn ();
15471 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15472 gcc_assert (last == get_last_insn ());
15473
15474 return ret;
15475 }
15476
15477 /* Generate a byte permute mask for a register of mode MODE,
15478 which has NUNITS units. */
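/* For example, for V4SImode (NUNITS == 4, 4-byte units) the generated
   byte indices are { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. the bytes are reversed within each element.  */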
15479
15480 rtx
15481 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15482 {
15483 /* We have to reverse each vector because we don't have
15484 a permuted load that can reverse-load according to ABI rules. */
15485 rtx mask;
15486 rtvec v = rtvec_alloc (16);
15487 unsigned int i, j;
15488 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15489
15490 gcc_assert (BYTES_BIG_ENDIAN);
15491 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15492
15493 for (i = 0; i < nunits; i++)
15494 for (j = 0; j < usize; j++)
15495 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15496 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15497 return force_reg (V16QImode, mask);
15498 }
15499
15500 /* Return true if X is a valid second operand for the SVE instruction
15501 that implements integer comparison OP_CODE. */
15502
15503 static bool
15504 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15505 {
15506 if (register_operand (x, VOIDmode))
15507 return true;
15508
15509 switch (op_code)
15510 {
15511 case LTU:
15512 case LEU:
15513 case GEU:
15514 case GTU:
15515 return aarch64_sve_cmp_immediate_p (x, false);
15516 case LT:
15517 case LE:
15518 case GE:
15519 case GT:
15520 case NE:
15521 case EQ:
15522 return aarch64_sve_cmp_immediate_p (x, true);
15523 default:
15524 gcc_unreachable ();
15525 }
15526 }
15527
15528 /* Use predicated SVE instructions to implement the equivalent of:
15529
15530 (set TARGET OP)
15531
15532 given that PTRUE is an all-true predicate of the appropriate mode. */
15533
15534 static void
15535 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15536 {
15537 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15538 gen_rtvec (2, ptrue, op),
15539 UNSPEC_MERGE_PTRUE);
15540 rtx_insn *insn = emit_set_insn (target, unspec);
15541 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15542 }
15543
15544 /* Likewise, but also clobber the condition codes. */
15545
15546 static void
15547 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15548 {
15549 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15550 gen_rtvec (2, ptrue, op),
15551 UNSPEC_MERGE_PTRUE);
15552 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15553 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15554 }
15555
15556 /* Return the UNSPEC_COND_* code for comparison CODE. */
15557
15558 static unsigned int
15559 aarch64_unspec_cond_code (rtx_code code)
15560 {
15561 switch (code)
15562 {
15563 case NE:
15564 return UNSPEC_COND_NE;
15565 case EQ:
15566 return UNSPEC_COND_EQ;
15567 case LT:
15568 return UNSPEC_COND_LT;
15569 case GT:
15570 return UNSPEC_COND_GT;
15571 case LE:
15572 return UNSPEC_COND_LE;
15573 case GE:
15574 return UNSPEC_COND_GE;
15575 default:
15576 gcc_unreachable ();
15577 }
15578 }
15579
15580 /* Emit:
15581
15582 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15583
15584 where <X> is the operation associated with comparison CODE. This form
15585 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15586 semantics, such as when PRED might not be all-true and when comparing
15587 inactive lanes could have side effects. */
15588
15589 static void
15590 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15591 rtx pred, rtx op0, rtx op1)
15592 {
15593 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15594 gen_rtvec (3, pred, op0, op1),
15595 aarch64_unspec_cond_code (code));
15596 emit_set_insn (target, unspec);
15597 }
15598
15599 /* Expand an SVE integer comparison using the SVE equivalent of:
15600
15601 (set TARGET (CODE OP0 OP1)). */
15602
15603 void
15604 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15605 {
15606 machine_mode pred_mode = GET_MODE (target);
15607 machine_mode data_mode = GET_MODE (op0);
15608
15609 if (!aarch64_sve_cmp_operand_p (code, op1))
15610 op1 = force_reg (data_mode, op1);
15611
15612 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15613 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15614 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15615 }
15616
15617 /* Emit the SVE equivalent of:
15618
15619 (set TMP1 (CODE1 OP0 OP1))
15620 (set TMP2 (CODE2 OP0 OP1))
15621 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15622
15623 PTRUE is an all-true predicate with the same mode as TARGET. */
15624
15625 static void
15626 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15627 rtx ptrue, rtx op0, rtx op1)
15628 {
15629 machine_mode pred_mode = GET_MODE (ptrue);
15630 rtx tmp1 = gen_reg_rtx (pred_mode);
15631 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15632 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15633 rtx tmp2 = gen_reg_rtx (pred_mode);
15634 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15635 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15636 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15637 }
15638
15639 /* Emit the SVE equivalent of:
15640
15641 (set TMP (CODE OP0 OP1))
15642 (set TARGET (not TMP))
15643
15644 PTRUE is an all-true predicate with the same mode as TARGET. */
15645
15646 static void
15647 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15648 rtx op0, rtx op1)
15649 {
15650 machine_mode pred_mode = GET_MODE (ptrue);
15651 rtx tmp = gen_reg_rtx (pred_mode);
15652 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15653 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15654 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15655 }
15656
15657 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15658
15659 (set TARGET (CODE OP0 OP1))
15660
15661 If CAN_INVERT_P is true, the caller can also handle inverted results;
15662 return true if the result is in fact inverted. */
15663
15664 bool
15665 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15666 rtx op0, rtx op1, bool can_invert_p)
15667 {
15668 machine_mode pred_mode = GET_MODE (target);
15669 machine_mode data_mode = GET_MODE (op0);
15670
15671 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15672 switch (code)
15673 {
15674 case UNORDERED:
15675 /* UNORDERED has no immediate form. */
15676 op1 = force_reg (data_mode, op1);
15677 /* fall through */
15678 case LT:
15679 case LE:
15680 case GT:
15681 case GE:
15682 case EQ:
15683 case NE:
15684 {
15685 /* There is native support for the comparison. */
15686 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15687 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15688 return false;
15689 }
15690
15691 case LTGT:
15692 /* This is a trapping operation (LT or GT). */
15693 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15694 return false;
15695
15696 case UNEQ:
15697 if (!flag_trapping_math)
15698 {
15699 /* This would trap for signaling NaNs. */
15700 op1 = force_reg (data_mode, op1);
15701 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15702 return false;
15703 }
15704 /* fall through */
15705 case UNLT:
15706 case UNLE:
15707 case UNGT:
15708 case UNGE:
15709 if (flag_trapping_math)
15710 {
15711 /* Work out which elements are ordered. */
15712 rtx ordered = gen_reg_rtx (pred_mode);
15713 op1 = force_reg (data_mode, op1);
15714 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15715
15716 /* Test the opposite condition for the ordered elements,
15717 then invert the result. */
15718 if (code == UNEQ)
15719 code = NE;
15720 else
15721 code = reverse_condition_maybe_unordered (code);
15722 if (can_invert_p)
15723 {
15724 aarch64_emit_sve_predicated_cond (target, code,
15725 ordered, op0, op1);
15726 return true;
15727 }
15728 rtx tmp = gen_reg_rtx (pred_mode);
15729 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15730 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15731 return false;
15732 }
15733 break;
15734
15735 case ORDERED:
15736 /* ORDERED has no immediate form. */
15737 op1 = force_reg (data_mode, op1);
15738 break;
15739
15740 default:
15741 gcc_unreachable ();
15742 }
15743
15744 /* There is native support for the inverse comparison. */
15745 code = reverse_condition_maybe_unordered (code);
15746 if (can_invert_p)
15747 {
15748 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15749 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15750 return true;
15751 }
15752 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15753 return false;
15754 }
15755
15756 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15757 of the data being selected and CMP_MODE is the mode of the values being
15758 compared. */
15759
15760 void
15761 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15762 rtx *ops)
15763 {
15764 machine_mode pred_mode
15765 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15766 GET_MODE_SIZE (cmp_mode)).require ();
15767 rtx pred = gen_reg_rtx (pred_mode);
15768 if (FLOAT_MODE_P (cmp_mode))
15769 {
15770 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15771 ops[4], ops[5], true))
15772 std::swap (ops[1], ops[2]);
15773 }
15774 else
15775 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15776
15777 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15778 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15779 }
15780
15781 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15782 true. However due to issues with register allocation it is preferable
15783 to avoid tying integer scalar and FP scalar modes. Executing integer
15784 operations in general registers is better than treating them as scalar
15785 vector operations. This reduces latency and avoids redundant int<->FP
15786 moves. So tie modes if they are either the same class, or vector modes
15787 with other vector modes, vector structs or any scalar mode. */
15788
15789 static bool
15790 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15791 {
15792 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15793 return true;
15794
15795 /* We specifically want to allow elements of "structure" modes to
15796 be tieable to the structure. This more general condition allows
15797 other rarer situations too. The reason we don't extend this to
15798 predicate modes is that there are no predicate structure modes
15799 nor any specific instructions for extracting part of a predicate
15800 register. */
15801 if (aarch64_vector_data_mode_p (mode1)
15802 && aarch64_vector_data_mode_p (mode2))
15803 return true;
15804
15805 /* Also allow any scalar modes with vectors. */
15806 if (aarch64_vector_mode_supported_p (mode1)
15807 || aarch64_vector_mode_supported_p (mode2))
15808 return true;
15809
15810 return false;
15811 }
15812
15813 /* Return a new RTX holding the result of moving POINTER forward by
15814 AMOUNT bytes. */
15815
15816 static rtx
15817 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15818 {
15819 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15820
15821 return adjust_automodify_address (pointer, GET_MODE (pointer),
15822 next, amount);
15823 }
15824
15825 /* Return a new RTX holding the result of moving POINTER forward by the
15826 size of the mode it points to. */
15827
15828 static rtx
15829 aarch64_progress_pointer (rtx pointer)
15830 {
15831 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15832 }
15833
15834 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15835 MODE bytes. */
15836
15837 static void
15838 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15839 machine_mode mode)
15840 {
15841 rtx reg = gen_reg_rtx (mode);
15842
15843 /* "Cast" the pointers to the correct mode. */
15844 *src = adjust_address (*src, mode, 0);
15845 *dst = adjust_address (*dst, mode, 0);
15846 /* Emit the memcpy. */
15847 emit_move_insn (reg, *src);
15848 emit_move_insn (*dst, reg);
15849 /* Move the pointers forward. */
15850 *src = aarch64_progress_pointer (*src);
15851 *dst = aarch64_progress_pointer (*dst);
15852 }
15853
15854 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15855 we succeed, otherwise return false. */
15856
15857 bool
15858 aarch64_expand_movmem (rtx *operands)
15859 {
15860 int n, mode_bits;
15861 rtx dst = operands[0];
15862 rtx src = operands[1];
15863 rtx base;
15864 machine_mode cur_mode = BLKmode, next_mode;
15865 bool speed_p = !optimize_function_for_size_p (cfun);
15866
15867 /* When optimizing for size, give a better estimate of the length of a
15868 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15869 will always require an even number of instructions to perform. And each
15870 operation requires both a load and a store, so divide the max number by 2. */
15871 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
15872
15873 /* We can't do anything smart if the amount to copy is not constant. */
15874 if (!CONST_INT_P (operands[2]))
15875 return false;
15876
15877 n = INTVAL (operands[2]);
15878
15879 /* Try to keep the number of instructions low. For all cases we will do at
15880 most two moves for the residual amount, since we'll always overlap the
15881 remainder. */
15882 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
15883 return false;
15884
15885 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15886 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15887
15888 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15889 src = adjust_automodify_address (src, VOIDmode, base, 0);
15890
15891 /* Convert n to bits to make the rest of the code simpler. */
15892 n = n * BITS_PER_UNIT;
15893
15894 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
15895 larger than TImode, but we should not use them for loads/stores here. */
15896 const int copy_limit = GET_MODE_BITSIZE (TImode);
15897
15898 while (n > 0)
15899 {
15900 /* Find the largest mode in which to do the copy without over-reading
15901 or over-writing. */
15902 opt_scalar_int_mode mode_iter;
15903 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
15904 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
15905 cur_mode = mode_iter.require ();
15906
15907 gcc_assert (cur_mode != BLKmode);
15908
15909 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
15910 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
15911
15912 n -= mode_bits;
15913
15914 /* Do certain trailing copies as overlapping if it's going to be
15915 cheaper, i.e. fewer instructions to do so. For instance, for a 15
15916 byte copy it's more efficient to do two overlapping 8 byte copies than
15917 8 + 6 + 1. */
15918 if (n > 0 && n <= 8 * BITS_PER_UNIT)
15919 {
15920 next_mode = smallest_mode_for_size (n, MODE_INT);
15921 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
15922 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
15923 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
15924 n = n_bits;
15925 }
15926 }
15927
15928 return true;
15929 }
15930
15931 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15932 SImode stores. Handle the case when the constant has identical
15933 bottom and top halves. This is beneficial when the two stores can be
15934 merged into an STP and we avoid synthesising potentially expensive
15935 immediates twice. Return true if such a split is possible. */
15936
15937 bool
15938 aarch64_split_dimode_const_store (rtx dst, rtx src)
15939 {
15940 rtx lo = gen_lowpart (SImode, src);
15941 rtx hi = gen_highpart_mode (SImode, DImode, src);
15942
15943 bool size_p = optimize_function_for_size_p (cfun);
15944
15945 if (!rtx_equal_p (lo, hi))
15946 return false;
15947
15948 unsigned int orig_cost
15949 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15950 unsigned int lo_cost
15951 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15952
15953 /* We want to transform:
15954 MOV x1, 49370
15955 MOVK x1, 0x140, lsl 16
15956 MOVK x1, 0xc0da, lsl 32
15957 MOVK x1, 0x140, lsl 48
15958 STR x1, [x0]
15959 into:
15960 MOV w1, 49370
15961 MOVK w1, 0x140, lsl 16
15962 STP w1, w1, [x0]
15963 So we want to perform this only when we save two instructions
15964 or more. When optimizing for size, however, accept any code size
15965 savings we can. */
15966 if (size_p && orig_cost <= lo_cost)
15967 return false;
15968
15969 if (!size_p
15970 && (orig_cost <= lo_cost + 1))
15971 return false;
15972
15973 rtx mem_lo = adjust_address (dst, SImode, 0);
15974 if (!aarch64_mem_pair_operand (mem_lo, SImode))
15975 return false;
15976
15977 rtx tmp_reg = gen_reg_rtx (SImode);
15978 aarch64_expand_mov_immediate (tmp_reg, lo);
15979 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
15980 /* Don't emit an explicit store pair as this may not always be profitable.
15981 Let the sched-fusion logic decide whether to merge them. */
15982 emit_move_insn (mem_lo, tmp_reg);
15983 emit_move_insn (mem_hi, tmp_reg);
15984
15985 return true;
15986 }
15987
15988 /* Generate RTL for a conditional branch with rtx comparison CODE in
15989 mode CC_MODE. The destination of the unlikely conditional branch
15990 is LABEL_REF. */
15991
15992 void
15993 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
15994 rtx label_ref)
15995 {
15996 rtx x;
15997 x = gen_rtx_fmt_ee (code, VOIDmode,
15998 gen_rtx_REG (cc_mode, CC_REGNUM),
15999 const0_rtx);
16000
16001 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16002 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16003 pc_rtx);
16004 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16005 }
16006
16007 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16008
16009 OP1 represents the TImode source operand 1
16010 OP2 represents the TImode source operand 2
16011 LOW_DEST represents the low half (DImode) of TImode operand 0
16012 LOW_IN1 represents the low half (DImode) of TImode operand 1
16013 LOW_IN2 represents the low half (DImode) of TImode operand 2
16014 HIGH_DEST represents the high half (DImode) of TImode operand 0
16015 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16016 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16017
16018 void
16019 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16020 rtx *low_in1, rtx *low_in2,
16021 rtx *high_dest, rtx *high_in1,
16022 rtx *high_in2)
16023 {
16024 *low_dest = gen_reg_rtx (DImode);
16025 *low_in1 = gen_lowpart (DImode, op1);
16026 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16027 subreg_lowpart_offset (DImode, TImode));
16028 *high_dest = gen_reg_rtx (DImode);
16029 *high_in1 = gen_highpart (DImode, op1);
16030 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16031 subreg_highpart_offset (DImode, TImode));
16032 }
16033
16034 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16035
16036 This function differs from 'aarch64_addti_scratch_regs' in that
16037 OP1 can be an immediate constant (zero). We must call
16038 subreg_highpart_offset with DImode and TImode arguments, otherwise
16039 VOIDmode will be used for the const_int, which generates an internal
16040 error from subreg_size_highpart_offset, which does not expect a size of zero.
16041
16042 OP1 represents the TImode source operand 1
16043 OP2 represents the TImode source operand 2
16044 LOW_DEST represents the low half (DImode) of TImode operand 0
16045 LOW_IN1 represents the low half (DImode) of TImode operand 1
16046 LOW_IN2 represents the low half (DImode) of TImode operand 2
16047 HIGH_DEST represents the high half (DImode) of TImode operand 0
16048 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16049 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16050
16051
16052 void
16053 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16054 rtx *low_in1, rtx *low_in2,
16055 rtx *high_dest, rtx *high_in1,
16056 rtx *high_in2)
16057 {
16058 *low_dest = gen_reg_rtx (DImode);
16059 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16060 subreg_lowpart_offset (DImode, TImode));
16061
16062 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16063 subreg_lowpart_offset (DImode, TImode));
16064 *high_dest = gen_reg_rtx (DImode);
16065
16066 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16067 subreg_highpart_offset (DImode, TImode));
16068 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16069 subreg_highpart_offset (DImode, TImode));
16070 }
16071
16072 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16073
16074 OP0 represents the TImode destination operand 0
16075 LOW_DEST represents the low half (DImode) of TImode operand 0
16076 LOW_IN1 represents the low half (DImode) of TImode operand 1
16077 LOW_IN2 represents the low half (DImode) of TImode operand 2
16078 HIGH_DEST represents the high half (DImode) of TImode operand 0
16079 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16080 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16081
16082 void
16083 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16084 rtx low_in2, rtx high_dest, rtx high_in1,
16085 rtx high_in2)
16086 {
16087 if (low_in2 == const0_rtx)
16088 {
16089 low_dest = low_in1;
16090 emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
16091 force_reg (DImode, high_in2)));
16092 }
16093 else
16094 {
16095 if (CONST_INT_P (low_in2))
16096 {
16097 low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
16098 high_in2 = force_reg (DImode, high_in2);
16099 emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
16100 }
16101 else
16102 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16103 emit_insn (gen_subdi3_carryinCV (high_dest,
16104 force_reg (DImode, high_in1),
16105 high_in2));
16106 }
16107
16108 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16109 emit_move_insn (gen_highpart (DImode, op0), high_dest);
16110
16111 }
16112
16113 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
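/* A sketch of the resulting mapping, assuming the usual ASan shadow
   granularity of 8 bytes (a shift of 3):

      shadow_address = (address >> 3) + (1 << 36).  */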
16114
16115 static unsigned HOST_WIDE_INT
16116 aarch64_asan_shadow_offset (void)
16117 {
16118 return (HOST_WIDE_INT_1 << 36);
16119 }
16120
16121 static rtx
16122 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16123 int code, tree treeop0, tree treeop1)
16124 {
16125 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16126 rtx op0, op1;
16127 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16128 insn_code icode;
16129 struct expand_operand ops[4];
16130
16131 start_sequence ();
16132 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16133
16134 op_mode = GET_MODE (op0);
16135 if (op_mode == VOIDmode)
16136 op_mode = GET_MODE (op1);
16137
16138 switch (op_mode)
16139 {
16140 case E_QImode:
16141 case E_HImode:
16142 case E_SImode:
16143 cmp_mode = SImode;
16144 icode = CODE_FOR_cmpsi;
16145 break;
16146
16147 case E_DImode:
16148 cmp_mode = DImode;
16149 icode = CODE_FOR_cmpdi;
16150 break;
16151
16152 case E_SFmode:
16153 cmp_mode = SFmode;
16154 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16155 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16156 break;
16157
16158 case E_DFmode:
16159 cmp_mode = DFmode;
16160 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16161 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16162 break;
16163
16164 default:
16165 end_sequence ();
16166 return NULL_RTX;
16167 }
16168
16169 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16170 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16171 if (!op0 || !op1)
16172 {
16173 end_sequence ();
16174 return NULL_RTX;
16175 }
16176 *prep_seq = get_insns ();
16177 end_sequence ();
16178
16179 create_fixed_operand (&ops[0], op0);
16180 create_fixed_operand (&ops[1], op1);
16181
16182 start_sequence ();
16183 if (!maybe_expand_insn (icode, 2, ops))
16184 {
16185 end_sequence ();
16186 return NULL_RTX;
16187 }
16188 *gen_seq = get_insns ();
16189 end_sequence ();
16190
16191 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16192 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16193 }
16194
16195 static rtx
16196 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16197 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16198 {
16199 rtx op0, op1, target;
16200 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16201 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16202 insn_code icode;
16203 struct expand_operand ops[6];
16204 int aarch64_cond;
16205
16206 push_to_sequence (*prep_seq);
16207 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16208
16209 op_mode = GET_MODE (op0);
16210 if (op_mode == VOIDmode)
16211 op_mode = GET_MODE (op1);
16212
16213 switch (op_mode)
16214 {
16215 case E_QImode:
16216 case E_HImode:
16217 case E_SImode:
16218 cmp_mode = SImode;
16219 icode = CODE_FOR_ccmpsi;
16220 break;
16221
16222 case E_DImode:
16223 cmp_mode = DImode;
16224 icode = CODE_FOR_ccmpdi;
16225 break;
16226
16227 case E_SFmode:
16228 cmp_mode = SFmode;
16229 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16230 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16231 break;
16232
16233 case E_DFmode:
16234 cmp_mode = DFmode;
16235 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16236 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16237 break;
16238
16239 default:
16240 end_sequence ();
16241 return NULL_RTX;
16242 }
16243
16244 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16245 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16246 if (!op0 || !op1)
16247 {
16248 end_sequence ();
16249 return NULL_RTX;
16250 }
16251 *prep_seq = get_insns ();
16252 end_sequence ();
16253
16254 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16255 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16256
16257 if (bit_code != AND)
16258 {
16259 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16260 GET_MODE (XEXP (prev, 0))),
16261 VOIDmode, XEXP (prev, 0), const0_rtx);
16262 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16263 }
16264
16265 create_fixed_operand (&ops[0], XEXP (prev, 0));
16266 create_fixed_operand (&ops[1], target);
16267 create_fixed_operand (&ops[2], op0);
16268 create_fixed_operand (&ops[3], op1);
16269 create_fixed_operand (&ops[4], prev);
16270 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16271
16272 push_to_sequence (*gen_seq);
16273 if (!maybe_expand_insn (icode, 6, ops))
16274 {
16275 end_sequence ();
16276 return NULL_RTX;
16277 }
16278
16279 *gen_seq = get_insns ();
16280 end_sequence ();
16281
16282 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16283 }
16284
16285 #undef TARGET_GEN_CCMP_FIRST
16286 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16287
16288 #undef TARGET_GEN_CCMP_NEXT
16289 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16290
16291 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
16292 instruction fusion of some sort. */
16293
16294 static bool
16295 aarch64_macro_fusion_p (void)
16296 {
16297 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16298 }
16299
16300
16301 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16302 should be kept together during scheduling. */
16303
16304 static bool
16305 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16306 {
16307 rtx set_dest;
16308 rtx prev_set = single_set (prev);
16309 rtx curr_set = single_set (curr);
16310 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16311 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16312
16313 if (!aarch64_macro_fusion_p ())
16314 return false;
16315
16316 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16317 {
16318 /* We are trying to match:
16319 prev (mov) == (set (reg r0) (const_int imm16))
16320 curr (movk) == (set (zero_extract (reg r0)
16321 (const_int 16)
16322 (const_int 16))
16323 (const_int imm16_1)) */
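/* E.g. (illustrative, hypothetical operands), the fused pair in assembly form:
   mov  w0, #0x1234
   movk w0, #0x5678, lsl #16  */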
16324
16325 set_dest = SET_DEST (curr_set);
16326
16327 if (GET_CODE (set_dest) == ZERO_EXTRACT
16328 && CONST_INT_P (SET_SRC (curr_set))
16329 && CONST_INT_P (SET_SRC (prev_set))
16330 && CONST_INT_P (XEXP (set_dest, 2))
16331 && INTVAL (XEXP (set_dest, 2)) == 16
16332 && REG_P (XEXP (set_dest, 0))
16333 && REG_P (SET_DEST (prev_set))
16334 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16335 {
16336 return true;
16337 }
16338 }
16339
16340 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16341 {
16342
16343 /* We're trying to match:
16344 prev (adrp) == (set (reg r1)
16345 (high (symbol_ref ("SYM"))))
16346 curr (add) == (set (reg r0)
16347 (lo_sum (reg r1)
16348 (symbol_ref ("SYM"))))
16349 Note that r0 need not necessarily be the same as r1, especially
16350 during pre-regalloc scheduling. */
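/* E.g. (illustrative):
   adrp x1, SYM
   add  x0, x1, :lo12:SYM  */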
16351
16352 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16353 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16354 {
16355 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16356 && REG_P (XEXP (SET_SRC (curr_set), 0))
16357 && REGNO (XEXP (SET_SRC (curr_set), 0))
16358 == REGNO (SET_DEST (prev_set))
16359 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16360 XEXP (SET_SRC (curr_set), 1)))
16361 return true;
16362 }
16363 }
16364
16365 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16366 {
16367
16368 /* We're trying to match:
16369 prev (movk) == (set (zero_extract (reg r0)
16370 (const_int 16)
16371 (const_int 32))
16372 (const_int imm16_1))
16373 curr (movk) == (set (zero_extract (reg r0)
16374 (const_int 16)
16375 (const_int 48))
16376 (const_int imm16_2)) */
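/* E.g. (illustrative, hypothetical immediates):
   movk x0, #0x1234, lsl #32
   movk x0, #0x5678, lsl #48  */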
16377
16378 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16379 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16380 && REG_P (XEXP (SET_DEST (prev_set), 0))
16381 && REG_P (XEXP (SET_DEST (curr_set), 0))
16382 && REGNO (XEXP (SET_DEST (prev_set), 0))
16383 == REGNO (XEXP (SET_DEST (curr_set), 0))
16384 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16385 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16386 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16387 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16388 && CONST_INT_P (SET_SRC (prev_set))
16389 && CONST_INT_P (SET_SRC (curr_set)))
16390 return true;
16391
16392 }
16393 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16394 {
16395 /* We're trying to match:
16396 prev (adrp) == (set (reg r0)
16397 (high (symbol_ref ("SYM"))))
16398 curr (ldr) == (set (reg r1)
16399 (mem (lo_sum (reg r0)
16400 (symbol_ref ("SYM")))))
16401 or
16402 curr (ldr) == (set (reg r1)
16403 (zero_extend (mem
16404 (lo_sum (reg r0)
16405 (symbol_ref ("SYM")))))) */
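/* E.g. (illustrative):
   adrp x0, SYM
   ldr  x1, [x0, :lo12:SYM]  */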
16406 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16407 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16408 {
16409 rtx curr_src = SET_SRC (curr_set);
16410
16411 if (GET_CODE (curr_src) == ZERO_EXTEND)
16412 curr_src = XEXP (curr_src, 0);
16413
16414 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16415 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16416 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16417 == REGNO (SET_DEST (prev_set))
16418 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16419 XEXP (SET_SRC (prev_set), 0)))
16420 return true;
16421 }
16422 }
16423
16424 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16425 && aarch_crypto_can_dual_issue (prev, curr))
16426 return true;
16427
16428 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16429 && any_condjump_p (curr))
16430 {
16431 enum attr_type prev_type = get_attr_type (prev);
16432
16433 unsigned int condreg1, condreg2;
16434 rtx cc_reg_1;
16435 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16436 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16437
16438 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16439 && prev
16440 && modified_in_p (cc_reg_1, prev))
16441 {
16442 /* FIXME: this misses some instructions that ThunderX considers simple
16443 arithmetic instructions. Simple shifts are missed here. */
16444 if (prev_type == TYPE_ALUS_SREG
16445 || prev_type == TYPE_ALUS_IMM
16446 || prev_type == TYPE_LOGICS_REG
16447 || prev_type == TYPE_LOGICS_IMM)
16448 return true;
16449 }
16450 }
16451
16452 if (prev_set
16453 && curr_set
16454 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16455 && any_condjump_p (curr))
16456 {
16457 /* We're trying to match:
16458 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16459 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16460 (const_int 0))
16461 (label_ref ("SYM"))
16462 (pc)) */
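/* E.g. (illustrative, hypothetical operands):
   add x0, x0, #1
   cbz x0, .Ltarget  */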
16463 if (SET_DEST (curr_set) == (pc_rtx)
16464 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16465 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16466 && REG_P (SET_DEST (prev_set))
16467 && REGNO (SET_DEST (prev_set))
16468 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16469 {
16470 /* Fuse ALU operations followed by conditional branch instruction. */
16471 switch (get_attr_type (prev))
16472 {
16473 case TYPE_ALU_IMM:
16474 case TYPE_ALU_SREG:
16475 case TYPE_ADC_REG:
16476 case TYPE_ADC_IMM:
16477 case TYPE_ADCS_REG:
16478 case TYPE_ADCS_IMM:
16479 case TYPE_LOGIC_REG:
16480 case TYPE_LOGIC_IMM:
16481 case TYPE_CSEL:
16482 case TYPE_ADR:
16483 case TYPE_MOV_IMM:
16484 case TYPE_SHIFT_REG:
16485 case TYPE_SHIFT_IMM:
16486 case TYPE_BFM:
16487 case TYPE_RBIT:
16488 case TYPE_REV:
16489 case TYPE_EXTEND:
16490 return true;
16491
16492 default:;
16493 }
16494 }
16495 }
16496
16497 return false;
16498 }
16499
16500 /* Return true iff the instruction fusion described by OP is enabled. */
16501
16502 bool
16503 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16504 {
16505 return (aarch64_tune_params.fusible_ops & op) != 0;
16506 }
16507
16508 /* If MEM is in the form of [base+offset], extract the two parts of the
16509 address, store them in BASE and OFFSET, and return true; otherwise
16510 return false after clearing BASE and OFFSET. */
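/* For example (illustrative), (mem (plus (reg x1) (const_int 16))) gives
BASE = x1 and OFFSET = 16, while a plain (mem (reg x1)) gives BASE = x1
and OFFSET = 0.  */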
16511
16512 bool
16513 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16514 {
16515 rtx addr;
16516
16517 gcc_assert (MEM_P (mem));
16518
16519 addr = XEXP (mem, 0);
16520
16521 if (REG_P (addr))
16522 {
16523 *base = addr;
16524 *offset = const0_rtx;
16525 return true;
16526 }
16527
16528 if (GET_CODE (addr) == PLUS
16529 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16530 {
16531 *base = XEXP (addr, 0);
16532 *offset = XEXP (addr, 1);
16533 return true;
16534 }
16535
16536 *base = NULL_RTX;
16537 *offset = NULL_RTX;
16538
16539 return false;
16540 }
16541
16542 /* Types for scheduling fusion. */
16543 enum sched_fusion_type
16544 {
16545 SCHED_FUSION_NONE = 0,
16546 SCHED_FUSION_LD_SIGN_EXTEND,
16547 SCHED_FUSION_LD_ZERO_EXTEND,
16548 SCHED_FUSION_LD,
16549 SCHED_FUSION_ST,
16550 SCHED_FUSION_NUM
16551 };
16552
16553 /* If INSN is a load or store whose address is in the form of [base+offset],
16554 extract the two parts into BASE and OFFSET. Return the scheduling
16555 fusion type of this INSN. */
16556
16557 static enum sched_fusion_type
16558 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16559 {
16560 rtx x, dest, src;
16561 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16562
16563 gcc_assert (INSN_P (insn));
16564 x = PATTERN (insn);
16565 if (GET_CODE (x) != SET)
16566 return SCHED_FUSION_NONE;
16567
16568 src = SET_SRC (x);
16569 dest = SET_DEST (x);
16570
16571 machine_mode dest_mode = GET_MODE (dest);
16572
16573 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16574 return SCHED_FUSION_NONE;
16575
16576 if (GET_CODE (src) == SIGN_EXTEND)
16577 {
16578 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16579 src = XEXP (src, 0);
16580 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16581 return SCHED_FUSION_NONE;
16582 }
16583 else if (GET_CODE (src) == ZERO_EXTEND)
16584 {
16585 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16586 src = XEXP (src, 0);
16587 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16588 return SCHED_FUSION_NONE;
16589 }
16590
16591 if (GET_CODE (src) == MEM && REG_P (dest))
16592 extract_base_offset_in_addr (src, base, offset);
16593 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16594 {
16595 fusion = SCHED_FUSION_ST;
16596 extract_base_offset_in_addr (dest, base, offset);
16597 }
16598 else
16599 return SCHED_FUSION_NONE;
16600
16601 if (*base == NULL_RTX || *offset == NULL_RTX)
16602 fusion = SCHED_FUSION_NONE;
16603
16604 return fusion;
16605 }
16606
16607 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16608
16609 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16610 and PRI are only calculated for these instructions. For other instructions,
16611 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16612 types of instruction fusion can be added by returning different priorities.
16613
16614 It's important that irrelevant instructions get the largest FUSION_PRI. */
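/* As an illustrative sketch: two loads such as ldr w0, [x1, 8] and
ldr w2, [x1, 12] are both classified as SCHED_FUSION_LD with base x1, so
they receive the same FUSION_PRI, while their PRIs differ only by their
offsets; the scheduler therefore tends to place them next to each other,
where the ldp peephole can pair them.  */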
16615
16616 static void
16617 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16618 int *fusion_pri, int *pri)
16619 {
16620 int tmp, off_val;
16621 rtx base, offset;
16622 enum sched_fusion_type fusion;
16623
16624 gcc_assert (INSN_P (insn));
16625
16626 tmp = max_pri - 1;
16627 fusion = fusion_load_store (insn, &base, &offset);
16628 if (fusion == SCHED_FUSION_NONE)
16629 {
16630 *pri = tmp;
16631 *fusion_pri = tmp;
16632 return;
16633 }
16634
16635 /* Set FUSION_PRI according to fusion type and base register. */
16636 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16637
16638 /* Calculate PRI. */
16639 tmp /= 2;
16640
16641 /* INSN with smaller offset goes first. */
16642 off_val = (int)(INTVAL (offset));
16643 if (off_val >= 0)
16644 tmp -= (off_val & 0xfffff);
16645 else
16646 tmp += ((- off_val) & 0xfffff);
16647
16648 *pri = tmp;
16649 return;
16650 }
16651
16652 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16653 Adjust priority of sha1h instructions so they are scheduled before
16654 other SHA1 instructions. */
16655
16656 static int
16657 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16658 {
16659 rtx x = PATTERN (insn);
16660
16661 if (GET_CODE (x) == SET)
16662 {
16663 x = SET_SRC (x);
16664
16665 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16666 return priority + 10;
16667 }
16668
16669 return priority;
16670 }
16671
16672 /* Given OPERANDS of consecutive load/store, check if we can merge
16673 them into ldp/stp. LOAD is true if they are load instructions.
16674 MODE is the mode of memory operands. */
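/* For example (illustrative), the consecutive loads
   ldr w0, [x2]
   ldr w1, [x2, 4]
can be merged into
   ldp w0, w1, [x2]
provided the checks below succeed.  */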
16675
16676 bool
16677 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16678 machine_mode mode)
16679 {
16680 HOST_WIDE_INT offval_1, offval_2, msize;
16681 enum reg_class rclass_1, rclass_2;
16682 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16683
16684 if (load)
16685 {
16686 mem_1 = operands[1];
16687 mem_2 = operands[3];
16688 reg_1 = operands[0];
16689 reg_2 = operands[2];
16690 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16691 if (REGNO (reg_1) == REGNO (reg_2))
16692 return false;
16693 }
16694 else
16695 {
16696 mem_1 = operands[0];
16697 mem_2 = operands[2];
16698 reg_1 = operands[1];
16699 reg_2 = operands[3];
16700 }
16701
16702 /* The mems cannot be volatile. */
16703 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16704 return false;
16705
16706 /* If we have SImode and slow unaligned ldp,
16707 check that the alignment is at least 8 bytes. */
16708 if (mode == SImode
16709 && (aarch64_tune_params.extra_tuning_flags
16710 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16711 && !optimize_size
16712 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16713 return false;
16714
16715 /* Check if the addresses are in the form of [base+offset]. */
16716 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16717 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16718 return false;
16719 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16720 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16721 return false;
16722
16723 /* Check if the bases are the same. */
16724 if (!rtx_equal_p (base_1, base_2))
16725 return false;
16726
16727 /* The operands must be of the same size. */
16728 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16729 GET_MODE_SIZE (GET_MODE (mem_2))));
16730
16731 offval_1 = INTVAL (offset_1);
16732 offval_2 = INTVAL (offset_2);
16733 /* We should only be trying this for fixed-sized modes. There is no
16734 SVE LDP/STP instruction. */
16735 msize = GET_MODE_SIZE (mode).to_constant ();
16736 /* Check if the offsets are consecutive. */
16737 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16738 return false;
16739
16740 /* Check if the addresses are clobbered by load. */
16741 if (load)
16742 {
16743 if (reg_mentioned_p (reg_1, mem_1))
16744 return false;
16745
16746 /* In increasing order, the last load can clobber the address. */
16747 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16748 return false;
16749 }
16750
16751 /* One of the memory accesses must be a mempair operand.
16752 If it is not the first one, they need to be swapped by the
16753 peephole. */
16754 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16755 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16756 return false;
16757
16758 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16759 rclass_1 = FP_REGS;
16760 else
16761 rclass_1 = GENERAL_REGS;
16762
16763 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16764 rclass_2 = FP_REGS;
16765 else
16766 rclass_2 = GENERAL_REGS;
16767
16768 /* Check if the registers are of the same class. */
16769 if (rclass_1 != rclass_2)
16770 return false;
16771
16772 return true;
16773 }
16774
16775 /* Given OPERANDS of consecutive load/store that can be merged,
16776 swap them if they are not in ascending order. */
16777 void
16778 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16779 {
16780 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16781 HOST_WIDE_INT offval_1, offval_2;
16782
16783 if (load)
16784 {
16785 mem_1 = operands[1];
16786 mem_2 = operands[3];
16787 }
16788 else
16789 {
16790 mem_1 = operands[0];
16791 mem_2 = operands[2];
16792 }
16793
16794 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16795 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16796
16797 offval_1 = INTVAL (offset_1);
16798 offval_2 = INTVAL (offset_2);
16799
16800 if (offval_1 > offval_2)
16801 {
16802 /* Irrespective of whether this is a load or a store,
16803 we do the same swap. */
16804 std::swap (operands[0], operands[2]);
16805 std::swap (operands[1], operands[3]);
16806 }
16807 }
16808
16809 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16810 comparison between the two. */
16811 int
16812 aarch64_host_wide_int_compare (const void *x, const void *y)
16813 {
16814 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16815 * ((const HOST_WIDE_INT *) y));
16816 }
16817
16818 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16819 other pointing to a REG rtx containing an offset, compare the offsets
16820 of the two pairs.
16821
16822 Return:
16823
16824 1 iff offset (X) > offset (Y)
16825 0 iff offset (X) == offset (Y)
16826 -1 iff offset (X) < offset (Y) */
16827 int
16828 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16829 {
16830 const rtx * operands_1 = (const rtx *) x;
16831 const rtx * operands_2 = (const rtx *) y;
16832 rtx mem_1, mem_2, base, offset_1, offset_2;
16833
16834 if (MEM_P (operands_1[0]))
16835 mem_1 = operands_1[0];
16836 else
16837 mem_1 = operands_1[1];
16838
16839 if (MEM_P (operands_2[0]))
16840 mem_2 = operands_2[0];
16841 else
16842 mem_2 = operands_2[1];
16843
16844 /* Extract the offsets. */
16845 extract_base_offset_in_addr (mem_1, &base, &offset_1);
16846 extract_base_offset_in_addr (mem_2, &base, &offset_2);
16847
16848 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
16849
16850 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
16851 }
16852
16853 /* Given OPERANDS of consecutive load/store, check if we can merge
16854 them into ldp/stp by adjusting the offset. LOAD is true if they
16855 are load instructions. MODE is the mode of memory operands.
16856
16857 Given below consecutive stores:
16858
16859 str w1, [xb, 0x100]
16860 str w1, [xb, 0x104]
16861 str w1, [xb, 0x108]
16862 str w1, [xb, 0x10c]
16863
16864 Though the offsets are out of the range supported by stp, we can
16865 still pair them after adjusting the offset, like:
16866
16867 add scratch, xb, 0x100
16868 stp w1, w1, [scratch]
16869 stp w1, w1, [scratch, 0x8]
16870
16871 The peephole patterns detecting this opportunity should guarantee
16872 the scratch register is available. */
16873
16874 bool
16875 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16876 scalar_mode mode)
16877 {
16878 const int num_insns = 4;
16879 enum reg_class rclass;
16880 HOST_WIDE_INT offvals[num_insns], msize;
16881 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
16882
16883 if (load)
16884 {
16885 for (int i = 0; i < num_insns; i++)
16886 {
16887 reg[i] = operands[2 * i];
16888 mem[i] = operands[2 * i + 1];
16889
16890 gcc_assert (REG_P (reg[i]));
16891 }
16892
16893 /* Do not attempt to merge the loads if the loads clobber each other. */
16894 for (int i = 0; i < 8; i += 2)
16895 for (int j = i + 2; j < 8; j += 2)
16896 if (reg_overlap_mentioned_p (operands[i], operands[j]))
16897 return false;
16898 }
16899 else
16900 for (int i = 0; i < num_insns; i++)
16901 {
16902 mem[i] = operands[2 * i];
16903 reg[i] = operands[2 * i + 1];
16904 }
16905
16906 /* Skip if memory operand is by itself valid for ldp/stp. */
16907 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
16908 return false;
16909
16910 for (int i = 0; i < num_insns; i++)
16911 {
16912 /* The mems cannot be volatile. */
16913 if (MEM_VOLATILE_P (mem[i]))
16914 return false;
16915
16916 /* Check if the addresses are in the form of [base+offset]. */
16917 extract_base_offset_in_addr (mem[i], base + i, offset + i);
16918 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
16919 return false;
16920 }
16921
16922 /* Check if the registers are of the same class. */
16923 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
16924 ? FP_REGS : GENERAL_REGS;
16925
16926 for (int i = 1; i < num_insns; i++)
16927 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
16928 {
16929 if (rclass != FP_REGS)
16930 return false;
16931 }
16932 else
16933 {
16934 if (rclass != GENERAL_REGS)
16935 return false;
16936 }
16937
16938 /* Only the last register in the order in which they occur
16939 may be clobbered by the load. */
16940 if (rclass == GENERAL_REGS && load)
16941 for (int i = 0; i < num_insns - 1; i++)
16942 if (reg_mentioned_p (reg[i], mem[i]))
16943 return false;
16944
16945 /* Check if the bases are the same. */
16946 for (int i = 0; i < num_insns - 1; i++)
16947 if (!rtx_equal_p (base[i], base[i + 1]))
16948 return false;
16949
16950 for (int i = 0; i < num_insns; i++)
16951 offvals[i] = INTVAL (offset[i]);
16952
16953 msize = GET_MODE_SIZE (mode);
16954
16955 /* Check if the offsets can be put in the right order to do a ldp/stp. */
16956 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
16957 aarch64_host_wide_int_compare);
16958
16959 if (!(offvals[1] == offvals[0] + msize
16960 && offvals[3] == offvals[2] + msize))
16961 return false;
16962
16963 /* Check that offsets are within range of each other. The ldp/stp
16964 instructions have 7-bit immediate offsets, so use 0x80. */
16965 if (offvals[2] - offvals[0] >= msize * 0x80)
16966 return false;
16967
16968 /* The offsets must be aligned with respect to each other. */
16969 if (offvals[0] % msize != offvals[2] % msize)
16970 return false;
16971
16972 /* If we have SImode and slow unaligned ldp,
16973 check that the alignment is at least 8 bytes. */
16974 if (mode == SImode
16975 && (aarch64_tune_params.extra_tuning_flags
16976 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16977 && !optimize_size
16978 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
16979 return false;
16980
16981 return true;
16982 }
16983
16984 /* Given OPERANDS of consecutive load/store, this function pairs them
16985 into LDP/STP after adjusting the offset. It depends on the fact
16986 that the operands can be sorted so the offsets are correct for STP.
16987 MODE is the mode of memory operands. CODE is the rtl operator
16988 which should be applied to all memory operands, it's SIGN_EXTEND,
16989 ZERO_EXTEND or UNKNOWN. */
16990
16991 bool
16992 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16993 scalar_mode mode, RTX_CODE code)
16994 {
16995 rtx base, offset_1, offset_3, t1, t2;
16996 rtx mem_1, mem_2, mem_3, mem_4;
16997 rtx temp_operands[8];
16998 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
16999 stp_off_upper_limit, stp_off_lower_limit, msize;
17000
17001 /* We make changes on a copy as we may still bail out. */
17002 for (int i = 0; i < 8; i ++)
17003 temp_operands[i] = operands[i];
17004
17005 /* Sort the operands. */
17006 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17007
17008 if (load)
17009 {
17010 mem_1 = temp_operands[1];
17011 mem_2 = temp_operands[3];
17012 mem_3 = temp_operands[5];
17013 mem_4 = temp_operands[7];
17014 }
17015 else
17016 {
17017 mem_1 = temp_operands[0];
17018 mem_2 = temp_operands[2];
17019 mem_3 = temp_operands[4];
17020 mem_4 = temp_operands[6];
17021 gcc_assert (code == UNKNOWN);
17022 }
17023
17024 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17025 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17026 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17027 && offset_3 != NULL_RTX);
17028
17029 /* Adjust offset so it can fit in LDP/STP instruction. */
17030 msize = GET_MODE_SIZE (mode);
17031 stp_off_upper_limit = msize * (0x40 - 1);
17032 stp_off_lower_limit = - msize * 0x40;
17033
17034 off_val_1 = INTVAL (offset_1);
17035 off_val_3 = INTVAL (offset_3);
17036
17037 /* The base offset is optimally halfway between the two STP/LDP offsets. */
17038 if (msize <= 4)
17039 base_off = (off_val_1 + off_val_3) / 2;
17040 else
17041 /* However, due to issues with negative LDP/STP offset generation for
17042 larger modes (DF, DI and vector modes), we must not use negative
17043 addresses smaller than 9 signed unadjusted bits can store. This
17044 provides the most range in this case. */
17045 base_off = off_val_1;
17046
17047 /* Adjust the base so that it is aligned with the addresses but still
17048 optimal. */
17049 if (base_off % msize != off_val_1 % msize)
17050 /* Fix the offset, bearing in mind we want to make it bigger not
17051 smaller. */
17052 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17053 else if (msize <= 4)
17054 /* The negative range of LDP/STP is one larger than the positive range. */
17055 base_off += msize;
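/* A rough worked example (illustrative), using the SImode stores from the
comment above aarch64_operands_adjust_ok_for_ldpstp (msize == 4,
off_val_1 == 0x100, off_val_3 == 0x108): base_off starts as
(0x100 + 0x108) / 2 == 0x104, which already agrees with off_val_1 modulo
msize, so it is bumped by msize to 0x108, giving new offsets of -8 and 0,
both well inside the [-0x100, 0xfc] range of stp for 4-byte accesses.  */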
17056
17057 /* Check if base offset is too big or too small. We can attempt to resolve
17058 this issue by setting it to the maximum value and seeing if the offsets
17059 still fit. */
17060 if (base_off >= 0x1000)
17061 {
17062 base_off = 0x1000 - 1;
17063 /* We must still make sure that the base offset is aligned with respect
17064 to the address, but it may not be made any bigger. */
17065 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17066 }
17067
17068 /* Likewise for the case where the base is too small. */
17069 if (base_off <= -0x1000)
17070 {
17071 base_off = -0x1000 + 1;
17072 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17073 }
17074
17075 /* Offset of the first STP/LDP. */
17076 new_off_1 = off_val_1 - base_off;
17077
17078 /* Offset of the second STP/LDP. */
17079 new_off_3 = off_val_3 - base_off;
17080
17081 /* The offsets must be within the range of the LDP/STP instructions. */
17082 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17083 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17084 return false;
17085
17086 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17087 new_off_1), true);
17088 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17089 new_off_1 + msize), true);
17090 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17091 new_off_3), true);
17092 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17093 new_off_3 + msize), true);
17094
17095 if (!aarch64_mem_pair_operand (mem_1, mode)
17096 || !aarch64_mem_pair_operand (mem_3, mode))
17097 return false;
17098
17099 if (code == ZERO_EXTEND)
17100 {
17101 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17102 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17103 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17104 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17105 }
17106 else if (code == SIGN_EXTEND)
17107 {
17108 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17109 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17110 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17111 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17112 }
17113
17114 if (load)
17115 {
17116 operands[0] = temp_operands[0];
17117 operands[1] = mem_1;
17118 operands[2] = temp_operands[2];
17119 operands[3] = mem_2;
17120 operands[4] = temp_operands[4];
17121 operands[5] = mem_3;
17122 operands[6] = temp_operands[6];
17123 operands[7] = mem_4;
17124 }
17125 else
17126 {
17127 operands[0] = mem_1;
17128 operands[1] = temp_operands[1];
17129 operands[2] = mem_2;
17130 operands[3] = temp_operands[3];
17131 operands[4] = mem_3;
17132 operands[5] = temp_operands[5];
17133 operands[6] = mem_4;
17134 operands[7] = temp_operands[7];
17135 }
17136
17137 /* Emit adjusting instruction. */
17138 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17139 /* Emit ldp/stp instructions. */
17140 t1 = gen_rtx_SET (operands[0], operands[1]);
17141 t2 = gen_rtx_SET (operands[2], operands[3]);
17142 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17143 t1 = gen_rtx_SET (operands[4], operands[5]);
17144 t2 = gen_rtx_SET (operands[6], operands[7]);
17145 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17146 return true;
17147 }
17148
17149 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17150 it isn't worth branching around empty masked ops (including masked
17151 stores). */
17152
17153 static bool
17154 aarch64_empty_mask_is_expensive (unsigned)
17155 {
17156 return false;
17157 }
17158
17159 /* Return true if a pseudo register should be created and used to hold
17160 the GOT address for PIC code. */
17161
17162 bool
17163 aarch64_use_pseudo_pic_reg (void)
17164 {
17165 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17166 }
17167
17168 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17169
17170 static int
17171 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17172 {
17173 switch (XINT (x, 1))
17174 {
17175 case UNSPEC_GOTSMALLPIC:
17176 case UNSPEC_GOTSMALLPIC28K:
17177 case UNSPEC_GOTTINYPIC:
17178 return 0;
17179 default:
17180 break;
17181 }
17182
17183 return default_unspec_may_trap_p (x, flags);
17184 }
17185
17186
17187 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17188 return the log2 of that value. Otherwise return -1. */
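/* For instance (illustrative): 1.0 -> 0, 8.0 -> 3, while 0.5, 3.0 and -4.0
all yield -1.  */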
17189
17190 int
17191 aarch64_fpconst_pow_of_2 (rtx x)
17192 {
17193 const REAL_VALUE_TYPE *r;
17194
17195 if (!CONST_DOUBLE_P (x))
17196 return -1;
17197
17198 r = CONST_DOUBLE_REAL_VALUE (x);
17199
17200 if (REAL_VALUE_NEGATIVE (*r)
17201 || REAL_VALUE_ISNAN (*r)
17202 || REAL_VALUE_ISINF (*r)
17203 || !real_isinteger (r, DFmode))
17204 return -1;
17205
17206 return exact_log2 (real_to_integer (r));
17207 }
17208
17209 /* If X is a vector of equal CONST_DOUBLE values and that value is
17210 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17211
17212 int
17213 aarch64_vec_fpconst_pow_of_2 (rtx x)
17214 {
17215 int nelts;
17216 if (GET_CODE (x) != CONST_VECTOR
17217 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17218 return -1;
17219
17220 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17221 return -1;
17222
17223 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17224 if (firstval <= 0)
17225 return -1;
17226
17227 for (int i = 1; i < nelts; i++)
17228 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17229 return -1;
17230
17231 return firstval;
17232 }
17233
17234 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17235 to float.
17236
17237 __fp16 always promotes through this hook.
17238 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17239 through the generic excess precision logic rather than here. */
17240
17241 static tree
17242 aarch64_promoted_type (const_tree t)
17243 {
17244 if (SCALAR_FLOAT_TYPE_P (t)
17245 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17246 return float_type_node;
17247
17248 return NULL_TREE;
17249 }
17250
17251 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17252
17253 static bool
17254 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17255 optimization_type opt_type)
17256 {
17257 switch (op)
17258 {
17259 case rsqrt_optab:
17260 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17261
17262 default:
17263 return true;
17264 }
17265 }
17266
17267 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17268
17269 static unsigned int
17270 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17271 int *offset)
17272 {
17273 /* Polynomial invariant 1 == (VG / 2) - 1. */
17274 gcc_assert (i == 1);
17275 *factor = 2;
17276 *offset = 1;
17277 return AARCH64_DWARF_VG;
17278 }
17279
17280 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17281 if MODE is HFmode, and punt to the generic implementation otherwise. */
17282
17283 static bool
17284 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17285 {
17286 return (mode == HFmode
17287 ? true
17288 : default_libgcc_floating_mode_supported_p (mode));
17289 }
17290
17291 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17292 if MODE is HFmode, and punt to the generic implementation otherwise. */
17293
17294 static bool
17295 aarch64_scalar_mode_supported_p (scalar_mode mode)
17296 {
17297 return (mode == HFmode
17298 ? true
17299 : default_scalar_mode_supported_p (mode));
17300 }
17301
17302 /* Set the value of FLT_EVAL_METHOD.
17303 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17304
17305 0: evaluate all operations and constants, whose semantic type has at
17306 most the range and precision of type float, to the range and
17307 precision of float; evaluate all other operations and constants to
17308 the range and precision of the semantic type;
17309
17310 N, where _FloatN is a supported interchange floating type:
17311 evaluate all operations and constants, whose semantic type has at
17312 most the range and precision of _FloatN type, to the range and
17313 precision of the _FloatN type; evaluate all other operations and
17314 constants to the range and precision of the semantic type;
17315
17316 If we have the ARMv8.2-A extensions then we support _Float16 in native
17317 precision, so we should set this to 16. Otherwise, we support the type,
17318 but want to evaluate expressions in float precision, so set this to
17319 0. */
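/* As an illustrative example, given
     _Float16 a, b, c, d;
     d = a * b + c;
with the ARMv8.2-A FP16 instructions available the arithmetic is performed
directly in _Float16 (FLT_EVAL_METHOD == 16), whereas without them the
operands are promoted and the computation is carried out in float.  */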
17320
17321 static enum flt_eval_method
17322 aarch64_excess_precision (enum excess_precision_type type)
17323 {
17324 switch (type)
17325 {
17326 case EXCESS_PRECISION_TYPE_FAST:
17327 case EXCESS_PRECISION_TYPE_STANDARD:
17328 /* We can calculate either in 16-bit range and precision or
17329 32-bit range and precision. Make that decision based on whether
17330 we have native support for the ARMv8.2-A 16-bit floating-point
17331 instructions or not. */
17332 return (TARGET_FP_F16INST
17333 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17334 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17335 case EXCESS_PRECISION_TYPE_IMPLICIT:
17336 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17337 default:
17338 gcc_unreachable ();
17339 }
17340 return FLT_EVAL_METHOD_UNPREDICTABLE;
17341 }
17342
17343 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17344 scheduled for speculative execution. Reject the long-running division
17345 and square-root instructions. */
17346
17347 static bool
17348 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17349 {
17350 switch (get_attr_type (insn))
17351 {
17352 case TYPE_SDIV:
17353 case TYPE_UDIV:
17354 case TYPE_FDIVS:
17355 case TYPE_FDIVD:
17356 case TYPE_FSQRTS:
17357 case TYPE_FSQRTD:
17358 case TYPE_NEON_FP_SQRT_S:
17359 case TYPE_NEON_FP_SQRT_D:
17360 case TYPE_NEON_FP_SQRT_S_Q:
17361 case TYPE_NEON_FP_SQRT_D_Q:
17362 case TYPE_NEON_FP_DIV_S:
17363 case TYPE_NEON_FP_DIV_D:
17364 case TYPE_NEON_FP_DIV_S_Q:
17365 case TYPE_NEON_FP_DIV_D_Q:
17366 return false;
17367 default:
17368 return true;
17369 }
17370 }
17371
17372 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17373
17374 static int
17375 aarch64_compute_pressure_classes (reg_class *classes)
17376 {
17377 int i = 0;
17378 classes[i++] = GENERAL_REGS;
17379 classes[i++] = FP_REGS;
17380 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17381 registers need to go in PR_LO_REGS at some point during their
17382 lifetime. Splitting it into two halves has the effect of making
17383 all predicates count against PR_LO_REGS, so that we try whenever
17384 possible to restrict the number of live predicates to 8. This
17385 greatly reduces the amount of spilling in certain loops. */
17386 classes[i++] = PR_LO_REGS;
17387 classes[i++] = PR_HI_REGS;
17388 return i;
17389 }
17390
17391 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17392
17393 static bool
17394 aarch64_can_change_mode_class (machine_mode from,
17395 machine_mode to, reg_class_t)
17396 {
17397 if (BYTES_BIG_ENDIAN)
17398 {
17399 bool from_sve_p = aarch64_sve_data_mode_p (from);
17400 bool to_sve_p = aarch64_sve_data_mode_p (to);
17401
17402 /* Don't allow changes between SVE data modes and non-SVE modes.
17403 See the comment at the head of aarch64-sve.md for details. */
17404 if (from_sve_p != to_sve_p)
17405 return false;
17406
17407 /* Don't allow changes in element size: lane 0 of the new vector
17408 would not then be lane 0 of the old vector. See the comment
17409 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17410 description.
17411
17412 In the worst case, this forces a register to be spilled in
17413 one mode and reloaded in the other, which handles the
17414 endianness correctly. */
17415 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17416 return false;
17417 }
17418 return true;
17419 }
17420
17421 /* Implement TARGET_EARLY_REMAT_MODES. */
17422
17423 static void
17424 aarch64_select_early_remat_modes (sbitmap modes)
17425 {
17426 /* SVE values are not normally live across a call, so it should be
17427 worth doing early rematerialization even in VL-specific mode. */
17428 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17429 {
17430 machine_mode mode = (machine_mode) i;
17431 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17432 if (vec_flags & VEC_ANY_SVE)
17433 bitmap_set_bit (modes, i);
17434 }
17435 }
17436
17437 /* Override the default target speculation_safe_value. */
17438 static rtx
17439 aarch64_speculation_safe_value (machine_mode mode,
17440 rtx result, rtx val, rtx failval)
17441 {
17442 /* Maybe we should warn if falling back to hard barriers. They are
17443 likely to be noticeably more expensive than the alternative below. */
17444 if (!aarch64_track_speculation)
17445 return default_speculation_safe_value (mode, result, val, failval);
17446
17447 if (!REG_P (val))
17448 val = copy_to_mode_reg (mode, val);
17449
17450 if (!aarch64_reg_or_zero (failval, mode))
17451 failval = copy_to_mode_reg (mode, failval);
17452
17453 switch (mode)
17454 {
17455 case E_QImode:
17456 emit_insn (gen_despeculate_copyqi (result, val, failval));
17457 break;
17458 case E_HImode:
17459 emit_insn (gen_despeculate_copyhi (result, val, failval));
17460 break;
17461 case E_SImode:
17462 emit_insn (gen_despeculate_copysi (result, val, failval));
17463 break;
17464 case E_DImode:
17465 emit_insn (gen_despeculate_copydi (result, val, failval));
17466 break;
17467 case E_TImode:
17468 emit_insn (gen_despeculate_copyti (result, val, failval));
17469 break;
17470 default:
17471 gcc_unreachable ();
17472 }
17473 return result;
17474 }
17475
17476 /* Target-specific selftests. */
17477
17478 #if CHECKING_P
17479
17480 namespace selftest {
17481
17482 /* Selftest for the RTL loader.
17483 Verify that the RTL loader copes with a dump from
17484 print_rtx_function. This is essentially just a test that class
17485 function_reader can handle a real dump, but it also verifies
17486 that lookup_reg_by_dump_name correctly handles hard regs.
17487 The presence of hard reg names in the dump means that the test is
17488 target-specific, hence it is in this file. */
17489
17490 static void
17491 aarch64_test_loading_full_dump ()
17492 {
17493 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17494
17495 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17496
17497 rtx_insn *insn_1 = get_insn_by_uid (1);
17498 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17499
17500 rtx_insn *insn_15 = get_insn_by_uid (15);
17501 ASSERT_EQ (INSN, GET_CODE (insn_15));
17502 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17503
17504 /* Verify crtl->return_rtx. */
17505 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17506 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17507 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17508 }
17509
17510 /* Run all target-specific selftests. */
17511
17512 static void
17513 aarch64_run_selftests (void)
17514 {
17515 aarch64_test_loading_full_dump ();
17516 }
17517
17518 } // namespace selftest
17519
17520 #endif /* #if CHECKING_P */
17521
17522 #undef TARGET_ADDRESS_COST
17523 #define TARGET_ADDRESS_COST aarch64_address_cost
17524
17525 /* This hook determines whether unnamed bitfields affect the alignment
17526 of the containing structure. The hook returns true if the structure
17527 should inherit the alignment requirements of an unnamed bitfield's
17528 type. */
17529 #undef TARGET_ALIGN_ANON_BITFIELD
17530 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17531
17532 #undef TARGET_ASM_ALIGNED_DI_OP
17533 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17534
17535 #undef TARGET_ASM_ALIGNED_HI_OP
17536 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17537
17538 #undef TARGET_ASM_ALIGNED_SI_OP
17539 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17540
17541 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17542 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17543 hook_bool_const_tree_hwi_hwi_const_tree_true
17544
17545 #undef TARGET_ASM_FILE_START
17546 #define TARGET_ASM_FILE_START aarch64_start_file
17547
17548 #undef TARGET_ASM_OUTPUT_MI_THUNK
17549 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17550
17551 #undef TARGET_ASM_SELECT_RTX_SECTION
17552 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17553
17554 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17555 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17556
17557 #undef TARGET_BUILD_BUILTIN_VA_LIST
17558 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17559
17560 #undef TARGET_CALLEE_COPIES
17561 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17562
17563 #undef TARGET_CAN_ELIMINATE
17564 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17565
17566 #undef TARGET_CAN_INLINE_P
17567 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17568
17569 #undef TARGET_CANNOT_FORCE_CONST_MEM
17570 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17571
17572 #undef TARGET_CASE_VALUES_THRESHOLD
17573 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17574
17575 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17576 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17577
17578 /* Only the least significant bit is used for initialization guard
17579 variables. */
17580 #undef TARGET_CXX_GUARD_MASK_BIT
17581 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17582
17583 #undef TARGET_C_MODE_FOR_SUFFIX
17584 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17585
17586 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17587 #undef TARGET_DEFAULT_TARGET_FLAGS
17588 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17589 #endif
17590
17591 #undef TARGET_CLASS_MAX_NREGS
17592 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17593
17594 #undef TARGET_BUILTIN_DECL
17595 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17596
17597 #undef TARGET_BUILTIN_RECIPROCAL
17598 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17599
17600 #undef TARGET_C_EXCESS_PRECISION
17601 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17602
17603 #undef TARGET_EXPAND_BUILTIN
17604 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17605
17606 #undef TARGET_EXPAND_BUILTIN_VA_START
17607 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17608
17609 #undef TARGET_FOLD_BUILTIN
17610 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17611
17612 #undef TARGET_FUNCTION_ARG
17613 #define TARGET_FUNCTION_ARG aarch64_function_arg
17614
17615 #undef TARGET_FUNCTION_ARG_ADVANCE
17616 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17617
17618 #undef TARGET_FUNCTION_ARG_BOUNDARY
17619 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17620
17621 #undef TARGET_FUNCTION_ARG_PADDING
17622 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17623
17624 #undef TARGET_GET_RAW_RESULT_MODE
17625 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17626 #undef TARGET_GET_RAW_ARG_MODE
17627 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17628
17629 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17630 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17631
17632 #undef TARGET_FUNCTION_VALUE
17633 #define TARGET_FUNCTION_VALUE aarch64_function_value
17634
17635 #undef TARGET_FUNCTION_VALUE_REGNO_P
17636 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17637
17638 #undef TARGET_GIMPLE_FOLD_BUILTIN
17639 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17640
17641 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17642 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17643
17644 #undef TARGET_INIT_BUILTINS
17645 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17646
17647 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17648 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17649 aarch64_ira_change_pseudo_allocno_class
17650
17651 #undef TARGET_LEGITIMATE_ADDRESS_P
17652 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17653
17654 #undef TARGET_LEGITIMATE_CONSTANT_P
17655 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17656
17657 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17658 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17659 aarch64_legitimize_address_displacement
17660
17661 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17662 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17663
17664 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17665 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17666 aarch64_libgcc_floating_mode_supported_p
17667
17668 #undef TARGET_MANGLE_TYPE
17669 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17670
17671 #undef TARGET_MEMORY_MOVE_COST
17672 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17673
17674 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17675 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17676
17677 #undef TARGET_MUST_PASS_IN_STACK
17678 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17679
17680 /* This target hook should return true if accesses to volatile bitfields
17681 should use the narrowest mode possible. It should return false if these
17682 accesses should use the bitfield container type. */
17683 #undef TARGET_NARROW_VOLATILE_BITFIELD
17684 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17685
17686 #undef TARGET_OPTION_OVERRIDE
17687 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17688
17689 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17690 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17691 aarch64_override_options_after_change
17692
17693 #undef TARGET_OPTION_SAVE
17694 #define TARGET_OPTION_SAVE aarch64_option_save
17695
17696 #undef TARGET_OPTION_RESTORE
17697 #define TARGET_OPTION_RESTORE aarch64_option_restore
17698
17699 #undef TARGET_OPTION_PRINT
17700 #define TARGET_OPTION_PRINT aarch64_option_print
17701
17702 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17703 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17704
17705 #undef TARGET_SET_CURRENT_FUNCTION
17706 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17707
17708 #undef TARGET_PASS_BY_REFERENCE
17709 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17710
17711 #undef TARGET_PREFERRED_RELOAD_CLASS
17712 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17713
17714 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17715 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17716
17717 #undef TARGET_PROMOTED_TYPE
17718 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17719
17720 #undef TARGET_SECONDARY_RELOAD
17721 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17722
17723 #undef TARGET_SHIFT_TRUNCATION_MASK
17724 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17725
17726 #undef TARGET_SETUP_INCOMING_VARARGS
17727 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17728
17729 #undef TARGET_STRUCT_VALUE_RTX
17730 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17731
17732 #undef TARGET_REGISTER_MOVE_COST
17733 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17734
17735 #undef TARGET_RETURN_IN_MEMORY
17736 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17737
17738 #undef TARGET_RETURN_IN_MSB
17739 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17740
17741 #undef TARGET_RTX_COSTS
17742 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17743
17744 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17745 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17746
17747 #undef TARGET_SCHED_ISSUE_RATE
17748 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17749
17750 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17751 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17752 aarch64_sched_first_cycle_multipass_dfa_lookahead
17753
17754 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17755 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17756 aarch64_first_cycle_multipass_dfa_lookahead_guard
17757
17758 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17759 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17760 aarch64_get_separate_components
17761
17762 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17763 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17764 aarch64_components_for_bb
17765
17766 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17767 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17768 aarch64_disqualify_components
17769
17770 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17771 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17772 aarch64_emit_prologue_components
17773
17774 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17775 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17776 aarch64_emit_epilogue_components
17777
17778 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17779 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17780 aarch64_set_handled_components
17781
17782 #undef TARGET_TRAMPOLINE_INIT
17783 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17784
17785 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17786 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17787
17788 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17789 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17790
17791 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17792 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17793 aarch64_builtin_support_vector_misalignment
17794
17795 #undef TARGET_ARRAY_MODE
17796 #define TARGET_ARRAY_MODE aarch64_array_mode
17797
17798 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17799 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17800
17801 #undef TARGET_VECTORIZE_ADD_STMT_COST
17802 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17803
17804 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17805 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17806 aarch64_builtin_vectorization_cost
17807
17808 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17809 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17810
17811 #undef TARGET_VECTORIZE_BUILTINS
17812 #define TARGET_VECTORIZE_BUILTINS
17813
17814 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17815 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17816 aarch64_builtin_vectorized_function
17817
17818 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17819 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17820 aarch64_autovectorize_vector_sizes
17821
17822 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17823 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17824 aarch64_atomic_assign_expand_fenv
17825
17826 /* Section anchor support. */
17827
17828 #undef TARGET_MIN_ANCHOR_OFFSET
17829 #define TARGET_MIN_ANCHOR_OFFSET -256
17830
17831 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17832 byte offset; we can do much more for larger data types, but have no way
17833 to determine the size of the access. We assume accesses are aligned. */
17834 #undef TARGET_MAX_ANCHOR_OFFSET
17835 #define TARGET_MAX_ANCHOR_OFFSET 4095
17836
17837 #undef TARGET_VECTOR_ALIGNMENT
17838 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17839
17840 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17841 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17842 aarch64_vectorize_preferred_vector_alignment
17843 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17844 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17845 aarch64_simd_vector_alignment_reachable
17846
17847 /* vec_perm support. */
17848
17849 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17850 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17851 aarch64_vectorize_vec_perm_const
17852
17853 #undef TARGET_VECTORIZE_GET_MASK_MODE
17854 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17855 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17856 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17857 aarch64_empty_mask_is_expensive
17858 #undef TARGET_PREFERRED_ELSE_VALUE
17859 #define TARGET_PREFERRED_ELSE_VALUE \
17860 aarch64_preferred_else_value
17861
17862 #undef TARGET_INIT_LIBFUNCS
17863 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17864
17865 #undef TARGET_FIXED_CONDITION_CODE_REGS
17866 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17867
17868 #undef TARGET_FLAGS_REGNUM
17869 #define TARGET_FLAGS_REGNUM CC_REGNUM
17870
17871 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17872 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17873
17874 #undef TARGET_ASAN_SHADOW_OFFSET
17875 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17876
17877 #undef TARGET_LEGITIMIZE_ADDRESS
17878 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17879
17880 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17881 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17882
17883 #undef TARGET_CAN_USE_DOLOOP_P
17884 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17885
17886 #undef TARGET_SCHED_ADJUST_PRIORITY
17887 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17888
17889 #undef TARGET_SCHED_MACRO_FUSION_P
17890 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17891
17892 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17893 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17894
17895 #undef TARGET_SCHED_FUSION_PRIORITY
17896 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17897
17898 #undef TARGET_UNSPEC_MAY_TRAP_P
17899 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17900
17901 #undef TARGET_USE_PSEUDO_PIC_REG
17902 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17903
17904 #undef TARGET_PRINT_OPERAND
17905 #define TARGET_PRINT_OPERAND aarch64_print_operand
17906
17907 #undef TARGET_PRINT_OPERAND_ADDRESS
17908 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17909
17910 #undef TARGET_OPTAB_SUPPORTED_P
17911 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17912
17913 #undef TARGET_OMIT_STRUCT_RETURN_REG
17914 #define TARGET_OMIT_STRUCT_RETURN_REG true
17915
17916 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17917 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17918 aarch64_dwarf_poly_indeterminate_value
17919
17920 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17921 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17922 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17923
17924 #undef TARGET_HARD_REGNO_NREGS
17925 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17926 #undef TARGET_HARD_REGNO_MODE_OK
17927 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17928
17929 #undef TARGET_MODES_TIEABLE_P
17930 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17931
17932 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17933 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17934 aarch64_hard_regno_call_part_clobbered
17935
17936 #undef TARGET_CONSTANT_ALIGNMENT
17937 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17938
17939 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17940 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17941
17942 #undef TARGET_CAN_CHANGE_MODE_CLASS
17943 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17944
17945 #undef TARGET_SELECT_EARLY_REMAT_MODES
17946 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17947
17948 #undef TARGET_SPECULATION_SAFE_VALUE
17949 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
17950
17951 #if CHECKING_P
17952 #undef TARGET_RUN_TARGET_SELFTESTS
17953 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17954 #endif /* #if CHECKING_P */
17955
17956 struct gcc_target targetm = TARGET_INITIALIZER;
17957
17958 #include "gt-aarch64.h"