1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
52 #include "langhooks.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
79 A simple base register plus immediate offset.
82 A base register indexed by immediate offset with writeback.
85 A base register indexed by (optionally scaled) register.
88 A base register indexed by (optionally scaled) zero-extended register.
91 A base register indexed by (optionally scaled) sign-extended register.
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 A constant symbolic address, in pc-relative literal pool. */
99 enum aarch64_address_type
{
109 struct aarch64_address_info
{
110 enum aarch64_address_type type
;
114 enum aarch64_symbol_type symbol_type
;
117 struct simd_immediate_info
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel
;
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
134 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
137 machine_mode
*, int *,
139 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
140 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode
);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
144 const unsigned char *sel
);
145 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version
;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune
= cortexa53
;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags
= 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads
;
163 /* Support for command line parsing of boolean flags in the tuning
165 struct aarch64_flag_desc
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
175 { "none", AARCH64_FUSE_NOTHING
},
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL
},
178 { NULL
, AARCH64_FUSE_NOTHING
}
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE
},
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL
},
188 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table
=
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table
=
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
241 static const struct cpu_addrcost_table xgene1_addrcost_table
=
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
289 static const struct cpu_regmove_cost generic_regmove_cost
=
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (actual, 4 and 9). */
329 static const struct cpu_regmove_cost thunderx_regmove_cost
=
337 static const struct cpu_regmove_cost xgene1_regmove_cost
=
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
350 /* Avoid the use of int<->fp moves for spilling. */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
359 /* Avoid the use of int<->fp moves for spilling. */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost
=
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost
=
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
405 /* Generic costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost
=
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
425 static const struct cpu_vector_cost exynosm1_vector_cost
=
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* Generic costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost
=
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost
=
487 1, /* Predictable. */
488 3 /* Unpredictable. */
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost
=
494 1, /* Predictable. */
495 3 /* Unpredictable. */
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost
=
501 1, /* Predictable. */
502 3 /* Unpredictable. */
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes
=
508 AARCH64_APPROX_NONE
, /* division */
509 AARCH64_APPROX_NONE
, /* sqrt */
510 AARCH64_APPROX_NONE
/* recip_sqrt */
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes
=
516 AARCH64_APPROX_NONE
, /* division */
517 AARCH64_APPROX_ALL
, /* sqrt */
518 AARCH64_APPROX_ALL
/* recip_sqrt */
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes
=
524 AARCH64_APPROX_NONE
, /* division */
525 AARCH64_APPROX_NONE
, /* sqrt */
526 AARCH64_APPROX_ALL
/* recip_sqrt */
529 /* Generic prefetch settings (which disable prefetch). */
530 static const cpu_prefetch_tune generic_prefetch_tune
=
533 -1, /* l1_cache_size */
534 -1, /* l1_cache_line_size */
535 -1, /* l2_cache_size */
536 -1 /* default_opt_level */
539 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
542 -1, /* l1_cache_size */
543 64, /* l1_cache_line_size */
544 -1, /* l2_cache_size */
545 -1 /* default_opt_level */
548 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
551 32, /* l1_cache_size */
552 64, /* l1_cache_line_size */
553 1024, /* l2_cache_size */
554 3 /* default_opt_level */
557 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
560 32, /* l1_cache_size */
561 128, /* l1_cache_line_size */
562 16*1024, /* l2_cache_size */
563 3 /* default_opt_level */
566 static const cpu_prefetch_tune thunderx_prefetch_tune
=
569 32, /* l1_cache_size */
570 128, /* l1_cache_line_size */
571 -1, /* l2_cache_size */
572 -1 /* default_opt_level */
575 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
578 32, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 256, /* l2_cache_size */
581 -1 /* default_opt_level */
584 static const struct tune_params generic_tunings
=
586 &cortexa57_extra_costs
,
587 &generic_addrcost_table
,
588 &generic_regmove_cost
,
589 &generic_vector_cost
,
590 &generic_branch_cost
,
591 &generic_approx_modes
,
594 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
595 8, /* function_align. */
598 2, /* int_reassoc_width. */
599 4, /* fp_reassoc_width. */
600 1, /* vec_reassoc_width. */
601 2, /* min_div_recip_mul_sf. */
602 2, /* min_div_recip_mul_df. */
603 0, /* max_case_values. */
604 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
605 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
606 &generic_prefetch_tune
609 static const struct tune_params cortexa35_tunings
=
611 &cortexa53_extra_costs
,
612 &generic_addrcost_table
,
613 &cortexa53_regmove_cost
,
614 &generic_vector_cost
,
615 &cortexa57_branch_cost
,
616 &generic_approx_modes
,
619 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
620 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
621 16, /* function_align. */
624 2, /* int_reassoc_width. */
625 4, /* fp_reassoc_width. */
626 1, /* vec_reassoc_width. */
627 2, /* min_div_recip_mul_sf. */
628 2, /* min_div_recip_mul_df. */
629 0, /* max_case_values. */
630 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
631 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
632 &generic_prefetch_tune
635 static const struct tune_params cortexa53_tunings
=
637 &cortexa53_extra_costs
,
638 &generic_addrcost_table
,
639 &cortexa53_regmove_cost
,
640 &generic_vector_cost
,
641 &cortexa57_branch_cost
,
642 &generic_approx_modes
,
645 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
646 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
647 16, /* function_align. */
650 2, /* int_reassoc_width. */
651 4, /* fp_reassoc_width. */
652 1, /* vec_reassoc_width. */
653 2, /* min_div_recip_mul_sf. */
654 2, /* min_div_recip_mul_df. */
655 0, /* max_case_values. */
656 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
657 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
658 &generic_prefetch_tune
661 static const struct tune_params cortexa57_tunings
=
663 &cortexa57_extra_costs
,
664 &cortexa57_addrcost_table
,
665 &cortexa57_regmove_cost
,
666 &cortexa57_vector_cost
,
667 &cortexa57_branch_cost
,
668 &generic_approx_modes
,
671 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
672 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
673 16, /* function_align. */
676 2, /* int_reassoc_width. */
677 4, /* fp_reassoc_width. */
678 1, /* vec_reassoc_width. */
679 2, /* min_div_recip_mul_sf. */
680 2, /* min_div_recip_mul_df. */
681 0, /* max_case_values. */
682 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
683 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
684 &generic_prefetch_tune
687 static const struct tune_params cortexa72_tunings
=
689 &cortexa57_extra_costs
,
690 &cortexa57_addrcost_table
,
691 &cortexa57_regmove_cost
,
692 &cortexa57_vector_cost
,
693 &cortexa57_branch_cost
,
694 &generic_approx_modes
,
697 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
698 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
699 16, /* function_align. */
702 2, /* int_reassoc_width. */
703 4, /* fp_reassoc_width. */
704 1, /* vec_reassoc_width. */
705 2, /* min_div_recip_mul_sf. */
706 2, /* min_div_recip_mul_df. */
707 0, /* max_case_values. */
708 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
709 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
710 &generic_prefetch_tune
713 static const struct tune_params cortexa73_tunings
=
715 &cortexa57_extra_costs
,
716 &cortexa57_addrcost_table
,
717 &cortexa57_regmove_cost
,
718 &cortexa57_vector_cost
,
719 &cortexa57_branch_cost
,
720 &generic_approx_modes
,
721 4, /* memmov_cost. */
723 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
724 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
725 16, /* function_align. */
728 2, /* int_reassoc_width. */
729 4, /* fp_reassoc_width. */
730 1, /* vec_reassoc_width. */
731 2, /* min_div_recip_mul_sf. */
732 2, /* min_div_recip_mul_df. */
733 0, /* max_case_values. */
734 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
735 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
736 &generic_prefetch_tune
741 static const struct tune_params exynosm1_tunings
=
743 &exynosm1_extra_costs
,
744 &exynosm1_addrcost_table
,
745 &exynosm1_regmove_cost
,
746 &exynosm1_vector_cost
,
747 &generic_branch_cost
,
748 &exynosm1_approx_modes
,
751 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
752 4, /* function_align. */
755 2, /* int_reassoc_width. */
756 4, /* fp_reassoc_width. */
757 1, /* vec_reassoc_width. */
758 2, /* min_div_recip_mul_sf. */
759 2, /* min_div_recip_mul_df. */
760 48, /* max_case_values. */
761 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
762 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
763 &exynosm1_prefetch_tune
766 static const struct tune_params thunderxt88_tunings
=
768 &thunderx_extra_costs
,
769 &generic_addrcost_table
,
770 &thunderx_regmove_cost
,
771 &thunderx_vector_cost
,
772 &generic_branch_cost
,
773 &generic_approx_modes
,
776 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
777 8, /* function_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
788 &thunderxt88_prefetch_tune
791 static const struct tune_params thunderx_tunings
=
793 &thunderx_extra_costs
,
794 &generic_addrcost_table
,
795 &thunderx_regmove_cost
,
796 &thunderx_vector_cost
,
797 &generic_branch_cost
,
798 &generic_approx_modes
,
801 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
802 8, /* function_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
813 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
814 &thunderx_prefetch_tune
817 static const struct tune_params xgene1_tunings
=
820 &xgene1_addrcost_table
,
821 &xgene1_regmove_cost
,
823 &generic_branch_cost
,
824 &xgene1_approx_modes
,
827 AARCH64_FUSE_NOTHING
, /* fusible_ops */
828 16, /* function_align. */
830 16, /* loop_align. */
831 2, /* int_reassoc_width. */
832 4, /* fp_reassoc_width. */
833 1, /* vec_reassoc_width. */
834 2, /* min_div_recip_mul_sf. */
835 2, /* min_div_recip_mul_df. */
836 0, /* max_case_values. */
837 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
838 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
839 &generic_prefetch_tune
842 static const struct tune_params qdf24xx_tunings
=
844 &qdf24xx_extra_costs
,
845 &qdf24xx_addrcost_table
,
846 &qdf24xx_regmove_cost
,
847 &generic_vector_cost
,
848 &generic_branch_cost
,
849 &generic_approx_modes
,
852 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
853 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
854 16, /* function_align. */
856 16, /* loop_align. */
857 2, /* int_reassoc_width. */
858 4, /* fp_reassoc_width. */
859 1, /* vec_reassoc_width. */
860 2, /* min_div_recip_mul_sf. */
861 2, /* min_div_recip_mul_df. */
862 0, /* max_case_values. */
863 tune_params::AUTOPREFETCHER_STRONG
, /* autoprefetcher_model. */
864 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
865 &qdf24xx_prefetch_tune
868 static const struct tune_params thunderx2t99_tunings
=
870 &thunderx2t99_extra_costs
,
871 &thunderx2t99_addrcost_table
,
872 &thunderx2t99_regmove_cost
,
873 &thunderx2t99_vector_cost
,
874 &thunderx2t99_branch_cost
,
875 &generic_approx_modes
,
876 4, /* memmov_cost. */
878 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
879 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
880 16, /* function_align. */
882 16, /* loop_align. */
883 3, /* int_reassoc_width. */
884 2, /* fp_reassoc_width. */
885 2, /* vec_reassoc_width. */
886 2, /* min_div_recip_mul_sf. */
887 2, /* min_div_recip_mul_df. */
888 0, /* max_case_values. */
889 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
890 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
891 &thunderx2t99_prefetch_tune
894 /* Support for fine-grained override of the tuning structures. */
895 struct aarch64_tuning_override_function
898 void (*parse_override
)(const char*, struct tune_params
*);
901 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
902 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
904 static const struct aarch64_tuning_override_function
905 aarch64_tuning_override_functions
[] =
907 { "fuse", aarch64_parse_fuse_string
},
908 { "tune", aarch64_parse_tune_string
},
912 /* A processor implementing AArch64. */
915 const char *const name
;
916 enum aarch64_processor ident
;
917 enum aarch64_processor sched_core
;
918 enum aarch64_arch arch
;
919 unsigned architecture_version
;
920 const unsigned long flags
;
921 const struct tune_params
*const tune
;
924 /* Architectures implementing AArch64. */
925 static const struct processor all_architectures
[] =
927 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
928 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
929 #include "aarch64-arches.def"
930 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
933 /* Processor cores implementing AArch64. */
934 static const struct processor all_cores
[] =
936 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
937 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
938 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
939 FLAGS, &COSTS##_tunings},
940 #include "aarch64-cores.def"
941 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
942 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
943 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
947 /* Target specification. These are populated by the -march, -mtune, -mcpu
948 handling code or by target attributes. */
949 static const struct processor
*selected_arch
;
950 static const struct processor
*selected_cpu
;
951 static const struct processor
*selected_tune
;
953 /* The current tuning set. */
954 struct tune_params aarch64_tune_params
= generic_tunings
;
956 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
958 /* An ISA extension in the co-processor and main instruction set space. */
959 struct aarch64_option_extension
961 const char *const name
;
962 const unsigned long flags_on
;
963 const unsigned long flags_off
;
966 typedef enum aarch64_cond_code
968 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
969 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
970 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
974 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
976 /* The condition codes of the processor, and the inverse function. */
977 static const char * const aarch64_condition_codes
[] =
979 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
980 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
983 /* Generate code to enable conditional branches in functions over 1 MiB. */
985 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
986 const char * branch_format
)
988 rtx_code_label
* tmp_label
= gen_label_rtx ();
991 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
992 CODE_LABEL_NUMBER (tmp_label
));
993 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
994 rtx dest_label
= operands
[pos_label
];
995 operands
[pos_label
] = tmp_label
;
997 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
998 output_asm_insn (buffer
, operands
);
1000 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1001 operands
[pos_label
] = dest_label
;
1002 output_asm_insn (buffer
, operands
);
1007 aarch64_err_no_fpadvsimd (machine_mode mode
, const char *msg
)
1009 const char *mc
= FLOAT_MODE_P (mode
) ? "floating-point" : "vector";
1010 if (TARGET_GENERAL_REGS_ONLY
)
1011 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc
, msg
);
1013 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc
, msg
);
1016 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1017 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1018 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1019 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1020 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1021 irrespectively of its cost results in bad allocations with many redundant
1022 int<->FP moves which are expensive on various cores.
1023 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1024 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1025 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1026 Otherwise set the allocno class depending on the mode.
1027 The result of this is that it is no longer inefficient to have a higher
1028 memory move cost than the register move cost.
1032 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1033 reg_class_t best_class
)
1037 if (allocno_class
!= ALL_REGS
)
1038 return allocno_class
;
1040 if (best_class
!= ALL_REGS
)
1043 mode
= PSEUDO_REGNO_MODE (regno
);
1044 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1048 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1050 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1051 return aarch64_tune_params
.min_div_recip_mul_sf
;
1052 return aarch64_tune_params
.min_div_recip_mul_df
;
1056 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
1059 if (VECTOR_MODE_P (mode
))
1060 return aarch64_tune_params
.vec_reassoc_width
;
1061 if (INTEGRAL_MODE_P (mode
))
1062 return aarch64_tune_params
.int_reassoc_width
;
1063 if (FLOAT_MODE_P (mode
))
1064 return aarch64_tune_params
.fp_reassoc_width
;
1068 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1070 aarch64_dbx_register_number (unsigned regno
)
1072 if (GP_REGNUM_P (regno
))
1073 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1074 else if (regno
== SP_REGNUM
)
1075 return AARCH64_DWARF_SP
;
1076 else if (FP_REGNUM_P (regno
))
1077 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1079 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1080 equivalent DWARF register. */
1081 return DWARF_FRAME_REGISTERS
;
1084 /* Return TRUE if MODE is any of the large INT modes. */
1086 aarch64_vect_struct_mode_p (machine_mode mode
)
1088 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
1091 /* Return TRUE if MODE is any of the vector modes. */
1093 aarch64_vector_mode_p (machine_mode mode
)
1095 return aarch64_vector_mode_supported_p (mode
)
1096 || aarch64_vect_struct_mode_p (mode
);
1099 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1101 aarch64_array_mode_supported_p (machine_mode mode
,
1102 unsigned HOST_WIDE_INT nelems
)
1105 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1106 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1107 && (nelems
>= 2 && nelems
<= 4))
1113 /* Implement HARD_REGNO_NREGS. */
1116 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1118 switch (aarch64_regno_regclass (regno
))
1122 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
1124 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
1129 /* Implement HARD_REGNO_MODE_OK. */
1132 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1134 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1135 return regno
== CC_REGNUM
;
1137 if (regno
== SP_REGNUM
)
1138 /* The purpose of comparing with ptr_mode is to support the
1139 global register variable associated with the stack pointer
1140 register via the syntax of asm ("wsp") in ILP32. */
1141 return mode
== Pmode
|| mode
== ptr_mode
;
1143 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1144 return mode
== Pmode
;
1146 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
1149 if (FP_REGNUM_P (regno
))
1151 if (aarch64_vect_struct_mode_p (mode
))
1153 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
1161 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1163 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
1166 /* Handle modes that fit within single registers. */
1167 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
1169 if (GET_MODE_SIZE (mode
) >= 4)
1174 /* Fall back to generic for multi-reg and very large modes. */
1176 return choose_hard_reg_mode (regno
, nregs
, false);
1179 /* Return true if calls to DECL should be treated as
1180 long-calls (ie called via a register). */
1182 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1187 /* Return true if calls to symbol-ref SYM should be treated as
1188 long-calls (ie called via a register). */
1190 aarch64_is_long_call_p (rtx sym
)
1192 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1195 /* Return true if calls to symbol-ref SYM should not go through
1199 aarch64_is_noplt_call_p (rtx sym
)
1201 const_tree decl
= SYMBOL_REF_DECL (sym
);
1206 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1207 && !targetm
.binds_local_p (decl
))
1213 /* Return true if the offsets to a zero/sign-extract operation
1214 represent an expression that matches an extend operation. The
1215 operands represent the paramters from
1217 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1219 aarch64_is_extend_from_extract (machine_mode mode
, rtx mult_imm
,
1222 HOST_WIDE_INT mult_val
, extract_val
;
1224 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1227 mult_val
= INTVAL (mult_imm
);
1228 extract_val
= INTVAL (extract_imm
);
1231 && extract_val
< GET_MODE_BITSIZE (mode
)
1232 && exact_log2 (extract_val
& ~7) > 0
1233 && (extract_val
& 7) <= 4
1234 && mult_val
== (1 << (extract_val
& 7)))
1240 /* Emit an insn that's a simple single-set. Both the operands must be
1241 known to be valid. */
1242 inline static rtx_insn
*
1243 emit_set_insn (rtx x
, rtx y
)
1245 return emit_insn (gen_rtx_SET (x
, y
));
1248 /* X and Y are two things to compare using CODE. Emit the compare insn and
1249 return the rtx for register 0 in the proper mode. */
1251 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1253 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1254 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1256 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1260 /* Build the SYMBOL_REF for __tls_get_addr. */
1262 static GTY(()) rtx tls_get_addr_libfunc
;
1265 aarch64_tls_get_addr (void)
1267 if (!tls_get_addr_libfunc
)
1268 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1269 return tls_get_addr_libfunc
;
1272 /* Return the TLS model to use for ADDR. */
1274 static enum tls_model
1275 tls_symbolic_operand_type (rtx addr
)
1277 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1280 if (GET_CODE (addr
) == CONST
)
1282 split_const (addr
, &sym
, &addend
);
1283 if (GET_CODE (sym
) == SYMBOL_REF
)
1284 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1286 else if (GET_CODE (addr
) == SYMBOL_REF
)
1287 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1292 /* We'll allow lo_sum's in addresses in our legitimate addresses
1293 so that combine would take care of combining addresses where
1294 necessary, but for generation purposes, we'll generate the address
1297 tmp = hi (symbol_ref); adrp x1, foo
1298 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1302 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1303 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1307 Load TLS symbol, depending on TLS mechanism and TLS access model.
1309 Global Dynamic - Traditional TLS:
1310 adrp tmp, :tlsgd:imm
1311 add dest, tmp, #:tlsgd_lo12:imm
1314 Global Dynamic - TLS Descriptors:
1315 adrp dest, :tlsdesc:imm
1316 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1317 add dest, dest, #:tlsdesc_lo12:imm
1324 adrp tmp, :gottprel:imm
1325 ldr dest, [tmp, #:gottprel_lo12:imm]
1330 add t0, tp, #:tprel_hi12:imm, lsl #12
1331 add t0, t0, #:tprel_lo12_nc:imm
1335 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1336 enum aarch64_symbol_type type
)
1340 case SYMBOL_SMALL_ABSOLUTE
:
1342 /* In ILP32, the mode of dest can be either SImode or DImode. */
1344 machine_mode mode
= GET_MODE (dest
);
1346 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1348 if (can_create_pseudo_p ())
1349 tmp_reg
= gen_reg_rtx (mode
);
1351 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1352 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1356 case SYMBOL_TINY_ABSOLUTE
:
1357 emit_insn (gen_rtx_SET (dest
, imm
));
1360 case SYMBOL_SMALL_GOT_28K
:
1362 machine_mode mode
= GET_MODE (dest
);
1363 rtx gp_rtx
= pic_offset_table_rtx
;
1367 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1368 here before rtl expand. Tree IVOPT will generate rtl pattern to
1369 decide rtx costs, in which case pic_offset_table_rtx is not
1370 initialized. For that case no need to generate the first adrp
1371 instruction as the final cost for global variable access is
1375 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1376 using the page base as GOT base, the first page may be wasted,
1377 in the worst scenario, there is only 28K space for GOT).
1379 The generate instruction sequence for accessing global variable
1382 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1384 Only one instruction needed. But we must initialize
1385 pic_offset_table_rtx properly. We generate initialize insn for
1386 every global access, and allow CSE to remove all redundant.
1388 The final instruction sequences will look like the following
1389 for multiply global variables access.
1391 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1393 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1394 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1395 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1398 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1399 crtl
->uses_pic_offset_table
= 1;
1400 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1402 if (mode
!= GET_MODE (gp_rtx
))
1403 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1407 if (mode
== ptr_mode
)
1410 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1412 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1414 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1418 gcc_assert (mode
== Pmode
);
1420 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1421 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1424 /* The operand is expected to be MEM. Whenever the related insn
1425 pattern changed, above code which calculate mem should be
1427 gcc_assert (GET_CODE (mem
) == MEM
);
1428 MEM_READONLY_P (mem
) = 1;
1429 MEM_NOTRAP_P (mem
) = 1;
1434 case SYMBOL_SMALL_GOT_4G
:
1436 /* In ILP32, the mode of dest can be either SImode or DImode,
1437 while the got entry is always of SImode size. The mode of
1438 dest depends on how dest is used: if dest is assigned to a
1439 pointer (e.g. in the memory), it has SImode; it may have
1440 DImode if dest is dereferenced to access the memeory.
1441 This is why we have to handle three different ldr_got_small
1442 patterns here (two patterns for ILP32). */
1447 machine_mode mode
= GET_MODE (dest
);
1449 if (can_create_pseudo_p ())
1450 tmp_reg
= gen_reg_rtx (mode
);
1452 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1453 if (mode
== ptr_mode
)
1456 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1458 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1460 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1464 gcc_assert (mode
== Pmode
);
1466 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1467 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1470 gcc_assert (GET_CODE (mem
) == MEM
);
1471 MEM_READONLY_P (mem
) = 1;
1472 MEM_NOTRAP_P (mem
) = 1;
1477 case SYMBOL_SMALL_TLSGD
:
1480 machine_mode mode
= GET_MODE (dest
);
1481 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1485 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1487 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1488 insns
= get_insns ();
1491 RTL_CONST_CALL_P (insns
) = 1;
1492 emit_libcall_block (insns
, dest
, result
, imm
);
1496 case SYMBOL_SMALL_TLSDESC
:
1498 machine_mode mode
= GET_MODE (dest
);
1499 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1502 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1504 /* In ILP32, the got entry is always of SImode size. Unlike
1505 small GOT, the dest is fixed at reg 0. */
1507 emit_insn (gen_tlsdesc_small_si (imm
));
1509 emit_insn (gen_tlsdesc_small_di (imm
));
1510 tp
= aarch64_load_tp (NULL
);
1513 tp
= gen_lowpart (mode
, tp
);
1515 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1516 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1520 case SYMBOL_SMALL_TLSIE
:
1522 /* In ILP32, the mode of dest can be either SImode or DImode,
1523 while the got entry is always of SImode size. The mode of
1524 dest depends on how dest is used: if dest is assigned to a
1525 pointer (e.g. in the memory), it has SImode; it may have
1526 DImode if dest is dereferenced to access the memeory.
1527 This is why we have to handle three different tlsie_small
1528 patterns here (two patterns for ILP32). */
1529 machine_mode mode
= GET_MODE (dest
);
1530 rtx tmp_reg
= gen_reg_rtx (mode
);
1531 rtx tp
= aarch64_load_tp (NULL
);
1533 if (mode
== ptr_mode
)
1536 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1539 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1540 tp
= gen_lowpart (mode
, tp
);
1545 gcc_assert (mode
== Pmode
);
1546 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1549 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1550 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1554 case SYMBOL_TLSLE12
:
1555 case SYMBOL_TLSLE24
:
1556 case SYMBOL_TLSLE32
:
1557 case SYMBOL_TLSLE48
:
1559 machine_mode mode
= GET_MODE (dest
);
1560 rtx tp
= aarch64_load_tp (NULL
);
1563 tp
= gen_lowpart (mode
, tp
);
1567 case SYMBOL_TLSLE12
:
1568 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1571 case SYMBOL_TLSLE24
:
1572 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1575 case SYMBOL_TLSLE32
:
1576 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1578 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1581 case SYMBOL_TLSLE48
:
1582 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1584 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1591 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1595 case SYMBOL_TINY_GOT
:
1596 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1599 case SYMBOL_TINY_TLSIE
:
1601 machine_mode mode
= GET_MODE (dest
);
1602 rtx tp
= aarch64_load_tp (NULL
);
1604 if (mode
== ptr_mode
)
1607 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1610 tp
= gen_lowpart (mode
, tp
);
1611 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1616 gcc_assert (mode
== Pmode
);
1617 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1620 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1629 /* Emit a move from SRC to DEST. Assume that the move expanders can
1630 handle all moves if !can_create_pseudo_p (). The distinction is
1631 important because, unlike emit_move_insn, the move expanders know
1632 how to force Pmode objects into the constant pool even when the
1633 constant pool address is not itself legitimate. */
1635 aarch64_emit_move (rtx dest
, rtx src
)
1637 return (can_create_pseudo_p ()
1638 ? emit_move_insn (dest
, src
)
1639 : emit_move_insn_1 (dest
, src
));
1642 /* Split a 128-bit move operation into two 64-bit move operations,
1643 taking care to handle partial overlap of register to register
1644 copies. Special cases are needed when moving between GP regs and
1645 FP regs. SRC can be a register, constant or memory; DST a register
1646 or memory. If either operand is memory it must not have any side
1649 aarch64_split_128bit_move (rtx dst
, rtx src
)
1654 machine_mode mode
= GET_MODE (dst
);
1656 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1657 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1658 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1660 if (REG_P (dst
) && REG_P (src
))
1662 int src_regno
= REGNO (src
);
1663 int dst_regno
= REGNO (dst
);
1665 /* Handle FP <-> GP regs. */
1666 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1668 src_lo
= gen_lowpart (word_mode
, src
);
1669 src_hi
= gen_highpart (word_mode
, src
);
1673 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1674 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1678 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1679 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1683 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1685 dst_lo
= gen_lowpart (word_mode
, dst
);
1686 dst_hi
= gen_highpart (word_mode
, dst
);
1690 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1691 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1695 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1696 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1702 dst_lo
= gen_lowpart (word_mode
, dst
);
1703 dst_hi
= gen_highpart (word_mode
, dst
);
1704 src_lo
= gen_lowpart (word_mode
, src
);
1705 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1707 /* At most one pairing may overlap. */
1708 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1710 aarch64_emit_move (dst_hi
, src_hi
);
1711 aarch64_emit_move (dst_lo
, src_lo
);
1715 aarch64_emit_move (dst_lo
, src_lo
);
1716 aarch64_emit_move (dst_hi
, src_hi
);
1721 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1723 return (! REG_P (src
)
1724 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1727 /* Split a complex SIMD combine. */
1730 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1732 machine_mode src_mode
= GET_MODE (src1
);
1733 machine_mode dst_mode
= GET_MODE (dst
);
1735 gcc_assert (VECTOR_MODE_P (dst_mode
));
1736 gcc_assert (register_operand (dst
, dst_mode
)
1737 && register_operand (src1
, src_mode
)
1738 && register_operand (src2
, src_mode
));
1740 rtx (*gen
) (rtx
, rtx
, rtx
);
1745 gen
= gen_aarch64_simd_combinev8qi
;
1748 gen
= gen_aarch64_simd_combinev4hi
;
1751 gen
= gen_aarch64_simd_combinev2si
;
1754 gen
= gen_aarch64_simd_combinev4hf
;
1757 gen
= gen_aarch64_simd_combinev2sf
;
1760 gen
= gen_aarch64_simd_combinedi
;
1763 gen
= gen_aarch64_simd_combinedf
;
1769 emit_insn (gen (dst
, src1
, src2
));
1773 /* Split a complex SIMD move. */
1776 aarch64_split_simd_move (rtx dst
, rtx src
)
1778 machine_mode src_mode
= GET_MODE (src
);
1779 machine_mode dst_mode
= GET_MODE (dst
);
1781 gcc_assert (VECTOR_MODE_P (dst_mode
));
1783 if (REG_P (dst
) && REG_P (src
))
1785 rtx (*gen
) (rtx
, rtx
);
1787 gcc_assert (VECTOR_MODE_P (src_mode
));
1792 gen
= gen_aarch64_split_simd_movv16qi
;
1795 gen
= gen_aarch64_split_simd_movv8hi
;
1798 gen
= gen_aarch64_split_simd_movv4si
;
1801 gen
= gen_aarch64_split_simd_movv2di
;
1804 gen
= gen_aarch64_split_simd_movv8hf
;
1807 gen
= gen_aarch64_split_simd_movv4sf
;
1810 gen
= gen_aarch64_split_simd_movv2df
;
1816 emit_insn (gen (dst
, src
));
1822 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
1823 machine_mode ymode
, rtx y
)
1825 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
1826 gcc_assert (r
!= NULL
);
1827 return rtx_equal_p (x
, r
);
1832 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1834 if (can_create_pseudo_p ())
1835 return force_reg (mode
, value
);
1838 x
= aarch64_emit_move (x
, value
);
1845 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1847 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1850 /* Load the full offset into a register. This
1851 might be improvable in the future. */
1852 high
= GEN_INT (offset
);
1854 high
= aarch64_force_temporary (mode
, temp
, high
);
1855 reg
= aarch64_force_temporary (mode
, temp
,
1856 gen_rtx_PLUS (mode
, high
, reg
));
1858 return plus_constant (mode
, reg
, offset
);
1862 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1866 unsigned HOST_WIDE_INT val
, val2
, mask
;
1867 int one_match
, zero_match
;
1872 if (aarch64_move_imm (val
, mode
))
1875 emit_insn (gen_rtx_SET (dest
, imm
));
1879 if ((val
>> 32) == 0 || mode
== SImode
)
1883 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
1885 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1886 GEN_INT ((val
>> 16) & 0xffff)));
1888 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
1889 GEN_INT ((val
>> 16) & 0xffff)));
1894 /* Remaining cases are all for DImode. */
1897 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
1898 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
1899 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
1900 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
1902 if (zero_match
!= 2 && one_match
!= 2)
1904 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1905 For a 64-bit bitmask try whether changing 16 bits to all ones or
1906 zeroes creates a valid bitmask. To check any repeated bitmask,
1907 try using 16 bits from the other 32-bit half of val. */
1909 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1912 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1915 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1917 val2
= val2
& ~mask
;
1918 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
1919 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1926 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1927 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1928 GEN_INT ((val
>> i
) & 0xffff)));
1934 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1935 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1936 otherwise skip zero bits. */
1940 val2
= one_match
> zero_match
? ~val
: val
;
1941 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
1944 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
1945 ? (val
| ~(mask
<< i
))
1946 : (val
& (mask
<< i
)))));
1947 for (i
+= 16; i
< 64; i
+= 16)
1949 if ((val2
& (mask
<< i
)) == 0)
1952 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1953 GEN_INT ((val
>> i
) & 0xffff)));
1962 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1964 machine_mode mode
= GET_MODE (dest
);
1966 gcc_assert (mode
== SImode
|| mode
== DImode
);
1968 /* Check on what type of symbol it is. */
1969 if (GET_CODE (imm
) == SYMBOL_REF
1970 || GET_CODE (imm
) == LABEL_REF
1971 || GET_CODE (imm
) == CONST
)
1973 rtx mem
, base
, offset
;
1974 enum aarch64_symbol_type sty
;
1976 /* If we have (const (plus symbol offset)), separate out the offset
1977 before we start classifying the symbol. */
1978 split_const (imm
, &base
, &offset
);
1980 sty
= aarch64_classify_symbol (base
, offset
);
1983 case SYMBOL_FORCE_TO_MEM
:
1984 if (offset
!= const0_rtx
1985 && targetm
.cannot_force_const_mem (mode
, imm
))
1987 gcc_assert (can_create_pseudo_p ());
1988 base
= aarch64_force_temporary (mode
, dest
, base
);
1989 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1990 aarch64_emit_move (dest
, base
);
1994 mem
= force_const_mem (ptr_mode
, imm
);
1997 /* If we aren't generating PC relative literals, then
1998 we need to expand the literal pool access carefully.
1999 This is something that needs to be done in a number
2000 of places, so could well live as a separate function. */
2001 if (!aarch64_pcrelative_literal_loads
)
2003 gcc_assert (can_create_pseudo_p ());
2004 base
= gen_reg_rtx (ptr_mode
);
2005 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
2006 if (ptr_mode
!= Pmode
)
2007 base
= convert_memory_address (Pmode
, base
);
2008 mem
= gen_rtx_MEM (ptr_mode
, base
);
2011 if (mode
!= ptr_mode
)
2012 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
2014 emit_insn (gen_rtx_SET (dest
, mem
));
2018 case SYMBOL_SMALL_TLSGD
:
2019 case SYMBOL_SMALL_TLSDESC
:
2020 case SYMBOL_SMALL_TLSIE
:
2021 case SYMBOL_SMALL_GOT_28K
:
2022 case SYMBOL_SMALL_GOT_4G
:
2023 case SYMBOL_TINY_GOT
:
2024 case SYMBOL_TINY_TLSIE
:
2025 if (offset
!= const0_rtx
)
2027 gcc_assert(can_create_pseudo_p ());
2028 base
= aarch64_force_temporary (mode
, dest
, base
);
2029 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
2030 aarch64_emit_move (dest
, base
);
2035 case SYMBOL_SMALL_ABSOLUTE
:
2036 case SYMBOL_TINY_ABSOLUTE
:
2037 case SYMBOL_TLSLE12
:
2038 case SYMBOL_TLSLE24
:
2039 case SYMBOL_TLSLE32
:
2040 case SYMBOL_TLSLE48
:
2041 aarch64_load_symref_appropriately (dest
, imm
, sty
);
2049 if (!CONST_INT_P (imm
))
2051 if (GET_CODE (imm
) == HIGH
)
2052 emit_insn (gen_rtx_SET (dest
, imm
));
2055 rtx mem
= force_const_mem (mode
, imm
);
2057 emit_insn (gen_rtx_SET (dest
, mem
));
2063 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
2066 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2067 temporary value if necessary. FRAME_RELATED_P should be true if
2068 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2069 to the generated instructions. If SCRATCHREG is known to hold
2070 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2073 Since this function may be used to adjust the stack pointer, we must
2074 ensure that it cannot cause transient stack deallocation (for example
2075 by first incrementing SP and then decrementing when adjusting by a
2076 large immediate). */
2079 aarch64_add_constant_internal (machine_mode mode
, int regnum
, int scratchreg
,
2080 HOST_WIDE_INT delta
, bool frame_related_p
,
2083 HOST_WIDE_INT mdelta
= abs_hwi (delta
);
2084 rtx this_rtx
= gen_rtx_REG (mode
, regnum
);
2090 /* Single instruction adjustment. */
2091 if (aarch64_uimm12_shift (mdelta
))
2093 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
)));
2094 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2098 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2099 Only do this if mdelta is not a 16-bit move as adjusting using a move
2101 if (mdelta
< 0x1000000 && !aarch64_move_imm (mdelta
, mode
))
2103 HOST_WIDE_INT low_off
= mdelta
& 0xfff;
2105 low_off
= delta
< 0 ? -low_off
: low_off
;
2106 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (low_off
)));
2107 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2108 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
- low_off
)));
2109 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2113 /* Emit a move immediate if required and an addition/subtraction. */
2114 rtx scratch_rtx
= gen_rtx_REG (mode
, scratchreg
);
2116 aarch64_internal_mov_immediate (scratch_rtx
, GEN_INT (mdelta
), true, mode
);
2117 insn
= emit_insn (delta
< 0 ? gen_sub2_insn (this_rtx
, scratch_rtx
)
2118 : gen_add2_insn (this_rtx
, scratch_rtx
));
2119 if (frame_related_p
)
2121 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2122 rtx adj
= plus_constant (mode
, this_rtx
, delta
);
2123 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (this_rtx
, adj
));
2128 aarch64_add_constant (machine_mode mode
, int regnum
, int scratchreg
,
2129 HOST_WIDE_INT delta
)
2131 aarch64_add_constant_internal (mode
, regnum
, scratchreg
, delta
, false, true);
2135 aarch64_add_sp (int scratchreg
, HOST_WIDE_INT delta
, bool emit_move_imm
)
2137 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, delta
,
2138 true, emit_move_imm
);
2142 aarch64_sub_sp (int scratchreg
, HOST_WIDE_INT delta
, bool frame_related_p
)
2144 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, -delta
,
2145 frame_related_p
, true);
2149 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
2150 tree exp ATTRIBUTE_UNUSED
)
2152 /* Currently, always true. */
2156 /* Implement TARGET_PASS_BY_REFERENCE. */
2159 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
2162 bool named ATTRIBUTE_UNUSED
)
2165 machine_mode dummymode
;
2168 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2169 size
= (mode
== BLKmode
&& type
)
2170 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
2172 /* Aggregates are passed by reference based on their size. */
2173 if (type
&& AGGREGATE_TYPE_P (type
))
2175 size
= int_size_in_bytes (type
);
2178 /* Variable sized arguments are always returned by reference. */
2182 /* Can this be a candidate to be passed in fp/simd register(s)? */
2183 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2188 /* Arguments which are variable sized or larger than 2 registers are
2189 passed by reference unless they are a homogenous floating point
2191 return size
> 2 * UNITS_PER_WORD
;
2194 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2196 aarch64_return_in_msb (const_tree valtype
)
2198 machine_mode dummy_mode
;
2201 /* Never happens in little-endian mode. */
2202 if (!BYTES_BIG_ENDIAN
)
2205 /* Only composite types smaller than or equal to 16 bytes can
2206 be potentially returned in registers. */
2207 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
2208 || int_size_in_bytes (valtype
) <= 0
2209 || int_size_in_bytes (valtype
) > 16)
2212 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2213 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2214 is always passed/returned in the least significant bits of fp/simd
2216 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
2217 &dummy_mode
, &dummy_int
, NULL
))
2223 /* Implement TARGET_FUNCTION_VALUE.
2224 Define how to find the value returned by a function. */
2227 aarch64_function_value (const_tree type
, const_tree func
,
2228 bool outgoing ATTRIBUTE_UNUSED
)
2233 machine_mode ag_mode
;
2235 mode
= TYPE_MODE (type
);
2236 if (INTEGRAL_TYPE_P (type
))
2237 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
2239 if (aarch64_return_in_msb (type
))
2241 HOST_WIDE_INT size
= int_size_in_bytes (type
);
2243 if (size
% UNITS_PER_WORD
!= 0)
2245 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
2246 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
2250 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2251 &ag_mode
, &count
, NULL
))
2253 if (!aarch64_composite_type_p (type
, mode
))
2255 gcc_assert (count
== 1 && mode
== ag_mode
);
2256 return gen_rtx_REG (mode
, V0_REGNUM
);
2263 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
2264 for (i
= 0; i
< count
; i
++)
2266 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
2267 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2268 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
2269 XVECEXP (par
, 0, i
) = tmp
;
2275 return gen_rtx_REG (mode
, R0_REGNUM
);
2278 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2279 Return true if REGNO is the number of a hard register in which the values
2280 of called function may come back. */
2283 aarch64_function_value_regno_p (const unsigned int regno
)
2285 /* Maximum of 16 bytes can be returned in the general registers. Examples
2286 of 16-byte return values are: 128-bit integers and 16-byte small
2287 structures (excluding homogeneous floating-point aggregates). */
2288 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
2291 /* Up to four fp/simd registers can return a function value, e.g. a
2292 homogeneous floating-point aggregate having four members. */
2293 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
2294 return TARGET_FLOAT
;
2299 /* Implement TARGET_RETURN_IN_MEMORY.
2301 If the type T of the result of a function is such that
2303 would require that arg be passed as a value in a register (or set of
2304 registers) according to the parameter passing rules, then the result
2305 is returned in the same registers as would be used for such an
2309 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
2312 machine_mode ag_mode
;
2315 if (!AGGREGATE_TYPE_P (type
)
2316 && TREE_CODE (type
) != COMPLEX_TYPE
2317 && TREE_CODE (type
) != VECTOR_TYPE
)
2318 /* Simple scalar types always returned in registers. */
2321 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
2328 /* Types larger than 2 registers returned in memory. */
2329 size
= int_size_in_bytes (type
);
2330 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
2334 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
2335 const_tree type
, int *nregs
)
2337 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2338 return aarch64_vfp_is_call_or_return_candidate (mode
,
2340 &pcum
->aapcs_vfp_rmode
,
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
   This is a helper function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type)
{
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;

  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      alignment = std::max (alignment, DECL_ALIGN (field));

  return alignment;
}
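/* Illustrative sketch of the computation above (hypothetical type, not
   taken from the sources): for

     struct s { int32_t a; int64_t b; };

   the loop takes the maximum DECL_ALIGN over the FIELD_DECLs, i.e.
   max (32, 64) = 64 bits, which is the natural alignment regardless of any
   stronger alignment the user requested on the struct itself.  A scalar
   argument passed with no type, e.g. in DFmode, simply gets
   GET_MODE_ALIGNMENT, i.e. its natural 64-bit alignment.  */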
2375 /* Layout a function argument according to the AAPCS64 rules. The rule
2376 numbers refer to the rule numbers in the AAPCS64. */
2379 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2381 bool named ATTRIBUTE_UNUSED
)
2383 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2384 int ncrn
, nvrn
, nregs
;
2385 bool allocate_ncrn
, allocate_nvrn
;
2388 /* We need to do this once per argument. */
2389 if (pcum
->aapcs_arg_processed
)
2392 pcum
->aapcs_arg_processed
= true;
2394 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2396 = ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
2399 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
2400 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
2405 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2406 The following code thus handles passing by SIMD/FP registers first. */
2408 nvrn
= pcum
->aapcs_nvrn
;
2410 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2411 and homogenous short-vector aggregates (HVA). */
2415 aarch64_err_no_fpadvsimd (mode
, "argument");
2417 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
2419 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
2420 if (!aarch64_composite_type_p (type
, mode
))
2422 gcc_assert (nregs
== 1);
2423 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
2429 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2430 for (i
= 0; i
< nregs
; i
++)
2432 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
2433 V0_REGNUM
+ nvrn
+ i
);
2434 tmp
= gen_rtx_EXPR_LIST
2436 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
2437 XVECEXP (par
, 0, i
) = tmp
;
2439 pcum
->aapcs_reg
= par
;
2445 /* C.3 NSRN is set to 8. */
2446 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
2451 ncrn
= pcum
->aapcs_ncrn
;
2452 nregs
= size
/ UNITS_PER_WORD
;
2454 /* C6 - C9. though the sign and zero extension semantics are
2455 handled elsewhere. This is the case where the argument fits
2456 entirely general registers. */
2457 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
2460 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
2462 /* C.8 if the argument has an alignment of 16 then the NGRN is
2463 rounded up to the next even number. */
2466 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2467 comparison is there because for > 16 * BITS_PER_UNIT
2468 alignment nregs should be > 2 and therefore it should be
2469 passed by reference rather than value. */
2470 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2473 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
2476 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2477 A reg is still generated for it, but the caller should be smart
2478 enough not to use it. */
2479 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
2480 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
2486 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2487 for (i
= 0; i
< nregs
; i
++)
2489 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
2490 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2491 GEN_INT (i
* UNITS_PER_WORD
));
2492 XVECEXP (par
, 0, i
) = tmp
;
2494 pcum
->aapcs_reg
= par
;
2497 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
2502 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
2504 /* The argument is passed on stack; record the needed number of words for
2505 this argument and align the total size if necessary. */
2507 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
2509 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2510 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
2511 16 / UNITS_PER_WORD
);
2515 /* Implement TARGET_FUNCTION_ARG. */
2518 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2519 const_tree type
, bool named
)
2521 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2522 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
2524 if (mode
== VOIDmode
)
2527 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2528 return pcum
->aapcs_reg
;
2532 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
2533 const_tree fntype ATTRIBUTE_UNUSED
,
2534 rtx libname ATTRIBUTE_UNUSED
,
2535 const_tree fndecl ATTRIBUTE_UNUSED
,
2536 unsigned n_named ATTRIBUTE_UNUSED
)
2538 pcum
->aapcs_ncrn
= 0;
2539 pcum
->aapcs_nvrn
= 0;
2540 pcum
->aapcs_nextncrn
= 0;
2541 pcum
->aapcs_nextnvrn
= 0;
2542 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
2543 pcum
->aapcs_reg
= NULL_RTX
;
2544 pcum
->aapcs_arg_processed
= false;
2545 pcum
->aapcs_stack_words
= 0;
2546 pcum
->aapcs_stack_size
= 0;
2549 && fndecl
&& TREE_PUBLIC (fndecl
)
2550 && fntype
&& fntype
!= error_mark_node
)
2552 const_tree type
= TREE_TYPE (fntype
);
2553 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
2554 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
2555 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
2556 &mode
, &nregs
, NULL
))
2557 aarch64_err_no_fpadvsimd (TYPE_MODE (type
), "return type");
2563 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
2568 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2569 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
2571 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2572 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
2573 != (pcum
->aapcs_stack_words
!= 0));
2574 pcum
->aapcs_arg_processed
= false;
2575 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
2576 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
2577 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
2578 pcum
->aapcs_stack_words
= 0;
2579 pcum
->aapcs_reg
= NULL_RTX
;
2584 aarch64_function_arg_regno_p (unsigned regno
)
2586 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
2587 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);

  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
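/* Worked example of the clamp above (assuming the usual aarch64 values
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): a 'char' argument gives
   MIN (MAX (8, 64), 128) == 64, a plain 64-bit integer also gives 64, and
   a 16-byte-aligned composite gives MIN (MAX (128, 64), 128) == 128, so no
   argument slot is ever aligned to more than 16 bytes.  */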
/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).

   Return true if an argument passed on the stack should be padded upwards,
   i.e. if the least-significant byte of the stack slot has useful data.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

bool
aarch64_pad_arg_upward (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return true;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return false;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return true;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
                            : GET_MODE_SIZE (mode));
      if (size < 2 * UNITS_PER_WORD)
        return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
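/* Illustrative example of the rule above (hypothetical types): on a
   big-endian target a 12-byte struct (smaller than 2 * UNITS_PER_WORD ==
   16 bytes) is padded upward, so its bytes sit at the least significant
   end of the register; a 24-byte struct falls through to the default and
   gets !BYTES_BIG_ENDIAN == false, i.e. downward padding.  */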
2675 aarch64_libgcc_cmp_return_mode (void)
2680 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2682 /* We use the 12-bit shifted immediate arithmetic instructions so values
2683 must be multiple of (1 << 12), i.e. 4096. */
2684 #define ARITH_FACTOR 4096
2686 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2687 #error Cannot use simple address calculation for stack probing
2690 /* The pair of scratch registers used for stack probing. */
2691 #define PROBE_STACK_FIRST_REG 9
2692 #define PROBE_STACK_SECOND_REG 10
2694 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2695 inclusive. These are offsets from the current stack pointer. */
2698 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, HOST_WIDE_INT size
)
2700 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
2702 /* See the same assertion on PROBE_INTERVAL above. */
2703 gcc_assert ((first
% ARITH_FACTOR
) == 0);
2705 /* See if we have a constant small number of probes to generate. If so,
2706 that's the easy case. */
2707 if (size
<= PROBE_INTERVAL
)
2709 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
2711 emit_set_insn (reg1
,
2712 plus_constant (Pmode
,
2713 stack_pointer_rtx
, -(first
+ base
)));
2714 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
2717 /* The run-time loop is made up of 8 insns in the generic case while the
2718 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2719 else if (size
<= 4 * PROBE_INTERVAL
)
2721 HOST_WIDE_INT i
, rem
;
2723 emit_set_insn (reg1
,
2724 plus_constant (Pmode
,
2726 -(first
+ PROBE_INTERVAL
)));
2727 emit_stack_probe (reg1
);
2729 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2730 it exceeds SIZE. If only two probes are needed, this will not
2731 generate any code. Then probe at FIRST + SIZE. */
2732 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
2734 emit_set_insn (reg1
,
2735 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
2736 emit_stack_probe (reg1
);
2739 rem
= size
- (i
- PROBE_INTERVAL
);
2742 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2744 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
2745 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
2748 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
2751 /* Otherwise, do the same as above, but in a loop. Note that we must be
2752 extra careful with variables wrapping around because we might be at
2753 the very top (or the very bottom) of the address space and we have
2754 to be able to handle this case properly; in particular, we use an
2755 equality test for the loop condition. */
2758 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
2760 /* Step 1: round SIZE to the previous multiple of the interval. */
2762 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
2765 /* Step 2: compute initial and final value of the loop counter. */
2767 /* TEST_ADDR = SP + FIRST. */
2768 emit_set_insn (reg1
,
2769 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
2771 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2772 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
2773 if (! aarch64_uimm12_shift (adjustment
))
2775 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
2777 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
2781 emit_set_insn (reg2
,
2782 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
2789 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2792 while (TEST_ADDR != LAST_ADDR)
2794 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2795 until it is equal to ROUNDED_SIZE. */
2797 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
2800 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2801 that SIZE is equal to ROUNDED_SIZE. */
2803 if (size
!= rounded_size
)
2805 HOST_WIDE_INT rem
= size
- rounded_size
;
2809 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2811 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
2812 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
2815 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
2819 /* Make sure nothing is scheduled before we are done. */
2820 emit_insn (gen_blockage ());
2823 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2824 absolute addresses. */
2827 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
2829 static int labelno
= 0;
2833 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
2836 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
2838 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2840 xops
[1] = GEN_INT (PROBE_INTERVAL
);
2841 output_asm_insn ("sub\t%0, %0, %1", xops
);
2843 /* Probe at TEST_ADDR. */
2844 output_asm_insn ("str\txzr, [%0]", xops
);
2846 /* Test if TEST_ADDR == LAST_ADDR. */
2848 output_asm_insn ("cmp\t%0, %1", xops
);
2851 fputs ("\tb.ne\t", asm_out_file
);
2852 assemble_name_raw (asm_out_file
, loop_lab
);
2853 fputc ('\n', asm_out_file
);
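/* For reference, with the default PROBE_INTERVAL of 4096 (an assumption;
   it is 1 << STACK_CHECK_PROBE_INTERVAL_EXP) and the scratch registers
   x9/x10 selected above, the loop emitted here looks roughly like:

       .LPSRL0:
         sub   x9, x9, 4096      // TEST_ADDR -= PROBE_INTERVAL
         str   xzr, [x9]         // probe at TEST_ADDR
         cmp   x9, x10           // reached LAST_ADDR yet?
         b.ne  .LPSRL0

   This is only a sketch of the generated assembly, not output captured
   from the compiler.  */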
2859 aarch64_frame_pointer_required (void)
2861 /* In aarch64_override_options_after_change
2862 flag_omit_leaf_frame_pointer turns off the frame pointer by
2863 default. Turn it back on now if we've not got a leaf
2865 if (flag_omit_leaf_frame_pointer
2866 && (!crtl
->is_leaf
|| df_regs_ever_live_p (LR_REGNUM
)))
2869 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2870 if (crtl
->calls_eh_return
)
2876 /* Mark the registers that need to be saved by the callee and calculate
2877 the size of the callee-saved registers area and frame record (both FP
2878 and LR may be omitted). */
2880 aarch64_layout_frame (void)
2882 HOST_WIDE_INT offset
= 0;
2883 int regno
, last_fp_reg
= INVALID_REGNUM
;
2885 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2888 #define SLOT_NOT_REQUIRED (-2)
2889 #define SLOT_REQUIRED (-1)
2891 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
2892 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
2894 /* First mark all the registers that really need to be saved... */
2895 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2896 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2898 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2899 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2901 /* ... that includes the eh data registers (if needed)... */
2902 if (crtl
->calls_eh_return
)
2903 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2904 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2907 /* ... and any callee saved register that dataflow says is live. */
2908 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2909 if (df_regs_ever_live_p (regno
)
2910 && (regno
== R30_REGNUM
2911 || !call_used_regs
[regno
]))
2912 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2914 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2915 if (df_regs_ever_live_p (regno
)
2916 && !call_used_regs
[regno
])
2918 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2919 last_fp_reg
= regno
;
2922 if (frame_pointer_needed
)
2924 /* FP and LR are placed in the linkage record. */
2925 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2926 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2927 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2928 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2929 offset
+= 2 * UNITS_PER_WORD
;
2932 /* Now assign stack slots for them. */
2933 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2934 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2936 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2937 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2938 cfun
->machine
->frame
.wb_candidate1
= regno
;
2939 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
2940 cfun
->machine
->frame
.wb_candidate2
= regno
;
2941 offset
+= UNITS_PER_WORD
;
2944 HOST_WIDE_INT max_int_offset
= offset
;
2945 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2946 bool has_align_gap
= offset
!= max_int_offset
;
2948 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2949 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2951 /* If there is an alignment gap between integer and fp callee-saves,
2952 allocate the last fp register to it if possible. */
2953 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
2955 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
2959 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2960 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2961 cfun
->machine
->frame
.wb_candidate1
= regno
;
2962 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
2963 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2964 cfun
->machine
->frame
.wb_candidate2
= regno
;
2965 offset
+= UNITS_PER_WORD
;
2968 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2970 cfun
->machine
->frame
.saved_regs_size
= offset
;
2972 HOST_WIDE_INT varargs_and_saved_regs_size
2973 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
2975 cfun
->machine
->frame
.hard_fp_offset
2976 = ROUND_UP (varargs_and_saved_regs_size
+ get_frame_size (),
2977 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2979 cfun
->machine
->frame
.frame_size
2980 = ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2981 + crtl
->outgoing_args_size
,
2982 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2984 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
2986 cfun
->machine
->frame
.initial_adjust
= 0;
2987 cfun
->machine
->frame
.final_adjust
= 0;
2988 cfun
->machine
->frame
.callee_adjust
= 0;
2989 cfun
->machine
->frame
.callee_offset
= 0;
2991 HOST_WIDE_INT max_push_offset
= 0;
2992 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
2993 max_push_offset
= 512;
2994 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
2995 max_push_offset
= 256;
2997 if (cfun
->machine
->frame
.frame_size
< max_push_offset
2998 && crtl
->outgoing_args_size
== 0)
3000 /* Simple, small frame with no outgoing arguments:
3001 stp reg1, reg2, [sp, -frame_size]!
3002 stp reg3, reg4, [sp, 16] */
3003 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.frame_size
;
3005 else if ((crtl
->outgoing_args_size
3006 + cfun
->machine
->frame
.saved_regs_size
< 512)
3007 && !(cfun
->calls_alloca
3008 && cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
))
3010 /* Frame with small outgoing arguments:
3011 sub sp, sp, frame_size
3012 stp reg1, reg2, [sp, outgoing_args_size]
3013 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3014 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
3015 cfun
->machine
->frame
.callee_offset
3016 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
3018 else if (cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
)
3020 /* Frame with large outgoing arguments but a small local area:
3021 stp reg1, reg2, [sp, -hard_fp_offset]!
3022 stp reg3, reg4, [sp, 16]
3023 sub sp, sp, outgoing_args_size */
3024 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3025 cfun
->machine
->frame
.final_adjust
3026 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3028 else if (!frame_pointer_needed
3029 && varargs_and_saved_regs_size
< max_push_offset
)
3031 /* Frame with large local area and outgoing arguments (this pushes the
3032 callee-saves first, followed by the locals and outgoing area):
3033 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3034 stp reg3, reg4, [sp, 16]
3035 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3036 cfun
->machine
->frame
.callee_adjust
= varargs_and_saved_regs_size
;
3037 cfun
->machine
->frame
.final_adjust
3038 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3039 cfun
->machine
->frame
.hard_fp_offset
= cfun
->machine
->frame
.callee_adjust
;
3040 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.hard_fp_offset
;
3044 /* Frame with large local area and outgoing arguments using frame pointer:
3045 sub sp, sp, hard_fp_offset
3046 stp x29, x30, [sp, 0]
3048 stp reg3, reg4, [sp, 16]
3049 sub sp, sp, outgoing_args_size */
3050 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3051 cfun
->machine
->frame
.final_adjust
3052 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
3055 cfun
->machine
->frame
.laid_out
= true;
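/* Worked example of the layout choices above (numbers are illustrative
   only): a function saving x29/x30 plus two other callee-saves, with
   8 bytes of locals and no outgoing arguments, gets saved_regs_size == 32,
   hard_fp_offset == 48 and frame_size == 48 < max_push_offset, so
   callee_adjust == 48 and the whole frame is allocated by
   "stp x29, x30, [sp, -48]!" followed by "stp reg3, reg4, [sp, 16]".
   If the same function instead needs 1024 bytes of outgoing arguments,
   the second case no longer applies (1024 + 32 >= 512), but
   hard_fp_offset (48) is still below max_push_offset, so callee_adjust
   == 48 and final_adjust == frame_size - 48 drops SP the rest of the way
   after the register saves.  */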
3058 /* Return true if the register REGNO is saved on entry to
3059 the current function. */
3062 aarch64_register_saved_on_entry (int regno
)
3064 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
3067 /* Return the next register up from REGNO up to LIMIT for the callee
3071 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
3073 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
3078 /* Push the register number REGNO of mode MODE to the stack with write-back
3079 adjusting the stack by ADJUSTMENT. */
3082 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
3083 HOST_WIDE_INT adjustment
)
3085 rtx base_rtx
= stack_pointer_rtx
;
3088 reg
= gen_rtx_REG (mode
, regno
);
3089 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
3090 plus_constant (Pmode
, base_rtx
, -adjustment
));
3091 mem
= gen_rtx_MEM (mode
, mem
);
3093 insn
= emit_move_insn (mem
, reg
);
3094 RTX_FRAME_RELATED_P (insn
) = 1;
3097 /* Generate and return an instruction to store the pair of registers
3098 REG and REG2 of mode MODE to location BASE with write-back adjusting
3099 the stack location BASE by ADJUSTMENT. */
3102 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3103 HOST_WIDE_INT adjustment
)
3108 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
3109 GEN_INT (-adjustment
),
3110 GEN_INT (UNITS_PER_WORD
- adjustment
));
3112 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
3113 GEN_INT (-adjustment
),
3114 GEN_INT (UNITS_PER_WORD
- adjustment
));
3120 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3121 stack pointer by ADJUSTMENT. */
3124 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
3127 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3129 if (regno2
== INVALID_REGNUM
)
3130 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
3132 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3133 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3135 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
3137 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
3138 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3139 RTX_FRAME_RELATED_P (insn
) = 1;
3142 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3143 adjusting it by ADJUSTMENT afterwards. */
3146 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3147 HOST_WIDE_INT adjustment
)
3152 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3153 GEN_INT (UNITS_PER_WORD
));
3155 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3156 GEN_INT (UNITS_PER_WORD
));
3162 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3163 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3167 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
3170 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3171 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3173 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
3175 if (regno2
== INVALID_REGNUM
)
3177 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
3178 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
3179 emit_move_insn (reg1
, gen_rtx_MEM (mode
, mem
));
3183 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3184 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3185 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
3190 /* Generate and return a store pair instruction of mode MODE to store
3191 register REG1 to MEM1 and register REG2 to MEM2. */
3194 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
3200 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
3203 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
                       rtx mem2)
{
  switch (mode)
    {
    case DImode:
      return gen_load_pairdi (reg1, mem1, reg2, mem2);

    case DFmode:
      return gen_load_pairdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto the stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
          || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
              && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
3246 /* Emit code to save the callee-saved registers from register number START
3247 to LIMIT to the stack at the location starting at offset START_OFFSET,
3248 skipping any write-back candidates if SKIP_WB is true. */
3251 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
3252 unsigned start
, unsigned limit
, bool skip_wb
)
3255 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3256 ? gen_frame_mem
: gen_rtx_MEM
);
3260 for (regno
= aarch64_next_callee_save (start
, limit
);
3262 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3265 HOST_WIDE_INT offset
;
3268 && (regno
== cfun
->machine
->frame
.wb_candidate1
3269 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3272 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3275 reg
= gen_rtx_REG (mode
, regno
);
3276 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3277 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3280 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3283 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3284 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3285 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3288 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3291 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3292 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3294 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
3297 /* The first part of a frame-related parallel insn is
3298 always assumed to be relevant to the frame
3299 calculations; subsequent parts, are only
3300 frame-related if explicitly marked. */
3301 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3305 insn
= emit_move_insn (mem
, reg
);
3307 RTX_FRAME_RELATED_P (insn
) = 1;
3311 /* Emit code to restore the callee registers of mode MODE from register
3312 number START up to and including LIMIT. Restore from the stack offset
3313 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3314 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3317 aarch64_restore_callee_saves (machine_mode mode
,
3318 HOST_WIDE_INT start_offset
, unsigned start
,
3319 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
3321 rtx base_rtx
= stack_pointer_rtx
;
3322 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3323 ? gen_frame_mem
: gen_rtx_MEM
);
3326 HOST_WIDE_INT offset
;
3328 for (regno
= aarch64_next_callee_save (start
, limit
);
3330 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3332 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3338 && (regno
== cfun
->machine
->frame
.wb_candidate1
3339 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3342 reg
= gen_rtx_REG (mode
, regno
);
3343 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3344 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3346 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3349 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3350 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3351 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3353 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3356 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3357 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3358 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3360 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3364 emit_move_insn (reg
, mem
);
3365 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET fits in a signed 9-bit field with no scaling
   (ldur/stur style addressing).  */

static bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                               HOST_WIDE_INT offset)
{
  return offset >= -256 && offset < 256;
}

/* Return true if OFFSET fits in an unsigned 12-bit field scaled by the
   size of MODE.  */

static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= 0
          && offset < 4096 * GET_MODE_SIZE (mode)
          && offset % GET_MODE_SIZE (mode) == 0);
}

/* Return true if OFFSET fits in a signed 7-bit field scaled by the size
   of MODE (ldp/stp style addressing).  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= -64 * GET_MODE_SIZE (mode)
          && offset < 64 * GET_MODE_SIZE (mode)
          && offset % GET_MODE_SIZE (mode) == 0);
}
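/* For DImode (GET_MODE_SIZE == 8) the three predicates above accept,
   respectively: unscaled offsets in [-256, 255], scaled unsigned offsets
   in [0, 32760] that are multiples of 8, and scaled signed offsets in
   [-512, 504] that are multiples of 8.  These ranges are worked out from
   the checks above and are shown only as an illustration.  */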
3392 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3395 aarch64_get_separate_components (void)
3397 aarch64_layout_frame ();
3399 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3400 bitmap_clear (components
);
3402 /* The registers we need saved to the frame. */
3403 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3404 if (aarch64_register_saved_on_entry (regno
))
3406 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3407 if (!frame_pointer_needed
)
3408 offset
+= cfun
->machine
->frame
.frame_size
3409 - cfun
->machine
->frame
.hard_fp_offset
;
3410 /* Check that we can access the stack slot of the register with one
3411 direct load with no adjustments needed. */
3412 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
3413 bitmap_set_bit (components
, regno
);
3416 /* Don't mess with the hard frame pointer. */
3417 if (frame_pointer_needed
)
3418 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
3420 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3421 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3422 /* If aarch64_layout_frame has chosen registers to store/restore with
3423 writeback don't interfere with them to avoid having to output explicit
3424 stack adjustment instructions. */
3425 if (reg2
!= INVALID_REGNUM
)
3426 bitmap_clear_bit (components
, reg2
);
3427 if (reg1
!= INVALID_REGNUM
)
3428 bitmap_clear_bit (components
, reg1
);
3430 bitmap_clear_bit (components
, LR_REGNUM
);
3431 bitmap_clear_bit (components
, SP_REGNUM
);
3436 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3439 aarch64_components_for_bb (basic_block bb
)
3441 bitmap in
= DF_LIVE_IN (bb
);
3442 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
3443 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
3445 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3446 bitmap_clear (components
);
3448 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3449 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3450 if ((!call_used_regs
[regno
])
3451 && (bitmap_bit_p (in
, regno
)
3452 || bitmap_bit_p (gen
, regno
)
3453 || bitmap_bit_p (kill
, regno
)))
3454 bitmap_set_bit (components
, regno
);
3459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3460 Nothing to do for aarch64. */
3463 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
3467 /* Return the next set bit in BMP from START onwards. Return the total number
3468 of bits in BMP if no set bit is found at or after START. */
3471 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
3473 unsigned int nbits
= SBITMAP_SIZE (bmp
);
3477 gcc_assert (start
< nbits
);
3478 for (unsigned int i
= start
; i
< nbits
; i
++)
3479 if (bitmap_bit_p (bmp
, i
))
3485 /* Do the work for aarch64_emit_prologue_components and
3486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3488 for these components or the epilogue sequence. That is, it determines
3489 whether we should emit stores or loads and what kind of CFA notes to attach
3490 to the insns. Otherwise the logic for the two sequences is very
3494 aarch64_process_components (sbitmap components
, bool prologue_p
)
3496 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
3497 ? HARD_FRAME_POINTER_REGNUM
3498 : STACK_POINTER_REGNUM
);
3500 unsigned last_regno
= SBITMAP_SIZE (components
);
3501 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
3502 rtx_insn
*insn
= NULL
;
3504 while (regno
!= last_regno
)
3506 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3507 so DFmode for the vector registers is enough. */
3508 machine_mode mode
= GP_REGNUM_P (regno
) ? DImode
: DFmode
;
3509 rtx reg
= gen_rtx_REG (mode
, regno
);
3510 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3511 if (!frame_pointer_needed
)
3512 offset
+= cfun
->machine
->frame
.frame_size
3513 - cfun
->machine
->frame
.hard_fp_offset
;
3514 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
3515 rtx mem
= gen_frame_mem (mode
, addr
);
3517 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
3518 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
3519 /* No more registers to handle after REGNO.
3520 Emit a single save/restore and exit. */
3521 if (regno2
== last_regno
)
3523 insn
= emit_insn (set
);
3524 RTX_FRAME_RELATED_P (insn
) = 1;
3526 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3528 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3532 HOST_WIDE_INT offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
3533 /* The next register is not of the same class or its offset is not
3534 mergeable with the current one into a pair. */
3535 if (!satisfies_constraint_Ump (mem
)
3536 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
3537 || (offset2
- cfun
->machine
->frame
.reg_offset
[regno
])
3538 != GET_MODE_SIZE (mode
))
3540 insn
= emit_insn (set
);
3541 RTX_FRAME_RELATED_P (insn
) = 1;
3543 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3545 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3551 /* REGNO2 can be saved/restored in a pair with REGNO. */
3552 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3553 if (!frame_pointer_needed
)
3554 offset2
+= cfun
->machine
->frame
.frame_size
3555 - cfun
->machine
->frame
.hard_fp_offset
;
3556 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
3557 rtx mem2
= gen_frame_mem (mode
, addr2
);
3558 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
3559 : gen_rtx_SET (reg2
, mem2
);
3562 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
3564 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3566 RTX_FRAME_RELATED_P (insn
) = 1;
3569 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
3570 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
3574 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3575 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
3578 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
3582 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3585 aarch64_emit_prologue_components (sbitmap components
)
3587 aarch64_process_components (components
, true);
3590 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3593 aarch64_emit_epilogue_components (sbitmap components
)
3595 aarch64_process_components (components
, false);
3598 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3601 aarch64_set_handled_components (sbitmap components
)
3603 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3604 if (bitmap_bit_p (components
, regno
))
3605 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
3608 /* AArch64 stack frames generated by this compiler look like:
3610 +-------------------------------+
3612 | incoming stack arguments |
3614 +-------------------------------+
3615 | | <-- incoming stack pointer (aligned)
3616 | callee-allocated save area |
3617 | for register varargs |
3619 +-------------------------------+
3620 | local variables | <-- frame_pointer_rtx
3622 +-------------------------------+
3624 +-------------------------------+ |
3625 | callee-saved registers | | frame.saved_regs_size
3626 +-------------------------------+ |
3628 +-------------------------------+ |
3629 | FP' | / <- hard_frame_pointer_rtx (aligned)
3630 +-------------------------------+
3631 | dynamic allocation |
3632 +-------------------------------+
3634 +-------------------------------+
3635 | outgoing stack arguments | <-- arg_pointer
3637 +-------------------------------+
3638 | | <-- stack_pointer_rtx (aligned)
3640 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3641 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3644 /* Generate the prologue instructions for entry into a function.
3645 Establish the stack frame by decreasing the stack pointer with a
3646 properly calculated size and, if necessary, create a frame record
3647 filled with the values of LR and previous frame pointer. The
3648 current FP is also set up if it is in use. */
3651 aarch64_expand_prologue (void)
3653 aarch64_layout_frame ();
3655 HOST_WIDE_INT frame_size
= cfun
->machine
->frame
.frame_size
;
3656 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3657 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3658 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3659 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3660 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3661 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3664 /* Sign return address for functions. */
3665 if (aarch64_return_address_signing_enabled ())
3667 insn
= emit_insn (gen_pacisp ());
3668 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3669 RTX_FRAME_RELATED_P (insn
) = 1;
3672 if (flag_stack_usage_info
)
3673 current_function_static_stack_size
= frame_size
;
3675 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
3677 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
3679 if (frame_size
> PROBE_INTERVAL
&& frame_size
> STACK_CHECK_PROTECT
)
3680 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
,
3681 frame_size
- STACK_CHECK_PROTECT
);
3683 else if (frame_size
> 0)
3684 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
, frame_size
);
3687 aarch64_sub_sp (IP0_REGNUM
, initial_adjust
, true);
3689 if (callee_adjust
!= 0)
3690 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
3692 if (frame_pointer_needed
)
3694 if (callee_adjust
== 0)
3695 aarch64_save_callee_saves (DImode
, callee_offset
, R29_REGNUM
,
3697 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
3699 GEN_INT (callee_offset
)));
3700 RTX_FRAME_RELATED_P (insn
) = 1;
3701 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
3704 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3705 callee_adjust
!= 0 || frame_pointer_needed
);
3706 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3707 callee_adjust
!= 0 || frame_pointer_needed
);
3708 aarch64_sub_sp (IP1_REGNUM
, final_adjust
, !frame_pointer_needed
);
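/* For orientation, a typical small frame with a frame pointer and two
   extra callee-saves produces a prologue along the lines of:

       stp   x29, x30, [sp, -48]!   // callee_adjust: push FP/LR, drop SP
       mov   x29, sp                // establish the frame record
       stp   x19, x20, [sp, 16]     // remaining callee-saves

   followed, when final_adjust is non-zero, by a further "sub sp, sp, N"
   for the outgoing argument area.  This is an illustrative sketch, not
   output captured from the compiler.  */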
3711 /* Return TRUE if we can use a simple_return insn.
3713 This function checks whether the callee saved stack is empty, which
3714 means no restore actions are need. The pro_and_epilogue will use
3715 this to check whether shrink-wrapping opt is feasible. */
3718 aarch64_use_return_insn_p (void)
3720 if (!reload_completed
)
3726 aarch64_layout_frame ();
3728 return cfun
->machine
->frame
.frame_size
== 0;
3731 /* Generate the epilogue instructions for returning from a function.
3732 This is almost exactly the reverse of the prolog sequence, except
3733 that we need to insert barriers to avoid scheduling loads that read
3734 from a deallocated stack, and we optimize the unwind records by
3735 emitting them all together if possible. */
3737 aarch64_expand_epilogue (bool for_sibcall
)
3739 aarch64_layout_frame ();
3741 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3742 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3743 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3744 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3745 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3746 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3750 /* We need to add memory barrier to prevent read from deallocated stack. */
3751 bool need_barrier_p
= (get_frame_size ()
3752 + cfun
->machine
->frame
.saved_varargs_size
) != 0;
3754 /* Emit a barrier to prevent loads from a deallocated stack. */
3755 if (final_adjust
> crtl
->outgoing_args_size
|| cfun
->calls_alloca
3756 || crtl
->calls_eh_return
)
3758 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3759 need_barrier_p
= false;
3762 /* Restore the stack pointer from the frame pointer if it may not
3763 be the same as the stack pointer. */
3764 if (frame_pointer_needed
&& (final_adjust
|| cfun
->calls_alloca
))
3766 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
3767 hard_frame_pointer_rtx
,
3768 GEN_INT (-callee_offset
)));
3769 /* If writeback is used when restoring callee-saves, the CFA
3770 is restored on the instruction doing the writeback. */
3771 RTX_FRAME_RELATED_P (insn
) = callee_adjust
== 0;
3774 aarch64_add_sp (IP1_REGNUM
, final_adjust
, df_regs_ever_live_p (IP1_REGNUM
));
3776 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3777 callee_adjust
!= 0, &cfi_ops
);
3778 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3779 callee_adjust
!= 0, &cfi_ops
);
3782 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3784 if (callee_adjust
!= 0)
3785 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
3787 if (callee_adjust
!= 0 || initial_adjust
> 65536)
3789 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3790 insn
= get_last_insn ();
3791 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
3792 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
3793 RTX_FRAME_RELATED_P (insn
) = 1;
3797 aarch64_add_sp (IP0_REGNUM
, initial_adjust
, df_regs_ever_live_p (IP0_REGNUM
));
3801 /* Emit delayed restores and reset the CFA to be SP. */
3802 insn
= get_last_insn ();
3803 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
3804 REG_NOTES (insn
) = cfi_ops
;
3805 RTX_FRAME_RELATED_P (insn
) = 1;
3808 /* We prefer to emit the combined return/authenticate instruction RETAA,
3809 however there are three cases in which we must instead emit an explicit
3810 authentication instruction.
3812 1) Sibcalls don't return in a normal way, so if we're about to call one
3813 we must authenticate.
3815 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3816 generating code for !TARGET_ARMV8_3 we can't use it and must
3817 explicitly authenticate.
3819 3) On an eh_return path we make extra stack adjustments to update the
3820 canonical frame address to be the exception handler's CFA. We want
3821 to authenticate using the CFA of the function which calls eh_return.
3823 if (aarch64_return_address_signing_enabled ()
3824 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
3826 insn
= emit_insn (gen_autisp ());
3827 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3828 RTX_FRAME_RELATED_P (insn
) = 1;
3831 /* Stack adjustment for exception handler. */
3832 if (crtl
->calls_eh_return
)
3834 /* We need to unwind the stack by the offset computed by
3835 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3836 to be SP; letting the CFA move during this adjustment
3837 is just as correct as retaining the CFA from the body
3838 of the function. Therefore, do nothing special. */
3839 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
3842 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
3844 emit_jump_insn (ret_rtx
);
3847 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3848 normally or return to a previous frame after unwinding.
3850 An EH return uses a single shared return sequence. The epilogue is
3851 exactly like a normal epilogue except that it has an extra input
3852 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3853 that must be applied after the frame has been destroyed. An extra label
3854 is inserted before the epilogue which initializes this register to zero,
3855 and this is the entry point for a normal return.
3857 An actual EH return updates the return address, initializes the stack
3858 adjustment and jumps directly into the epilogue (bypassing the zeroing
3859 of the adjustment). Since the return address is typically saved on the
3860 stack when a function makes a call, the saved LR must be updated outside
3863 This poses problems as the store is generated well before the epilogue,
3864 so the offset of LR is not known yet. Also optimizations will remove the
3865 store as it appears dead, even after the epilogue is generated (as the
3866 base or offset for loading LR is different in many cases).
3868 To avoid these problems this implementation forces the frame pointer
3869 in eh_return functions so that the location of LR is fixed and known early.
3870 It also marks the store volatile, so no optimization is permitted to
3871 remove the store. */
3873 aarch64_eh_return_handler_rtx (void)
3875 rtx tmp
= gen_frame_mem (Pmode
,
3876 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
3878 /* Mark the store volatile, so no optimization is permitted to remove it. */
3879 MEM_VOLATILE_P (tmp
) = true;
3883 /* Output code to add DELTA to the first argument, and then jump
3884 to FUNCTION. Used for C++ multiple inheritance. */
3886 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
3887 HOST_WIDE_INT delta
,
3888 HOST_WIDE_INT vcall_offset
,
3891 /* The this pointer is always in x0. Note that this differs from
3892 Arm where the this pointer maybe bumped to r1 if r0 is required
3893 to return a pointer to an aggregate. On AArch64 a result value
3894 pointer will be in x8. */
3895 int this_regno
= R0_REGNUM
;
3896 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
3899 reload_completed
= 1;
3900 emit_note (NOTE_INSN_PROLOGUE_END
);
3902 if (vcall_offset
== 0)
3903 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3906 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
3908 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
3909 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
3910 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
3915 if (delta
>= -256 && delta
< 256)
3916 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
3917 plus_constant (Pmode
, this_rtx
, delta
));
3919 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3922 if (Pmode
== ptr_mode
)
3923 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
3925 aarch64_emit_move (temp0
,
3926 gen_rtx_ZERO_EXTEND (Pmode
,
3927 gen_rtx_MEM (ptr_mode
, addr
)));
3929 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
3930 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
3933 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
3935 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
3938 if (Pmode
== ptr_mode
)
3939 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
3941 aarch64_emit_move (temp1
,
3942 gen_rtx_SIGN_EXTEND (Pmode
,
3943 gen_rtx_MEM (ptr_mode
, addr
)));
3945 emit_insn (gen_add2_insn (this_rtx
, temp1
));
3948 /* Generate a tail call to the target function. */
3949 if (!TREE_USED (function
))
3951 assemble_external (function
);
3952 TREE_USED (function
) = 1;
3954 funexp
= XEXP (DECL_RTL (function
), 0);
3955 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
3956 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
3957 SIBLING_CALL_P (insn
) = 1;
3959 insn
= get_insns ();
3960 shorten_branches (insn
);
3961 final_start_function (insn
, file
, 1);
3962 final (insn
, file
, 1);
3963 final_end_function ();
3965 /* Stop pretending to be a post-reload pass. */
3966 reload_completed
= 0;
3970 aarch64_tls_referenced_p (rtx x
)
3972 if (!TARGET_HAVE_TLS
)
3974 subrtx_iterator::array_type array
;
3975 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
3977 const_rtx x
= *iter
;
3978 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
3980 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3981 TLS offsets, not real symbol references. */
3982 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
3983 iter
.skip_subrtxes ();
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
        return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
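/* Illustrative values for the predicates above: 0x2a and 0xfff000 satisfy
   aarch64_uimm12_shift (a 12-bit immediate, optionally shifted by 12);
   0x2a, 0xbeef0000 and, for 64-bit modes, 0x1234ull << 32 satisfy
   aarch64_movw_imm (a single 16-bit chunk at a 16-bit-aligned position),
   while 0x12345 satisfies neither and must be synthesized with more than
   one instruction.  */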
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };

/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = (unsigned HOST_WIDE_INT) val_in;
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
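/* Illustrative bitmask immediates accepted by the check above:
   0x5555555555555555 (the 2-bit element 0b01 repeated 32 times),
   0x00ff00ff00ff00ff (a run of 8 ones in each 16-bit element) and
   0x0003ffffc0000000 (a single contiguous run of ones).  Values such as
   0, ~0 or 0x12345 are rejected.  These are worked examples only, not an
   exhaustive characterisation.  */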
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
          (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  if (aarch64_bitmask_imm (val_in, mode))
    return false;

  if (aarch64_move_imm (val_in, mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, mode);
}
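/* Worked example (an illustration, not a case taken from the testsuite):
   val_in = 0x000ffff0ffff0000 is neither a bitmask nor a move immediate.
   aarch64_and_split_imm1 gives the contiguous mask 0x000fffffffff0000 and
   aarch64_and_split_imm2 gives 0xfffffff0ffffffff, both of which are valid
   bitmask immediates, so an AND with val_in can be expanded as two ANDs:

       and  x0, x1, 0x000fffffffff0000
       and  x0, x0, 0xfffffff0ffffffff  */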
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
    return 1;
  return aarch64_bitmask_imm (val, mode);
}
4129 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
4133 if (GET_CODE (x
) == HIGH
)
4136 split_const (x
, &base
, &offset
);
4137 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
4139 if (aarch64_classify_symbol (base
, offset
)
4140 != SYMBOL_FORCE_TO_MEM
)
4143 /* Avoid generating a 64-bit relocation in ILP32; leave
4144 to aarch64_expand_mov_immediate to handle it properly. */
4145 return mode
!= ptr_mode
;
4148 return aarch64_tls_referenced_p (x
);
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
          || regno == SP_REGNUM
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p && GET_CODE (x) == SUBREG)
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
4227 /* Return true if address offset is a valid index. If it is, fill in INFO
4228 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4231 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
4232 machine_mode mode
, bool strict_p
)
4234 enum aarch64_address_type type
;
4239 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
4240 && GET_MODE (x
) == Pmode
)
4242 type
= ADDRESS_REG_REG
;
4246 /* (sign_extend:DI (reg:SI)) */
4247 else if ((GET_CODE (x
) == SIGN_EXTEND
4248 || GET_CODE (x
) == ZERO_EXTEND
)
4249 && GET_MODE (x
) == DImode
4250 && GET_MODE (XEXP (x
, 0)) == SImode
)
4252 type
= (GET_CODE (x
) == SIGN_EXTEND
)
4253 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4254 index
= XEXP (x
, 0);
4257 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4258 else if (GET_CODE (x
) == MULT
4259 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4260 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4261 && GET_MODE (XEXP (x
, 0)) == DImode
4262 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4263 && CONST_INT_P (XEXP (x
, 1)))
4265 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4266 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4267 index
= XEXP (XEXP (x
, 0), 0);
4268 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4270 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4271 else if (GET_CODE (x
) == ASHIFT
4272 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4274 && GET_MODE (XEXP (x
, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x
, 1)))
4278 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4279 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4280 index
= XEXP (XEXP (x
, 0), 0);
4281 shift
= INTVAL (XEXP (x
, 1));
4283 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4284 else if ((GET_CODE (x
) == SIGN_EXTRACT
4285 || GET_CODE (x
) == ZERO_EXTRACT
)
4286 && GET_MODE (x
) == DImode
4287 && GET_CODE (XEXP (x
, 0)) == MULT
4288 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4289 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4291 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4292 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4293 index
= XEXP (XEXP (x
, 0), 0);
4294 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4295 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4296 || INTVAL (XEXP (x
, 2)) != 0)
4299 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4300 (const_int 0xffffffff<<shift)) */
4301 else if (GET_CODE (x
) == AND
4302 && GET_MODE (x
) == DImode
4303 && GET_CODE (XEXP (x
, 0)) == MULT
4304 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4305 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4306 && CONST_INT_P (XEXP (x
, 1)))
4308 type
= ADDRESS_REG_UXTW
;
4309 index
= XEXP (XEXP (x
, 0), 0);
4310 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4311 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4314 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4315 else if ((GET_CODE (x
) == SIGN_EXTRACT
4316 || GET_CODE (x
) == ZERO_EXTRACT
)
4317 && GET_MODE (x
) == DImode
4318 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4319 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4320 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4322 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4323 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4324 index
= XEXP (XEXP (x
, 0), 0);
4325 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4326 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4327 || INTVAL (XEXP (x
, 2)) != 0)
4330 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4331 (const_int 0xffffffff<<shift)) */
4332 else if (GET_CODE (x
) == AND
4333 && GET_MODE (x
) == DImode
4334 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4335 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4336 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4337 && CONST_INT_P (XEXP (x
, 1)))
4339 type
= ADDRESS_REG_UXTW
;
4340 index
= XEXP (XEXP (x
, 0), 0);
4341 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4342 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4345 /* (mult:P (reg:P) (const_int scale)) */
4346 else if (GET_CODE (x
) == MULT
4347 && GET_MODE (x
) == Pmode
4348 && GET_MODE (XEXP (x
, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x
, 1)))
4351 type
= ADDRESS_REG_REG
;
4352 index
= XEXP (x
, 0);
4353 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4355 /* (ashift:P (reg:P) (const_int shift)) */
4356 else if (GET_CODE (x
) == ASHIFT
4357 && GET_MODE (x
) == Pmode
4358 && GET_MODE (XEXP (x
, 0)) == Pmode
4359 && CONST_INT_P (XEXP (x
, 1)))
4361 type
= ADDRESS_REG_REG
;
4362 index
= XEXP (x
, 0);
4363 shift
= INTVAL (XEXP (x
, 1));
4368 if (GET_CODE (index
) == SUBREG
)
4369 index
= SUBREG_REG (index
);
4372 (shift
> 0 && shift
<= 3
4373 && (1 << shift
) == GET_MODE_SIZE (mode
)))
4375 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
4378 info
->offset
= index
;
4379 info
->shift
= shift
;
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
             && GET_MODE_SIZE (mode) == 8);
}

/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
           && regno <= LAST_VIRTUAL_POINTER_REGISTER)
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
4410 /* Return true if X is a valid address for machine mode MODE. If it is,
4411 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4412 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4415 aarch64_classify_address (struct aarch64_address_info
*info
,
4416 rtx x
, machine_mode mode
,
4417 RTX_CODE outer_code
, bool strict_p
)
4419 enum rtx_code code
= GET_CODE (x
);
4422 /* On BE, we use load/store pair for all large int mode load/stores.
4423 TI/TFmode may also use a load/store pair. */
4424 bool load_store_pair_p
= (outer_code
== PARALLEL
4427 || (BYTES_BIG_ENDIAN
4428 && aarch64_vect_struct_mode_p (mode
)));
4430 bool allow_reg_index_p
=
4432 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
4433 && !aarch64_vect_struct_mode_p (mode
);
4435 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4437 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
4438 && (code
!= POST_INC
&& code
!= REG
))
4445 info
->type
= ADDRESS_REG_IMM
;
4447 info
->offset
= const0_rtx
;
4448 return aarch64_base_register_rtx_p (x
, strict_p
);
4456 && virt_or_elim_regno_p (REGNO (op0
))
4457 && CONST_INT_P (op1
))
4459 info
->type
= ADDRESS_REG_IMM
;
4466 if (GET_MODE_SIZE (mode
) != 0
4467 && CONST_INT_P (op1
)
4468 && aarch64_base_register_rtx_p (op0
, strict_p
))
4470 HOST_WIDE_INT offset
= INTVAL (op1
);
4472 info
->type
= ADDRESS_REG_IMM
;
4476 /* TImode and TFmode values are allowed in both pairs of X
4477 registers and individual Q registers. The available
4479 X,X: 7-bit signed scaled offset
4480 Q: 9-bit signed offset
4481 We conservatively require an offset representable in either mode.
4482 When performing the check for pairs of X registers i.e. LDP/STP
4483 pass down DImode since that is the natural size of the LDP/STP
4484 instruction memory accesses. */
4485 if (mode
== TImode
|| mode
== TFmode
)
4486 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
4487 && (offset_9bit_signed_unscaled_p (mode
, offset
)
4488 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
4490 /* A 7bit offset check because OImode will emit a ldp/stp
4491 instruction (only big endian will get here).
4492 For ldp/stp instructions, the offset is scaled for the size of a
4493 single element of the pair. */
4495 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
4497 /* Three 9/12 bit offsets checks because CImode will emit three
4498 ldr/str instructions (only big endian will get here). */
4500 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4501 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
4502 || offset_12bit_unsigned_scaled_p (V16QImode
,
4505 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4506 instructions (only big endian will get here). */
4508 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4509 && aarch64_offset_7bit_signed_scaled_p (TImode
,
4512 if (load_store_pair_p
)
4513 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4514 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4516 return (offset_9bit_signed_unscaled_p (mode
, offset
)
4517 || offset_12bit_unsigned_scaled_p (mode
, offset
));
4520 if (allow_reg_index_p
)
4522 /* Look for base + (scaled/extended) index register. */
4523 if (aarch64_base_register_rtx_p (op0
, strict_p
)
4524 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
4529 if (aarch64_base_register_rtx_p (op1
, strict_p
)
4530 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
4543 info
->type
= ADDRESS_REG_WB
;
4544 info
->base
= XEXP (x
, 0);
4545 info
->offset
= NULL_RTX
;
4546 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
4550 info
->type
= ADDRESS_REG_WB
;
4551 info
->base
= XEXP (x
, 0);
4552 if (GET_CODE (XEXP (x
, 1)) == PLUS
4553 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
4554 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
4555 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4557 HOST_WIDE_INT offset
;
4558 info
->offset
= XEXP (XEXP (x
, 1), 1);
4559 offset
= INTVAL (info
->offset
);
4561 /* TImode and TFmode values are allowed in both pairs of X
4562 registers and individual Q registers. The available
4564 X,X: 7-bit signed scaled offset
4565 Q: 9-bit signed offset
4566 We conservatively require an offset representable in either mode.
4568 if (mode
== TImode
|| mode
== TFmode
)
4569 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
4570 && offset_9bit_signed_unscaled_p (mode
, offset
));
4572 if (load_store_pair_p
)
4573 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4574 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4576 return offset_9bit_signed_unscaled_p (mode
, offset
);
4583 /* load literal: pc-relative constant pool entry. Only supported
4584 for SI mode or larger. */
4585 info
->type
= ADDRESS_SYMBOLIC
;
4587 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
4591 split_const (x
, &sym
, &addend
);
4592 return ((GET_CODE (sym
) == LABEL_REF
4593 || (GET_CODE (sym
) == SYMBOL_REF
4594 && CONSTANT_POOL_ADDRESS_P (sym
)
4595 && aarch64_pcrelative_literal_loads
)));
4600 info
->type
= ADDRESS_LO_SUM
;
4601 info
->base
= XEXP (x
, 0);
4602 info
->offset
= XEXP (x
, 1);
4603 if (allow_reg_index_p
4604 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4607 split_const (info
->offset
, &sym
, &offs
);
4608 if (GET_CODE (sym
) == SYMBOL_REF
4609 && (aarch64_classify_symbol (sym
, offs
) == SYMBOL_SMALL_ABSOLUTE
))
4611 /* The symbol and offset must be aligned to the access size. */
4613 unsigned int ref_size
;
4615 if (CONSTANT_POOL_ADDRESS_P (sym
))
4616 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
4617 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
4619 tree exp
= SYMBOL_REF_DECL (sym
);
4620 align
= TYPE_ALIGN (TREE_TYPE (exp
));
4621 align
= CONSTANT_ALIGNMENT (exp
, align
);
4623 else if (SYMBOL_REF_DECL (sym
))
4624 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
4625 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
4626 && SYMBOL_REF_BLOCK (sym
) != NULL
)
4627 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
4629 align
= BITS_PER_UNIT
;
4631 ref_size
= GET_MODE_SIZE (mode
);
4633 ref_size
= GET_MODE_SIZE (DImode
);
4635 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
4636 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}

bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, offset);
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
}

/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
   pair operation.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x,
                              RTX_CODE outer_code, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
}
/* Split an out-of-range address displacement into a base and offset.
   Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
   to increase opportunities for sharing the base address of different sizes.
   For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
static bool
aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
{
  HOST_WIDE_INT offset = INTVAL (*disp);
  HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);

  if (mode == TImode || mode == TFmode
      || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
    base = (offset + 0x100) & ~0x1ff;

  *off = GEN_INT (base);
  *disp = GEN_INT (offset - base);
  return true;
}
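/* Illustrative example (hypothetical value, hand-checked): an SImode access at
   offset 0x12344 is out of range for a single LDR.  The aligned case above
   computes base = 0x12344 & ~0x3ffc = 0x10000 and leaves a displacement of
   0x2344, which fits the 12-bit scaled unsigned offset form, so the address
   can be formed as an ADD of #0x10000 followed by LDR w0, [xN, #0x2344]
   (register numbers hypothetical).  */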
/* Return TRUE if rtx X is immediate constant 0.0  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}

/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
         || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}

/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
4805 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
4807 /* All floating point compares return CCFP if it is an equality
4808 comparison, and CCFPE otherwise. */
4809 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
4836 /* Equality comparisons of short modes against zero can be performed
4837 using the TST instruction with the appropriate bitmask. */
4838 if (y
== const0_rtx
&& REG_P (x
)
4839 && (code
== EQ
|| code
== NE
)
4840 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
4843 /* Similarly, comparisons of zero_extends from shorter modes can
4844 be performed using an ANDS with an immediate mask. */
4845 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
4846 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4847 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
4848 && (code
== EQ
|| code
== NE
))
4851 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4853 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
4854 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
4855 || GET_CODE (x
) == NEG
4856 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
4857 && CONST_INT_P (XEXP (x
, 2)))))
4860 /* A compare with a shifted operand. Because of canonicalization,
4861 the comparison will have to be swapped when we emit the assembly
4863 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4864 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
4865 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
4866 || GET_CODE (x
) == LSHIFTRT
4867 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
4870 /* Similarly for a negated operand, but we can only do this for
4872 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4873 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4874 && (code
== EQ
|| code
== NE
)
4875 && GET_CODE (x
) == NEG
)
4878 /* A test for unsigned overflow. */
4879 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
4881 && GET_CODE (x
) == PLUS
4882 && GET_CODE (y
) == ZERO_EXTEND
)
4885 /* For everything else, return CCmode. */
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
4904 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
4912 case GE
: return AARCH64_GE
;
4913 case GT
: return AARCH64_GT
;
4914 case LE
: return AARCH64_LS
;
4915 case LT
: return AARCH64_MI
;
4916 case NE
: return AARCH64_NE
;
4917 case EQ
: return AARCH64_EQ
;
4918 case ORDERED
: return AARCH64_VC
;
4919 case UNORDERED
: return AARCH64_VS
;
4920 case UNLT
: return AARCH64_LT
;
4921 case UNLE
: return AARCH64_LE
;
4922 case UNGT
: return AARCH64_HI
;
4923 case UNGE
: return AARCH64_PL
;
4931 case NE
: return AARCH64_NE
;
4932 case EQ
: return AARCH64_EQ
;
4933 case GE
: return AARCH64_GE
;
4934 case GT
: return AARCH64_GT
;
4935 case LE
: return AARCH64_LE
;
4936 case LT
: return AARCH64_LT
;
4937 case GEU
: return AARCH64_CS
;
4938 case GTU
: return AARCH64_HI
;
4939 case LEU
: return AARCH64_LS
;
4940 case LTU
: return AARCH64_CC
;
4948 case NE
: return AARCH64_NE
;
4949 case EQ
: return AARCH64_EQ
;
4950 case GE
: return AARCH64_LE
;
4951 case GT
: return AARCH64_LT
;
4952 case LE
: return AARCH64_GE
;
4953 case LT
: return AARCH64_GT
;
4954 case GEU
: return AARCH64_LS
;
4955 case GTU
: return AARCH64_CC
;
4956 case LEU
: return AARCH64_CS
;
4957 case LTU
: return AARCH64_HI
;
4965 case NE
: return AARCH64_NE
;
4966 case EQ
: return AARCH64_EQ
;
4967 case GE
: return AARCH64_PL
;
4968 case LT
: return AARCH64_MI
;
4976 case NE
: return AARCH64_NE
;
4977 case EQ
: return AARCH64_EQ
;
4985 case NE
: return AARCH64_CS
;
4986 case EQ
: return AARCH64_CC
;
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  HOST_WIDE_INT firstval;
  int count, i;

  if (GET_CODE (x) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
    return false;

  firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
  if (firstval < minval || firstval > maxval)
    return false;

  count = CONST_VECTOR_NUNITS (x);
  for (i = 1; i < count; i++)
    if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
      return false;

  return true;
}

bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,            /* EQ, Z == 1.  */
  AARCH64_CC_Z, /* NE, Z == 0.  */
  0,            /* CS, C == 1.  */
  AARCH64_CC_C, /* CC, C == 0.  */
  0,            /* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,            /* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,            /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V, /* GE, N == V.  */
  0,            /* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,            /* LE, !(Z == 0 && N == V).  */
  0,            /* AL, Any.  */
  0             /* NV, Any.  */
};
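/* For example (illustrative, as I read the table): when the comparison that
   guards a CCMP fails, the instruction writes the immediate NZCV value above
   so that the condition being built evaluates to false.  For GE the entry is
   AARCH64_CC_V, i.e. NZCV = 0b0001: N = 0 and V = 1, hence N != V and a
   following B.GE is not taken.  */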
5057 aarch64_print_operand (FILE *f
, rtx x
, int code
)
5061 /* An integer or symbol address without a preceding # sign. */
5063 switch (GET_CODE (x
))
5066 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
5070 output_addr_const (f
, x
);
5074 if (GET_CODE (XEXP (x
, 0)) == PLUS
5075 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
5077 output_addr_const (f
, x
);
5083 output_operand_lossage ("Unsupported operand for code '%c'", code
);
5088 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
5092 if (!CONST_INT_P (x
)
5093 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
5095 output_operand_lossage ("invalid operand for '%%%c'", code
);
5111 output_operand_lossage ("invalid operand for '%%%c'", code
);
5121 /* Print N such that 2^N == X. */
5122 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
5124 output_operand_lossage ("invalid operand for '%%%c'", code
);
5128 asm_fprintf (f
, "%d", n
);
5133 /* Print the number of non-zero bits in X (a const_int). */
5134 if (!CONST_INT_P (x
))
5136 output_operand_lossage ("invalid operand for '%%%c'", code
);
5140 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
5144 /* Print the higher numbered register of a pair (TImode) of regs. */
5145 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
5147 output_operand_lossage ("invalid operand for '%%%c'", code
);
5151 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
5158 /* Print a condition (eq, ne, etc) or its inverse. */
5160 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5161 if (x
== const_true_rtx
)
5168 if (!COMPARISON_P (x
))
5170 output_operand_lossage ("invalid operand for '%%%c'", code
);
5174 cond_code
= aarch64_get_condition_code (x
);
5175 gcc_assert (cond_code
>= 0);
5177 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
5178 fputs (aarch64_condition_codes
[cond_code
], f
);
5187 /* Print a scalar FP/SIMD register name. */
5188 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5190 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5193 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
5200 /* Print the first FP/SIMD register name in a list. */
5201 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5203 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5206 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
5210 /* Print a scalar FP/SIMD register name + 1. */
5211 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5213 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5216 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
5220 /* Print bottom 16 bits of integer constant in hex. */
5221 if (!CONST_INT_P (x
))
5223 output_operand_lossage ("invalid operand for '%%%c'", code
);
5226 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
5231 /* Print a general register name or the zero register (32-bit or
5234 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
5236 asm_fprintf (f
, "%czr", code
);
5240 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
5242 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
5246 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
5248 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
5255 /* Print a normal operand, if it's a general register, then we
5259 output_operand_lossage ("missing operand");
5263 switch (GET_CODE (x
))
5266 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
5270 output_address (GET_MODE (x
), XEXP (x
, 0));
5271 /* Check all memory references are Pmode - even with ILP32. */
5272 gcc_assert (GET_MODE (XEXP (x
, 0)) == Pmode
);
5278 output_addr_const (asm_out_file
, x
);
5282 asm_fprintf (f
, "%wd", INTVAL (x
));
5286 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
5289 aarch64_const_vec_all_same_in_range_p (x
,
5291 HOST_WIDE_INT_MAX
));
5292 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
5294 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
5303 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5304 be getting CONST_DOUBLEs holding integers. */
5305 gcc_assert (GET_MODE (x
) != VOIDmode
);
5306 if (aarch64_float_const_zero_rtx_p (x
))
5311 else if (aarch64_float_const_representable_p (x
))
5314 char float_buf
[buf_size
] = {'\0'};
5315 real_to_decimal_for_mode (float_buf
,
5316 CONST_DOUBLE_REAL_VALUE (x
),
5319 asm_fprintf (asm_out_file
, "%s", float_buf
);
5323 output_operand_lossage ("invalid constant");
5326 output_operand_lossage ("invalid operand");
5332 if (GET_CODE (x
) == HIGH
)
5335 switch (aarch64_classify_symbolic_expression (x
))
5337 case SYMBOL_SMALL_GOT_4G
:
5338 asm_fprintf (asm_out_file
, ":got:");
5341 case SYMBOL_SMALL_TLSGD
:
5342 asm_fprintf (asm_out_file
, ":tlsgd:");
5345 case SYMBOL_SMALL_TLSDESC
:
5346 asm_fprintf (asm_out_file
, ":tlsdesc:");
5349 case SYMBOL_SMALL_TLSIE
:
5350 asm_fprintf (asm_out_file
, ":gottprel:");
5353 case SYMBOL_TLSLE24
:
5354 asm_fprintf (asm_out_file
, ":tprel:");
5357 case SYMBOL_TINY_GOT
:
5364 output_addr_const (asm_out_file
, x
);
5368 switch (aarch64_classify_symbolic_expression (x
))
5370 case SYMBOL_SMALL_GOT_4G
:
5371 asm_fprintf (asm_out_file
, ":lo12:");
5374 case SYMBOL_SMALL_TLSGD
:
5375 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
5378 case SYMBOL_SMALL_TLSDESC
:
5379 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
5382 case SYMBOL_SMALL_TLSIE
:
5383 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
5386 case SYMBOL_TLSLE12
:
5387 asm_fprintf (asm_out_file
, ":tprel_lo12:");
5390 case SYMBOL_TLSLE24
:
5391 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
5394 case SYMBOL_TINY_GOT
:
5395 asm_fprintf (asm_out_file
, ":got:");
5398 case SYMBOL_TINY_TLSIE
:
5399 asm_fprintf (asm_out_file
, ":gottprel:");
5405 output_addr_const (asm_out_file
, x
);
5410 switch (aarch64_classify_symbolic_expression (x
))
5412 case SYMBOL_TLSLE24
:
5413 asm_fprintf (asm_out_file
, ":tprel_hi12:");
5418 output_addr_const (asm_out_file
, x
);
5423 HOST_WIDE_INT cond_code
;
5426 if (!CONST_INT_P (x
))
5428 output_operand_lossage ("invalid operand for '%%%c'", code
);
5432 cond_code
= INTVAL (x
);
5433 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
5434 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
5439 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
5445 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
5447 struct aarch64_address_info addr
;
5449 if (aarch64_classify_address (&addr
, x
, mode
, MEM
, true))
5452 case ADDRESS_REG_IMM
:
5453 if (addr
.offset
== const0_rtx
)
5454 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
5456 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
5457 INTVAL (addr
.offset
));
5460 case ADDRESS_REG_REG
:
5461 if (addr
.shift
== 0)
5462 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
5463 reg_names
[REGNO (addr
.offset
)]);
5465 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
5466 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
5469 case ADDRESS_REG_UXTW
:
5470 if (addr
.shift
== 0)
5471 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
5472 REGNO (addr
.offset
) - R0_REGNUM
);
5474 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
5475 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5478 case ADDRESS_REG_SXTW
:
5479 if (addr
.shift
== 0)
5480 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
5481 REGNO (addr
.offset
) - R0_REGNUM
);
5483 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
5484 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5487 case ADDRESS_REG_WB
:
5488 switch (GET_CODE (x
))
5491 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
5492 GET_MODE_SIZE (mode
));
5495 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
5496 GET_MODE_SIZE (mode
));
5499 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
5500 GET_MODE_SIZE (mode
));
5503 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
5504 GET_MODE_SIZE (mode
));
5507 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
5508 INTVAL (addr
.offset
));
5511 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
5512 INTVAL (addr
.offset
));
5519 case ADDRESS_LO_SUM
:
5520 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
5521 output_addr_const (f
, addr
.offset
);
5522 asm_fprintf (f
, "]");
5525 case ADDRESS_SYMBOLIC
:
5529 output_addr_const (f
, x
);
int
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (GET_CODE (x) == LABEL_REF)
    return 1;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return 0;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
        {
          int j;

          for (j = XVECLEN (x, i) - 1; j >= 0; j--)
            if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
              return 1;
        }
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
        return 1;
    }

  return 0;
}
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;

  return NO_REGS;
}
5587 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
5589 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5590 where mask is selected by alignment and size of the offset.
5591 We try to pick as large a range for the offset as possible to
5592 maximize the chance of a CSE. However, for aligned addresses
5593 we limit the range to 4k so that structures with different sized
5594 elements are likely to use the same base. We need to be careful
5595 not to split a CONST for some forms of address expression, otherwise
5596 it will generate sub-optimal code. */
5598 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
5600 rtx base
= XEXP (x
, 0);
5601 rtx offset_rtx
= XEXP (x
, 1);
5602 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
5604 if (GET_CODE (base
) == PLUS
)
5606 rtx op0
= XEXP (base
, 0);
5607 rtx op1
= XEXP (base
, 1);
5609 /* Force any scaling into a temp for CSE. */
5610 op0
= force_reg (Pmode
, op0
);
5611 op1
= force_reg (Pmode
, op1
);
5613 /* Let the pointer register be in op0. */
5614 if (REG_POINTER (op1
))
5615 std::swap (op0
, op1
);
5617 /* If the pointer is virtual or frame related, then we know that
5618 virtual register instantiation or register elimination is going
5619 to apply a second constant. We want the two constants folded
5620 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5621 if (virt_or_elim_regno_p (REGNO (op0
)))
5623 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
5624 NULL_RTX
, true, OPTAB_DIRECT
);
5625 return gen_rtx_PLUS (Pmode
, base
, op1
);
5628 /* Otherwise, in order to encourage CSE (and thence loop strength
5629 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5630 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
5631 NULL_RTX
, true, OPTAB_DIRECT
);
5632 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
5635 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5636 HOST_WIDE_INT base_offset
;
5637 if (GET_MODE_SIZE (mode
) > 16)
5638 base_offset
= (offset
+ 0x400) & ~0x7f0;
5639 /* For offsets aren't a multiple of the access size, the limit is
5641 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
5643 base_offset
= (offset
+ 0x100) & ~0x1ff;
5645 /* BLKmode typically uses LDP of X-registers. */
5646 if (mode
== BLKmode
)
5647 base_offset
= (offset
+ 512) & ~0x3ff;
5649 /* Small negative offsets are supported. */
5650 else if (IN_RANGE (offset
, -256, 0))
5652 else if (mode
== TImode
|| mode
== TFmode
)
5653 base_offset
= (offset
+ 0x100) & ~0x1ff;
5654 /* Use 12-bit offset by access size. */
5656 base_offset
= offset
& (~0xfff * GET_MODE_SIZE (mode
));
5658 if (base_offset
!= 0)
5660 base
= plus_constant (Pmode
, base
, base_offset
);
5661 base
= force_operand (base
, NULL_RTX
);
5662 return plus_constant (Pmode
, base
, offset
- base_offset
);
5669 /* Return the reload icode required for a constant pool in mode. */
5670 static enum insn_code
5671 aarch64_constant_pool_reload_icode (machine_mode mode
)
5676 return CODE_FOR_aarch64_reload_movcpsfdi
;
5679 return CODE_FOR_aarch64_reload_movcpdfdi
;
5682 return CODE_FOR_aarch64_reload_movcptfdi
;
5685 return CODE_FOR_aarch64_reload_movcpv8qidi
;
5688 return CODE_FOR_aarch64_reload_movcpv16qidi
;
5691 return CODE_FOR_aarch64_reload_movcpv4hidi
;
5694 return CODE_FOR_aarch64_reload_movcpv8hidi
;
5697 return CODE_FOR_aarch64_reload_movcpv2sidi
;
5700 return CODE_FOR_aarch64_reload_movcpv4sidi
;
5703 return CODE_FOR_aarch64_reload_movcpv2didi
;
5706 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
5715 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
5718 secondary_reload_info
*sri
)
5721 /* If we have to disable direct literal pool loads and stores because the
5722 function is too big, then we need a scratch register. */
5723 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
5724 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
5725 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
5726 && !aarch64_pcrelative_literal_loads
)
5728 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
5732 /* Without the TARGET_SIMD instructions we cannot move a Q register
5733 to a Q register directly. We need a scratch. */
5734 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
5735 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
5736 && reg_class_subset_p (rclass
, FP_REGS
))
5739 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
5740 else if (mode
== TImode
)
5741 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
5745 /* A TFmode or TImode memory access should be handled via an FP_REGS
5746 because AArch64 has richer addressing modes for LDR/STR instructions
5747 than LDP/STP instructions. */
5748 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
5749 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
5752 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
5753 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from, const int to)
{
  /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
     HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */

  if (frame_pointer_needed)
    {
      if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
        return true;
      if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
        return false;
      if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
          && !cfun->calls_alloca)
        return true;
      if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
        return true;

      return false;
    }
  else
    {
      /* If we decided that we didn't need a leaf frame pointer but then used
         LR in the function, then we'll want a frame pointer after all, so
         prevent this elimination to ensure a frame pointer is used.  */
      if (to == STACK_POINTER_REGNUM
          && flag_omit_leaf_frame_pointer
          && df_regs_ever_live_p (LR_REGNUM))
        return false;
    }

  return true;
}
HOST_WIDE_INT
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset
               - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.frame_size
               - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
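/* Worked example with purely illustrative numbers: if aarch64_layout_frame
   computed frame_size = 96, hard_fp_offset = 80 and locals_offset = 16, then
   ARG_POINTER -> HARD_FRAME_POINTER eliminates to 80,
   FRAME_POINTER -> HARD_FRAME_POINTER to 80 - 16 = 64,
   FRAME_POINTER -> STACK_POINTER to 96 - 16 = 80, and
   ARG_POINTER -> STACK_POINTER to the full frame_size of 96.  */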
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}

static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
                   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
                     LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
                     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
                     ptr_mode);
}
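/* The resulting LP64 trampoline layout (sketch, assuming POINTER_BYTES == 8):
   bytes 0-15 hold the code emitted by aarch64_asm_trampoline_template, the
   target function address is stored at offset 16 and the static chain value
   at offset 24, and __clear_cache is invoked over the whole TRAMPOLINE_SIZE
   region so the copied code becomes visible to the instruction stream.  */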
5875 static unsigned char
5876 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
5880 case CALLER_SAVE_REGS
:
5887 aarch64_vector_mode_p (mode
)
5888 ? (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
5889 : (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
5903 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
5905 if (regclass
== POINTER_REGS
)
5906 return GENERAL_REGS
;
5908 if (regclass
== STACK_REG
)
5911 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
5917 /* If it's an integer immediate that MOVI can't handle, then
5918 FP_REGS is not an option, so we return NO_REGS instead. */
5919 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
5920 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
5923 /* Register eliminiation can result in a request for
5924 SP+constant->FP_REGS. We cannot support such operations which
5925 use SP as source and an FP_REG as destination, so reject out
5927 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
5929 rtx lhs
= XEXP (x
, 0);
5931 /* Look through a possible SUBREG introduced by ILP32. */
5932 if (GET_CODE (lhs
) == SUBREG
)
5933 lhs
= SUBREG_REG (lhs
);
5935 gcc_assert (REG_P (lhs
));
5936 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
static void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}

static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
         would be enough, the compiler might not know that.  To avoid
         -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}

static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
         would be enough, the compiler might not know that.  To avoid
         -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
5991 aarch64_output_casesi (rtx
*operands
)
5995 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5997 static const char *const patterns
[4][2] =
6000 "ldrb\t%w3, [%0,%w1,uxtw]",
6001 "add\t%3, %4, %w3, sxtb #2"
6004 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6005 "add\t%3, %4, %w3, sxth #2"
6008 "ldr\t%w3, [%0,%w1,uxtw #2]",
6009 "add\t%3, %4, %w3, sxtw #2"
6011 /* We assume that DImode is only generated when not optimizing and
6012 that we don't really need 64-bit address offsets. That would
6013 imply an object file with 8GB of code in a single function! */
6015 "ldr\t%w3, [%0,%w1,uxtw #2]",
6016 "add\t%3, %4, %w3, sxtw #2"
6020 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
6022 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
6024 gcc_assert (index
>= 0 && index
<= 3);
6026 /* Need to implement table size reduction, by chaning the code below. */
6027 output_asm_insn (patterns
[index
][0], operands
);
6028 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
6029 snprintf (buf
, sizeof (buf
),
6030 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
6031 output_asm_insn (buf
, operands
);
6032 output_asm_insn (patterns
[index
][1], operands
);
6033 output_asm_insn ("br\t%3", operands
);
6034 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
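/* Illustrative example (hand-checked): for the operand
   (and (ashift (reg) (const_int 2)) (const_int 0x3fc)) we have shift = 2 and
   mask = 0x3fc; with size = 8 the loop finds 0xff << 2 == 0x3fc, so the
   function returns 8 and the operand can use a UXTB #2 extended-register
   form, e.g. "add x0, x1, w2, uxtb #2" (register numbers hypothetical).  */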
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* Fixme:: In an ideal world this would work similar
     to the logic in aarch64_select_rtx_section but this
     breaks bootstrap in gcc go.  For now we workaround
     this by returning false here.  */
  return false;
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
                            rtx x,
                            unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
                                  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
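/* Illustrative example (hand-checked): given
   (plus (mult (reg A) (const_int 4)) (reg B)), stripping the MULT operand
   yields (reg A) because 4 is a power of two below 2^64, so the caller can
   cost the expression as an add-with-shift, e.g. "add x0, xB, xA, lsl #2"
   (register names hypothetical).  */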
6134 /* Helper function for rtx cost calculation. Strip an extend
6135 expression from X. Returns the inner operand if successful, or the
6136 original expression on failure. We deal with a number of possible
6137 canonicalization variations here. If STRIP_SHIFT is true, then
6138 we can strip off a shift also. */
6140 aarch64_strip_extend (rtx x
, bool strip_shift
)
6144 /* Zero and sign extraction of a widened value. */
6145 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
6146 && XEXP (op
, 2) == const0_rtx
6147 && GET_CODE (XEXP (op
, 0)) == MULT
6148 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
6150 return XEXP (XEXP (op
, 0), 0);
6152 /* It can also be represented (for zero-extend) as an AND with an
6154 if (GET_CODE (op
) == AND
6155 && GET_CODE (XEXP (op
, 0)) == MULT
6156 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
6157 && CONST_INT_P (XEXP (op
, 1))
6158 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
6159 INTVAL (XEXP (op
, 1))) != 0)
6160 return XEXP (XEXP (op
, 0), 0);
6162 /* Now handle extended register, as this may also have an optional
6163 left shift by 1..4. */
6165 && GET_CODE (op
) == ASHIFT
6166 && CONST_INT_P (XEXP (op
, 1))
6167 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
6170 if (GET_CODE (op
) == ZERO_EXTEND
6171 || GET_CODE (op
) == SIGN_EXTEND
)
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */

static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}

/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
        & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
6222 /* Helper function for rtx cost calculation. Calculate the cost of
6223 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6224 Return the calculated cost of the expression, recursing manually in to
6225 operands where needed. */
6228 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
6231 const struct cpu_cost_table
*extra_cost
6232 = aarch64_tune_params
.insn_extra_cost
;
6234 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
6235 machine_mode mode
= GET_MODE (x
);
6237 gcc_checking_assert (code
== MULT
);
6242 if (VECTOR_MODE_P (mode
))
6243 mode
= GET_MODE_INNER (mode
);
6245 /* Integer multiply/fma. */
6246 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6248 /* The multiply will be canonicalized as a shift, cost it as such. */
6249 if (aarch64_shift_p (GET_CODE (x
))
6250 || (CONST_INT_P (op1
)
6251 && exact_log2 (INTVAL (op1
)) > 0))
6253 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
6254 || GET_CODE (op0
) == SIGN_EXTEND
;
6259 /* If the shift is considered cheap,
6260 then don't add any cost. */
6261 if (aarch64_cheap_mult_shift_p (x
))
6263 else if (REG_P (op1
))
6264 /* ARITH + shift-by-register. */
6265 cost
+= extra_cost
->alu
.arith_shift_reg
;
6267 /* ARITH + extended register. We don't have a cost field
6268 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6269 cost
+= extra_cost
->alu
.extend_arith
;
6271 /* ARITH + shift-by-immediate. */
6272 cost
+= extra_cost
->alu
.arith_shift
;
6275 /* LSL (immediate). */
6276 cost
+= extra_cost
->alu
.shift
;
6279 /* Strip extends as we will have costed them in the case above. */
6281 op0
= aarch64_strip_extend (op0
, true);
6283 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
6288 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6289 compound and let the below cases handle it. After all, MNEG is a
6290 special-case alias of MSUB. */
6291 if (GET_CODE (op0
) == NEG
)
6293 op0
= XEXP (op0
, 0);
6297 /* Integer multiplies or FMAs have zero/sign extending variants. */
6298 if ((GET_CODE (op0
) == ZERO_EXTEND
6299 && GET_CODE (op1
) == ZERO_EXTEND
)
6300 || (GET_CODE (op0
) == SIGN_EXTEND
6301 && GET_CODE (op1
) == SIGN_EXTEND
))
6303 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
6304 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
6309 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6310 cost
+= extra_cost
->mult
[0].extend_add
;
6312 /* MUL/SMULL/UMULL. */
6313 cost
+= extra_cost
->mult
[0].extend
;
6319 /* This is either an integer multiply or a MADD. In both cases
6320 we want to recurse and cost the operands. */
6321 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6322 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6328 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
6331 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
6340 /* Floating-point FMA/FMUL can also support negations of the
6341 operands, unless the rounding mode is upward or downward in
6342 which case FNMUL is different than FMUL with operand negation. */
6343 bool neg0
= GET_CODE (op0
) == NEG
;
6344 bool neg1
= GET_CODE (op1
) == NEG
;
6345 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
6348 op0
= XEXP (op0
, 0);
6350 op1
= XEXP (op1
, 0);
6354 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6355 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6358 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
6361 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6362 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6368 aarch64_address_cost (rtx x
,
6370 addr_space_t as ATTRIBUTE_UNUSED
,
6373 enum rtx_code c
= GET_CODE (x
);
6374 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
6375 struct aarch64_address_info info
;
6379 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
6381 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
6383 /* This is a CONST or SYMBOL ref which will be split
6384 in a different way depending on the code model in use.
6385 Cost it through the generic infrastructure. */
6386 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
6387 /* Divide through by the cost of one instruction to
6388 bring it to the same units as the address costs. */
6389 cost_symbol_ref
/= COSTS_N_INSNS (1);
6390 /* The cost is then the cost of preparing the address,
6391 followed by an immediate (possibly 0) offset. */
6392 return cost_symbol_ref
+ addr_cost
->imm_offset
;
6396 /* This is most likely a jump table from a case
6398 return addr_cost
->register_offset
;
6404 case ADDRESS_LO_SUM
:
6405 case ADDRESS_SYMBOLIC
:
6406 case ADDRESS_REG_IMM
:
6407 cost
+= addr_cost
->imm_offset
;
6410 case ADDRESS_REG_WB
:
6411 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
6412 cost
+= addr_cost
->pre_modify
;
6413 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
6414 cost
+= addr_cost
->post_modify
;
6420 case ADDRESS_REG_REG
:
6421 cost
+= addr_cost
->register_offset
;
6424 case ADDRESS_REG_SXTW
:
6425 cost
+= addr_cost
->register_sextend
;
6428 case ADDRESS_REG_UXTW
:
6429 cost
+= addr_cost
->register_zextend
;
6439 /* For the sake of calculating the cost of the shifted register
6440 component, we can treat same sized modes in the same way. */
6441 switch (GET_MODE_BITSIZE (mode
))
6444 cost
+= addr_cost
->addr_scale_costs
.hi
;
6448 cost
+= addr_cost
->addr_scale_costs
.si
;
6452 cost
+= addr_cost
->addr_scale_costs
.di
;
6455 /* We can't tell, or this is a 128-bit vector. */
6457 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

static int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
6482 /* Return true if the RTX X in mode MODE is a zero or sign extract
6483 usable in an ADD or SUB (extended register) instruction. */
6485 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
6487 /* Catch add with a sign extract.
6488 This is add_<optab><mode>_multp2. */
6489 if (GET_CODE (x
) == SIGN_EXTRACT
6490 || GET_CODE (x
) == ZERO_EXTRACT
)
6492 rtx op0
= XEXP (x
, 0);
6493 rtx op1
= XEXP (x
, 1);
6494 rtx op2
= XEXP (x
, 2);
6496 if (GET_CODE (op0
) == MULT
6497 && CONST_INT_P (op1
)
6498 && op2
== const0_rtx
6499 && CONST_INT_P (XEXP (op0
, 1))
6500 && aarch64_is_extend_from_extract (mode
,
6507 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6509 else if (GET_CODE (x
) == SIGN_EXTEND
6510 || GET_CODE (x
) == ZERO_EXTEND
)
6511 return REG_P (XEXP (x
, 0));
6517 aarch64_frint_unspec_p (unsigned int u
)
6535 /* Return true iff X is an rtx that will match an extr instruction
6536 i.e. as described in the *extr<mode>5_insn family of patterns.
6537 OP0 and OP1 will be set to the operands of the shifts involved
6538 on success and will be NULL_RTX otherwise. */
6541 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
6544 machine_mode mode
= GET_MODE (x
);
6546 *res_op0
= NULL_RTX
;
6547 *res_op1
= NULL_RTX
;
6549 if (GET_CODE (x
) != IOR
)
6555 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
6556 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
6558 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6559 if (GET_CODE (op1
) == ASHIFT
)
6560 std::swap (op0
, op1
);
6562 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
6565 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
6566 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
6568 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
6569 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
6571 *res_op0
= XEXP (op0
, 0);
6572 *res_op1
= XEXP (op1
, 0);
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	return true;
      else if (cmpcode == NE || cmpcode == EQ)
	{
	  if (comparator == const0_rtx)
	    {
	      /* TBZ/TBNZ/CBZ/CBNZ.  */
	      if (GET_CODE (inner) == ZERO_EXTRACT)
		/* TBZ/TBNZ.  */
		*cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				   ZERO_EXTRACT, 0, speed);
	      else
		/* CBZ/CBNZ.  */
		*cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

	      return true;
	    }
	}
      else if (cmpcode == LT || cmpcode == GE)
	{
	  /* TBZ/TBNZ.  */
	  if (comparator == const0_rtx)
	    return true;
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
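
/* Illustrative sketch (not from the original sources):
   (if_then_else (eq (reg) (const_int 0)) (label_ref) (pc)) is costed on the
   register alone because it maps onto a single CBZ, while for a select such
   as (if_then_else (cc) (neg (reg)) (reg)) the NEG is stripped above since
   the whole expression is one CSNEG.  */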
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
    case ASHIFT:
      if (CONST_INT_P (XEXP (inner, 1))
	  && (inner_mode == QImode || inner_mode == HImode))
	op = XEXP (inner, 0);
      break;
    case LSHIFTRT:
      if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	  && (inner_mode == QImode || inner_mode == HImode))
	op = XEXP (inner, 0);
      break;
    case ASHIFTRT:
      if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	  && (inner_mode == QImode || inner_mode == HImode))
	op = XEXP (inner, 0);
      break;
    default:
      break;
    }

  return op;
}
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */
static bool
aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
{
  rtx op0, op1, op2;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int code = GET_CODE (x);

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);

  switch (code)
    {
    case SET:
      /* The cost depends entirely on the operands to SET.  */
      *cost = 0;
      op0 = SET_DEST (x);
      op1 = SET_SRC (x);

      switch (GET_CODE (op0))
	{
	case MEM:
	  if (speed)
	    {
	      rtx address = XEXP (op0, 0);
	      if (VECTOR_MODE_P (mode))
		*cost += extra_cost->ldst.storev;
	      else if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->ldst.store;
	      else if (mode == SFmode)
		*cost += extra_cost->ldst.storef;
	      else if (mode == DFmode)
		*cost += extra_cost->ldst.stored;

	      *cost +=
		COSTS_N_INSNS (aarch64_address_cost (address, mode,
						     0, speed));
	    }

	  *cost += rtx_cost (op1, mode, SET, 1, speed);
	  return true;

	case SUBREG:
	  if (! REG_P (SUBREG_REG (op0)))
	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);

	  /* Fall through.  */
	case REG:
	  /* The cost is one per vector-register copied.  */
	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
	    {
	      int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
			      / GET_MODE_SIZE (V4SImode);
	      *cost = COSTS_N_INSNS (n_minus_1 + 1);
	    }
	  /* const0_rtx is in general free, but we will use an
	     instruction to set a register to 0.  */
	  else if (REG_P (op1) || op1 == const0_rtx)
	    {
	      /* The cost is 1 per register copied.  */
	      int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
			      / UNITS_PER_WORD;
	      *cost = COSTS_N_INSNS (n_minus_1 + 1);
	    }
	  else
	    /* Cost is just the cost of the RHS of the set.  */
	    *cost += rtx_cost (op1, mode, SET, 1, speed);
	  return true;

	case ZERO_EXTRACT:
	case SIGN_EXTRACT:
	  /* Bit-field insertion.  Strip any redundant widening of
	     the RHS to meet the width of the target.  */
	  if (GET_CODE (op1) == SUBREG)
	    op1 = SUBREG_REG (op1);
	  if ((GET_CODE (op1) == ZERO_EXTEND
	       || GET_CODE (op1) == SIGN_EXTEND)
	      && CONST_INT_P (XEXP (op0, 1))
	      && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
		  >= INTVAL (XEXP (op0, 1))))
	    op1 = XEXP (op1, 0);

	  if (CONST_INT_P (op1))
	    {
	      /* MOV immediate is assumed to always be cheap.  */
	      *cost = COSTS_N_INSNS (1);
	    }
	  else
	    {
	      /* BFM.  */
	      if (speed)
		*cost += extra_cost->alu.bfi;
	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
	    }

	  return true;

	default:
	  /* We can't make sense of this, assume default cost.  */
	  *cost = COSTS_N_INSNS (1);
	  return false;
	}
    case CONST_INT:
      /* If an instruction can incorporate a constant within the
	 instruction, the instruction's expression avoids calling
	 rtx_cost() on the constant.  If rtx_cost() is called on a
	 constant, then it is usually because the constant must be
	 moved into a register by one or more instructions.

	 The exception is constant 0, which can be expressed
	 as XZR/WZR and is therefore free.  The exception to this is
	 if we have (set (reg) (const0_rtx)) in which case we must cost
	 the move.  However, we can catch that when we cost the SET, so
	 we don't need to consider that here.  */
      if (x == const0_rtx)
	*cost = 0;
      else
	{
	  /* To an approximation, building any other constant is
	     proportionally expensive to the number of instructions
	     required to build that constant.  This is true whether we
	     are compiling for SPEED or otherwise.  */
	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
				 (NULL_RTX, x, false, mode));
	}
      return true;

    case CONST_DOUBLE:
      if (speed)
	{
	  /* mov[df,sf]_aarch64.  */
	  if (aarch64_float_const_representable_p (x))
	    /* FMOV (scalar immediate).  */
	    *cost += extra_cost->fp[mode == DFmode].fpconst;
	  else if (!aarch64_float_const_zero_rtx_p (x))
	    {
	      /* This will be a load from memory.  */
	      if (mode == DFmode)
		*cost += extra_cost->ldst.loadd;
	      else
		*cost += extra_cost->ldst.loadf;
	    }
	  else
	    {
	      /* Otherwise this is +0.0.  We get this using MOVI d0, #0
		 or MOV v0.s[0], wzr - neither of which are modeled by the
		 cost tables.  Just use the default cost.  */
	    }
	}
      return true;

    case MEM:
      if (speed)
	{
	  /* For loads we want the base cost of a load, plus an
	     approximation for the additional cost of the addressing
	     mode.  */
	  rtx address = XEXP (x, 0);
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->ldst.loadv;
	  else if (GET_MODE_CLASS (mode) == MODE_INT)
	    *cost += extra_cost->ldst.load;
	  else if (mode == SFmode)
	    *cost += extra_cost->ldst.loadf;
	  else if (mode == DFmode)
	    *cost += extra_cost->ldst.loadd;

	  *cost +=
	    COSTS_N_INSNS (aarch64_address_cost (address, mode,
						 0, speed));
	}
      return true;
    case NEG:
      op0 = XEXP (x, 0);

      if (VECTOR_MODE_P (mode))
	{
	  if (speed)
	    /* FNEG.  */
	    *cost += extra_cost->vect.alu;
	  return false;
	}

      if (GET_MODE_CLASS (mode) == MODE_INT)
	{
	  if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
	      || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
	    {
	      /* CSETM.  */
	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
	      return true;
	    }

	  /* Cost this as SUB wzr, X.  */
	  op0 = CONST0_RTX (mode);
	  op1 = XEXP (x, 0);
	  goto cost_minus;
	}

      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	{
	  /* Support (neg(fma...)) as a single instruction only if
	     sign of zeros is unimportant.  This matches the decision
	     making in aarch64.md.  */
	  if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
	    {
	      /* FNMADD.  */
	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
	      return true;
	    }
	  if (GET_CODE (op0) == MULT)
	    {
	      /* FNMUL.  */
	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
	      return true;
	    }
	  if (speed)
	    /* FNEG.  */
	    *cost += extra_cost->fp[mode == DFmode].neg;
	  return false;
	}

      return false;

    case CLRSB:
    case CLZ:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.clz;
	}
      return false;

    case COMPARE:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (op1 == const0_rtx
	  && GET_CODE (op0) == AND)
	{
	  x = op0;
	  mode = GET_MODE (op0);
	  goto cost_logic;
	}

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
	{
	  /* TODO: A write to the CC flags possibly costs extra, this
	     needs encoding in the cost tables.  */

	  mode = GET_MODE (op0);
	  /* ANDS.  */
	  if (GET_CODE (op0) == AND)
	    {
	      x = op0;
	      goto cost_logic;
	    }

	  if (GET_CODE (op0) == PLUS)
	    {
	      /* ADDS (and CMN alias).  */
	      x = op0;
	      goto cost_plus;
	    }

	  if (GET_CODE (op0) == MINUS)
	    {
	      /* SUBS.  */
	      x = op0;
	      goto cost_minus;
	    }

	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
	      && CONST_INT_P (XEXP (op0, 2)))
	    {
	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
		 Handle it here directly rather than going to cost_logic
		 since we know the immediate generated for the TST is valid
		 so we can avoid creating an intermediate rtx for it only
		 for costing purposes.  */
	      if (speed)
		*cost += extra_cost->alu.logical;

	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
				 ZERO_EXTRACT, 0, speed);
	      return true;
	    }

	  if (GET_CODE (op1) == NEG)
	    {
	      /* CMN.  */
	      if (speed)
		*cost += extra_cost->alu.arith;

	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
	      return true;
	    }

	  /* CMP.

	     Compare can freely swap the order of operands, and
	     canonicalization puts the more complex operation first.
	     But the integer MINUS logic expects the shift/extend
	     operation in op1.  */
	  if (! (REG_P (op0)
		 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
	    {
	      op0 = XEXP (x, 1);
	      op1 = XEXP (x, 0);
	    }
	  goto cost_minus;
	}

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
	{
	  /* FCMP.  */
	  if (speed)
	    *cost += extra_cost->fp[mode == DFmode].compare;

	  if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
	    {
	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
	      /* FCMP supports constant 0.0 for no extra cost.  */
	      return true;
	    }
	  return false;
	}

      if (VECTOR_MODE_P (mode))
	{
	  /* Vector compare.  */
	  if (speed)
	    *cost += extra_cost->vect.alu;

	  if (aarch64_float_const_zero_rtx_p (op1))
	    {
	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
		 cost.  */
	      return true;
	    }
	  return false;
	}
      return false;
    case MINUS:
      {
	op0 = XEXP (x, 0);
	op1 = XEXP (x, 1);

cost_minus:
	*cost += rtx_cost (op0, mode, MINUS, 0, speed);

	/* Detect valid immediates.  */
	if ((GET_MODE_CLASS (mode) == MODE_INT
	     || (GET_MODE_CLASS (mode) == MODE_CC
		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
	    && CONST_INT_P (op1)
	    && aarch64_uimm12_shift (INTVAL (op1)))
	  {
	    if (speed)
	      /* SUB(S) (immediate).  */
	      *cost += extra_cost->alu.arith;
	    return true;
	  }

	/* Look for SUB (extended register).  */
	if (aarch64_rtx_arith_op_extract_p (op1, mode))
	  {
	    if (speed)
	      *cost += extra_cost->alu.extend_arith;

	    op1 = aarch64_strip_extend (op1, true);
	    *cost += rtx_cost (op1, VOIDmode,
			       (enum rtx_code) GET_CODE (op1), 0, speed);
	    return true;
	  }

	rtx new_op1 = aarch64_strip_extend (op1, false);

	/* Cost this as an FMA-alike operation.  */
	if ((GET_CODE (new_op1) == MULT
	     || aarch64_shift_p (GET_CODE (new_op1)))
	    && code != COMPARE)
	  {
	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
					    (enum rtx_code) code,
					    speed);
	    return true;
	  }

	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);

	if (speed)
	  {
	    if (VECTOR_MODE_P (mode))
	      /* Vector SUB.  */
	      *cost += extra_cost->vect.alu;
	    else if (GET_MODE_CLASS (mode) == MODE_INT)
	      /* SUB(S).  */
	      *cost += extra_cost->alu.arith;
	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	      /* FSUB.  */
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	  }
	return true;
      }

    case PLUS:
      {
	rtx new_op0;

	op0 = XEXP (x, 0);
	op1 = XEXP (x, 1);

cost_plus:
	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
	  {
	    /* CSINC.  */
	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
	    return true;
	  }

	if (GET_MODE_CLASS (mode) == MODE_INT
	    && CONST_INT_P (op1)
	    && aarch64_uimm12_shift (INTVAL (op1)))
	  {
	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);

	    if (speed)
	      /* ADD (immediate).  */
	      *cost += extra_cost->alu.arith;
	    return true;
	  }

	*cost += rtx_cost (op1, mode, PLUS, 1, speed);

	/* Look for ADD (extended register).  */
	if (aarch64_rtx_arith_op_extract_p (op0, mode))
	  {
	    if (speed)
	      *cost += extra_cost->alu.extend_arith;

	    op0 = aarch64_strip_extend (op0, true);
	    *cost += rtx_cost (op0, VOIDmode,
			       (enum rtx_code) GET_CODE (op0), 0, speed);
	    return true;
	  }

	/* Strip any extend, leave shifts behind as we will
	   cost them through mult_cost.  */
	new_op0 = aarch64_strip_extend (op0, false);

	if (GET_CODE (new_op0) == MULT
	    || aarch64_shift_p (GET_CODE (new_op0)))
	  {
	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS, speed);
	    return true;
	  }

	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);

	if (speed)
	  {
	    if (VECTOR_MODE_P (mode))
	      /* Vector ADD.  */
	      *cost += extra_cost->vect.alu;
	    else if (GET_MODE_CLASS (mode) == MODE_INT)
	      /* ADD.  */
	      *cost += extra_cost->alu.arith;
	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	      /* FADD.  */
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	  }
	return true;
      }
    case BSWAP:
      *cost = COSTS_N_INSNS (1);

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.rev;
	}
      return false;

    case IOR:
      if (aarch_rev16_p (x))
	{
	  *cost = COSTS_N_INSNS (1);

	  if (speed)
	    {
	      if (VECTOR_MODE_P (mode))
		*cost += extra_cost->vect.alu;
	      else
		*cost += extra_cost->alu.rev;
	    }
	  return true;
	}

      if (aarch64_extr_rtx_p (x, &op0, &op1))
	{
	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
	  if (speed)
	    *cost += extra_cost->alu.shift;
	  return true;
	}
      /* Fall through.  */
    case XOR:
    case AND:
    cost_logic:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (VECTOR_MODE_P (mode))
	{
	  if (speed)
	    *cost += extra_cost->vect.alu;
	  return true;
	}

      if (code == AND
	  && GET_CODE (op0) == MULT
	  && CONST_INT_P (XEXP (op0, 1))
	  && CONST_INT_P (op1)
	  && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
			       INTVAL (op1)) != 0)
	{
	  /* This is a UBFM/SBFM.  */
	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
	  if (speed)
	    *cost += extra_cost->alu.bfx;
	  return true;
	}

      if (GET_MODE_CLASS (mode) == MODE_INT)
	{
	  if (CONST_INT_P (op1))
	    {
	      /* We have a mask + shift version of a UBFIZ
		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
	      if (GET_CODE (op0) == ASHIFT
		  && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
							 XEXP (op0, 1)))
		{
		  *cost += rtx_cost (XEXP (op0, 0), mode,
				     (enum rtx_code) code, 0, speed);
		  if (speed)
		    *cost += extra_cost->alu.bfx;
		  return true;
		}
	      else if (aarch64_bitmask_imm (INTVAL (op1), mode))
		{
		  /* We possibly get the immediate for free, this is not
		     modelled.  */
		  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
		  if (speed)
		    *cost += extra_cost->alu.logical;
		  return true;
		}
	    }
	  else
	    {
	      /* Handle ORN, EON, or BIC.  */
	      if (GET_CODE (op0) == NOT)
		op0 = XEXP (op0, 0);

	      rtx new_op0 = aarch64_strip_shift (op0);

	      /* If we had a shift on op0 then this is a logical-shift-
		 by-register/immediate operation.  Otherwise, this is just
		 a logical operation.  */
	      if (speed)
		{
		  if (new_op0 != op0)
		    {
		      /* Shift by immediate.  */
		      if (CONST_INT_P (XEXP (op0, 1)))
			*cost += extra_cost->alu.log_shift;
		      else
			*cost += extra_cost->alu.log_shift_reg;
		    }
		  else
		    *cost += extra_cost->alu.logical;
		}

	      /* In both cases we want to cost both operands.  */
	      *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
	      *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);

	      return true;
	    }
	}
      return false;
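
      /* Worked example (a sketch, not from the original sources): for
	 (and (mult (reg w) (const_int 8)) (const_int 0x7f8)) we have
	 exact_log2 (8) == 3 and 0x7f8 == 0xff << 3, so aarch64_uxt_size
	 accepts it and the whole expression is costed as one UBFM-style
	 bitfield insert of width 8 at position 3 rather than as a multiply
	 followed by an AND.  */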
    case NOT:
      x = XEXP (x, 0);
      op0 = aarch64_strip_shift (x);

      if (VECTOR_MODE_P (mode))
	{
	  /* Vector NOT.  */
	  *cost += extra_cost->vect.alu;
	  return false;
	}

      /* MVN-shifted-reg.  */
      if (op0 != x)
	{
	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

	  if (speed)
	    *cost += extra_cost->alu.log_shift;

	  return true;
	}
      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
	 Handle the second form here taking care that 'a' in the above can
	 be a shift.  */
      else if (GET_CODE (op0) == XOR)
	{
	  rtx newop0 = XEXP (op0, 0);
	  rtx newop1 = XEXP (op0, 1);
	  rtx op0_stripped = aarch64_strip_shift (newop0);

	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);

	  if (speed)
	    {
	      if (op0_stripped != newop0)
		*cost += extra_cost->alu.log_shift;
	      else
		*cost += extra_cost->alu.logical;
	    }

	  return true;
	}
      /* MVN.  */
      if (speed)
	*cost += extra_cost->alu.logical;

      return false;

    case ZERO_EXTEND:
      op0 = XEXP (x, 0);
      /* If a value is written in SI mode, then zero extended to DI
	 mode, the operation will in general be free as a write to
	 a 'w' register implicitly zeroes the upper bits of an 'x'
	 register.  However, if this is

	   (set (reg) (zero_extend (reg)))

	 we must cost the explicit register move.  */
      if (mode == DImode
	  && GET_MODE (op0) == SImode
	  && outer == SET)
	{
	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);

	  /* If OP_COST is non-zero, then the cost of the zero extend
	     is effectively the cost of the inner operation.  Otherwise
	     we have a MOV instruction and we take the cost from the MOV
	     itself.  This is true independently of whether we are
	     optimizing for space or time.  */
	  if (op_cost)
	    *cost = op_cost;

	  return true;
	}
      else if (MEM_P (op0))
	{
	  /* All loads can zero extend to any size for free.  */
	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
	  return true;
	}

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
	{
	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
	  if (speed)
	    *cost += extra_cost->alu.bfx;
	  return true;
	}

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    /* UMOV.  */
	    *cost += extra_cost->vect.alu;
	  else
	    /* We generate an AND instead of UXTB/UXTH.  */
	    *cost += extra_cost->alu.logical;
	}
      return false;

    case SIGN_EXTEND:
      if (MEM_P (XEXP (x, 0)))
	{
	  /* LDRSH.  */
	  if (speed)
	    {
	      rtx address = XEXP (XEXP (x, 0), 0);
	      *cost += extra_cost->ldst.load_sign_extend;

	      *cost +=
		COSTS_N_INSNS (aarch64_address_cost (address, mode,
						     0, speed));
	    }
	  return true;
	}

      op0 = aarch64_extend_bitfield_pattern_p (x);
      if (op0)
	{
	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
	  if (speed)
	    *cost += extra_cost->alu.bfx;
	  return true;
	}

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.extend;
	}
      return false;
    case ASHIFT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
	{
	  if (speed)
	    {
	      if (VECTOR_MODE_P (mode))
		/* Vector shift (immediate).  */
		*cost += extra_cost->vect.alu;
	      else
		/* LSL (immediate), UBMF, UBFIZ and friends.  These are all
		   aliases.  */
		*cost += extra_cost->alu.shift;
	    }

	  /* We can incorporate zero/sign extend for free.  */
	  if (GET_CODE (op0) == ZERO_EXTEND
	      || GET_CODE (op0) == SIGN_EXTEND)
	    op0 = XEXP (op0, 0);

	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
	  return true;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    {
	      if (speed)
		/* Vector shift (register).  */
		*cost += extra_cost->vect.alu;
	    }
	  else
	    {
	      if (speed)
		/* LSLV.  */
		*cost += extra_cost->alu.shift_reg;

	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
		  && CONST_INT_P (XEXP (op1, 1))
		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
		{
		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
		  /* We already demanded XEXP (op1, 0) to be REG_P, so
		     don't recurse into it.  */
		  return true;
		}
	    }
	  return false;  /* All arguments need to be in registers.  */
	}

    case ROTATE:
    case ROTATERT:
    case LSHIFTRT:
    case ASHIFTRT:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (CONST_INT_P (op1))
	{
	  /* ASR (immediate) and friends.  */
	  if (speed)
	    {
	      if (VECTOR_MODE_P (mode))
		*cost += extra_cost->vect.alu;
	      else
		*cost += extra_cost->alu.shift;
	    }

	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
	  return true;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    {
	      if (speed)
		/* Vector shift (register).  */
		*cost += extra_cost->vect.alu;
	    }
	  else
	    {
	      if (speed)
		/* ASR (register) and friends.  */
		*cost += extra_cost->alu.shift_reg;

	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
		  && CONST_INT_P (XEXP (op1, 1))
		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
		{
		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
		  /* We already demanded XEXP (op1, 0) to be REG_P, so
		     don't recurse into it.  */
		  return true;
		}
	    }
	  return false;  /* All arguments need to be in registers.  */
	}
    case SYMBOL_REF:
      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
	{
	  /* LDR.  */
	  if (speed)
	    *cost += extra_cost->ldst.load;
	}
      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
	{
	  /* ADRP, followed by ADD.  */
	  *cost += COSTS_N_INSNS (1);
	  if (speed)
	    *cost += 2 * extra_cost->alu.arith;
	}
      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
	{
	  /* ADR.  */
	  if (speed)
	    *cost += extra_cost->alu.arith;
	}

      if (flag_pic)
	{
	  /* One extra load instruction, after accessing the GOT.  */
	  *cost += COSTS_N_INSNS (1);
	  if (speed)
	    *cost += extra_cost->ldst.load;
	}
      return true;

    case HIGH:
    case LO_SUM:
      /* ADRP/ADD (immediate).  */
      if (speed)
	*cost += extra_cost->alu.arith;
      return true;

    case ZERO_EXTRACT:
    case SIGN_EXTRACT:
      /* UBFX/SBFX.  */
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->alu.bfx;
	}

      /* We can trust that the immediates used will be correct (there
	 are no by-register forms), so we need only cost op0.  */
      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
      return true;

    case MULT:
      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its
	 operands.  */
      return true;
    case MOD:
      /* We can expand signed mod by power of 2 using a NEGS, two parallel
	 ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
	 an unconditional negate.  This case should only ever be reached through
	 the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
	  && (mode == SImode || mode == DImode))
	{
	  /* We expand to 4 instructions.  Reset the baseline.  */
	  *cost = COSTS_N_INSNS (4);

	  if (speed)
	    *cost += 2 * extra_cost->alu.logical
		     + 2 * extra_cost->alu.arith;

	  return true;
	}

      /* Fall-through.  */
    case UMOD:
      if (speed)
	{
	  /* Slightly prefer UMOD over SMOD.  */
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else if (GET_MODE_CLASS (mode) == MODE_INT)
	    *cost += (extra_cost->mult[mode == DImode].add
		      + extra_cost->mult[mode == DImode].idiv
		      + (code == MOD ? 1 : 0));
	}
      return false;  /* All arguments need to be in registers.  */

    case DIV:
    case UDIV:
    case SQRT:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else if (GET_MODE_CLASS (mode) == MODE_INT)
	    /* There is no integer SQRT, so only DIV and UDIV can get
	       here.  */
	    *cost += (extra_cost->mult[mode == DImode].idiv
		      /* Slightly prefer UDIV over SDIV.  */
		      + (code == DIV ? 1 : 0));
	  else
	    *cost += extra_cost->fp[mode == DFmode].div;
	}
      return false;  /* All arguments need to be in registers.  */

    case IF_THEN_ELSE:
      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
					 XEXP (x, 2), cost, speed);
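
      /* As an illustrative sketch (not from the original sources), x % 4 in
	 SImode expands along the lines of:

	   negs  w1, w0            // w1 = -x, setting the flags on x
	   and   w0, w0, #3        // remainder for a non-negative x
	   and   w1, w1, #3        // remainder magnitude for a negative x
	   csneg w0, w0, w1, mi    // select and negate for the negative case

	 which is why the MOD baseline above is reset to four instructions.  */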
      return false; /* All arguments must be in registers.  */

    case FMA:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);
      op2 = XEXP (x, 2);

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[mode == DFmode].fma;
	}

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
	op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
	op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
	 and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
	op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
	 either be (vec_duplicate (vec_select (x))) or just
	 (vec_select (x)), depending on whether we are multiplying by
	 a vector or a scalar.

	 Canonicalization is not very good in these cases, FMA4 will put the
	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
	op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
	op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
	 get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);
      return true;
    case FLOAT:
    case UNSIGNED_FLOAT:
      if (speed)
	*cost += extra_cost->fp[mode == DFmode].fromint;
      return false;

    case FLOAT_EXTEND:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    /* Vector truncate.  */
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[mode == DFmode].widen;
	}
      return false;

    case FLOAT_TRUNCATE:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    /* Vector conversion.  */
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[mode == DFmode].narrow;
	}
      return false;

    case FIX:
    case UNSIGNED_FIX:
      x = XEXP (x, 0);
      /* Strip the rounding part.  They will all be implemented
	 by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)
	{
	  unsigned int uns_code = XINT (x, 1);

	  if (uns_code == UNSPEC_FRINTA
	      || uns_code == UNSPEC_FRINTM
	      || uns_code == UNSPEC_FRINTN
	      || uns_code == UNSPEC_FRINTP
	      || uns_code == UNSPEC_FRINTZ)
	    x = XVECEXP (x, 0, 0);
	}

      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
	}

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
	 fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
	  && ((VECTOR_MODE_P (mode)
	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
	{
	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
			     0, speed);
	  return true;
	}

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
      return true;
    case ABS:
      if (VECTOR_MODE_P (mode))
	{
	  /* ABS (vector).  */
	  if (speed)
	    *cost += extra_cost->vect.alu;
	}
      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	{
	  op0 = XEXP (x, 0);

	  /* FABD, which is analogous to FADD.  */
	  if (GET_CODE (op0) == MINUS)
	    {
	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
	      if (speed)
		*cost += extra_cost->fp[mode == DFmode].addsub;

	      return true;
	    }
	  /* Simple FABS is analogous to FNEG.  */
	  if (speed)
	    *cost += extra_cost->fp[mode == DFmode].neg;
	}
      else
	{
	  /* Integer ABS will either be split to
	     two arithmetic instructions, or will be an ABS
	     (scalar), which we don't model.  */
	  *cost = COSTS_N_INSNS (2);
	  if (speed)
	    *cost += 2 * extra_cost->alu.arith;
	}
      return false;

    case SMAX:
    case SMIN:
      if (speed)
	{
	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	  else
	    {
	      /* FMAXNM/FMINNM/FMAX/FMIN.
		 TODO: This may not be accurate for all implementations, but
		 we do not model this in the cost tables.  */
	      *cost += extra_cost->fp[mode == DFmode].addsub;
	    }
	}
      return false;

    case UNSPEC:
      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))
	{
	  if (speed)
	    *cost += extra_cost->fp[mode == DFmode].roundint;

	  return false;
	}

      if (XINT (x, 1) == UNSPEC_RBIT)
	{
	  if (speed)
	    *cost += extra_cost->alu.rev;

	  return false;
	}
      break;

    case TRUNCATE:
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI  */
	  mode == DImode
	  /*   (lshiftrt:TI  */
	  && GET_MODE (XEXP (x, 0)) == TImode
	  && GET_CODE (XEXP (x, 0)) == LSHIFTRT
	  /*      (mult:TI  */
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
	  /*        (ANY_EXTEND:TI (reg:DI))
		    (ANY_EXTEND:TI (reg:DI)))  */
	  && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
	       && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
	      || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
		  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
	  /*     (const_int 64)  */
	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	  && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
	{
	  /* UMULH/SMULH.  */
	  if (speed)
	    *cost += extra_cost->mult[mode == DImode].extend;
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
			     mode, MULT, 0, speed);
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
			     mode, MULT, 1, speed);
	  return true;
	}

      /* Fall through.  */
    default:
      break;
    }

  if (dump_file
      && flag_aarch64_verbose_cost)
    fprintf (dump_file,
	     "\nFailed to cost RTX.  Assuming default cost.\n");

  return true;
}
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
static int
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (GET_MODE_SIZE (mode) == 16)
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
      if (! TARGET_SIMD)
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
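
/* Worked example (a sketch, not from the original sources): moving a 128-bit
   value from the FP/SIMD registers to GENERAL_REGS costs FP2GP * 2 because
   it needs two 64-bit transfers, and without AdvSIMD a Q-register copy is
   costed as GP2FP + FP2GP + FP2FP to reflect the general-register scratch
   used by secondary reload.  */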
static int
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}

/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}
typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine
   mode.  */

static rsqrte_type
get_rsqrte_type (machine_mode mode)
{
  switch (mode)
  {
    case DFmode:   return gen_aarch64_rsqrtedf;
    case SFmode:   return gen_aarch64_rsqrtesf;
    case V2DFmode: return gen_aarch64_rsqrtev2df;
    case V2SFmode: return gen_aarch64_rsqrtev2sf;
    case V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();
  }
}

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */

static rsqrts_type
get_rsqrts_type (machine_mode mode)
{
  switch (mode)
  {
    case DFmode:   return gen_aarch64_rsqrtsdf;
    case SFmode:   return gen_aarch64_rsqrtssf;
    case V2DFmode: return gen_aarch64_rsqrtsv2df;
    case V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();
  }
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  machine_mode mmsk
    = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
		       GET_MODE_NUNITS (mode));
  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx xmsk = gen_reg_rtx (mmsk);
  if (!recp)
    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
			    gen_rtx_NEG (mmsk,
					 gen_rtx_EQ (mmsk, src,
						     CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

      if (iterations > 0)
	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
    }

  if (!recp)
    {
      /* Qualify the approximate reciprocal square root when the argument is
	 0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
    }

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));

  return true;
}
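
/* The series above is the Newton-Raphson iteration for 1/sqrt(a):
   x_{n+1} = x_n * (3 - a * x_n^2) / 2, where FRSQRTE provides the initial
   estimate x_0 and each FRSQRTS step computes (3 - a * x_n^2) / 2 from the
   argument and the squared estimate.  For the non-reciprocal case the
   converged estimate is multiplied by the argument at the end, since
   sqrt(a) = a * (1/sqrt(a)).  */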
typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */

static recpe_type
get_recpe_type (machine_mode mode)
{
  switch (mode)
  {
    case SFmode:   return (gen_aarch64_frecpesf);
    case V2SFmode: return (gen_aarch64_frecpev2sf);
    case V4SFmode: return (gen_aarch64_frecpev4sf);
    case DFmode:   return (gen_aarch64_frecpedf);
    case V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();
  }
}

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */

static recps_type
get_recps_type (machine_mode mode)
{
  switch (mode)
  {
    case SFmode:   return (gen_aarch64_frecpssf);
    case V2SFmode: return (gen_aarch64_frecpsv2sf);
    case V4SFmode: return (gen_aarch64_frecpsv4sf);
    case DFmode:   return (gen_aarch64_frecpsdf);
    case V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();
  }
}
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      if (iterations > 0)
	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
    }

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
  return true;
}
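
/* Likewise, the reciprocal series is the Newton-Raphson iteration
   x_{n+1} = x_n * (2 - d * x_n): FRECPE supplies the initial estimate of
   1/d and each FRECPS step computes (2 - d * x_n), so once the estimate has
   converged the quotient is obtained as num * (1/den).  */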
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  switch (type_of_cost)
    {
      case scalar_stmt:
	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

      case scalar_load:
	return costs->scalar_load_cost;

      case scalar_store:
	return costs->scalar_store_cost;

      case vector_stmt:
	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vector_load:
	return costs->vec_align_load_cost;

      case vector_store:
	return costs->vec_store_cost;

      case vec_to_scalar:
	return costs->vec_to_scalar_cost;

      case scalar_to_vec:
	return costs->scalar_to_vec_cost;

      case unaligned_load:
	return costs->vec_unalign_load_cost;

      case unaligned_store:
	return costs->vec_unalign_store_cost;

      case cond_branch_taken:
	return costs->cond_taken_branch_cost;

      case cond_branch_not_taken:
	return costs->cond_not_taken_branch_cost;

      case vec_perm:
	return costs->vec_permute_cost;

      case vec_promote_demote:
	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      case vec_construct:
	elements = TYPE_VECTOR_SUBPARTS (vectype);
	return elements / 2 + 1;

      default:
	gcc_unreachable ();
    }
}
/* Implement targetm.vectorize.add_stmt_cost.  */
static unsigned
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
		       struct _stmt_vec_info *stmt_info, int misalign,
		       enum vect_cost_model_location where)
{
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
    {
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
      int stmt_cost =
	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
	 vectorized are weighted more heavily.  The value here is
	 arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
	count *= 50; /*  FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
    }

  return retval;
}
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse the TO_PARSE string and put the architecture struct that it
   selects into RES and the architectural features into ISA_FLAGS.
   Return an aarch64_parse_opt_result describing the parse result.
   If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
		    unsigned long *isa_flags)
{
  char *ext;
  const struct processor *arch;
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)
    {
      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
	{
	  unsigned long isa_temp = arch->flags;

	  if (ext != NULL)
	    {
	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)
		return ext_res;
	    }
	  /* Extension parsing was successful.  Confirm the result
	     arch and ISA flags.  */
	  *res = arch;
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;
	}
    }

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse the TO_PARSE string and put the result tuning in RES and the
   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
		   unsigned long *isa_flags)
{
  char *ext;
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
	{
	  unsigned long isa_temp = cpu->flags;

	  if (ext != NULL)
	    {
	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)
		return ext_res;
	    }
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
	  *res = cpu;
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, str) == 0)
	{
	  *res = cpu;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse TOKEN, which has length LENGTH to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
				size_t length,
				const struct aarch64_flag_desc *flag,
				const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
	  && !strncmp (flag->name, token, length))
	return flag->flag;
    }

  error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
  return 0;
}
/* Parse OPTION which is a '.'-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
			       const struct aarch64_flag_desc *flags,
			       unsigned int initial_state,
			       const char *option_name)
{
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
    {
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
							   token_length,
							   flags,
							   option_name);
      /* If we find "none" (or, for simplicity's sake, an error) anywhere
	 in the token stream, reset the supported operations.  So:

	  adrp+add.cmp+branch.none.adrp+add

	 would have the result of turning on only adrp+add fusion.  */
      if (!token_ops)
	found_flags = 0;

      found_flags |= token_ops;
      specs = ++ntoken;
    }

  /* We ended with a separator, print something.  */
  if (!(*specs))
    {
      error ("%s string ill-formed\n", option_name);
      return 0;
    }

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
						       token_length,
						       flags,
						       option_name);
  if (!token_ops)
    found_flags = 0;

  found_flags |= token_ops;
  return found_flags;
}
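
/* For example, given OPTION_NAME "fuse=" and the string
   "adrp+add.cmp+branch", the loop above consumes "adrp+add" up to the '.'
   and then the trailing "cmp+branch" token, OR-ing both fusion bits into
   FOUND_FLAGS on top of the initial state inherited from the CPU tuning
   structure.  */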
/* Support for overriding instruction fusion.  */

static void
aarch64_parse_fuse_string (const char *fuse_string,
			   struct tune_params *tune)
{
  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
						     aarch64_fusible_pairs,
						     tune->fusible_ops,
						     "fuse=");
}

/* Support for overriding other tuning flags.  */

static void
aarch64_parse_tune_string (const char *tune_string,
			   struct tune_params *tune)
{
  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
				     aarch64_tuning_flags,
				     tune->extra_tuning_flags,
				     "tune=");
}
/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
   we understand.  If it is, extract the option string and handoff to
   the appropriate function.  */

void
aarch64_parse_one_override_token (const char* token,
				  size_t length,
				  struct tune_params *tune)
{
  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');
  if (!option_part)
    {
      error ("tuning string missing in option (%s)", token);
      return;
    }

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */
  option_part++;

  for (; fn->name != NULL; fn++)
    {
      if (!strncmp (fn->name, token, length))
	{
	  fn->parse_override (option_part, tune);
	  return;
	}
    }

  error ("unknown tuning option (%s)",token);
  return;
}
/* A checking mechanism for the implementation of the tls size.  */

static void
initialize_aarch64_tls_size (struct gcc_options *opts)
{
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny is 1M which
	 needs two instructions to address, so we clamp the size to 24.  */
      if (aarch64_tls_size > 24)
	aarch64_tls_size = 24;
      break;
    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
	aarch64_tls_size = 32;
      break;
    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
	aarch64_tls_size = 48;
      break;
    default:
      gcc_unreachable ();
    }

  return;
}
/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
     substring	:: defined by option.  */

static void
aarch64_parse_override_string (const char* input_string,
			       struct tune_params* tune)
{
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
    {
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      *ntoken = '\0';
      aarch64_parse_one_override_token (string, token_length, tune);
      string = ++ntoken;
    }

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
}
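
/* As a sketch (not from the original sources), a command line option such as
   -moverride=fuse=adrp+add:tune=<flags> is split at each ':' into the
   tokens "fuse=adrp+add" and "tune=<flags>", and each token is then
   dispatched on the text before its '=' to the matching parse_override
   handler registered in aarch64_tuning_override_functions.  */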
static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  /* The logic here is that if we are disabling all frame pointer generation
     then we do not need to disable leaf frame pointer generation as a
     separate operation.  But if we are *only* disabling leaf frame pointer
     generation then we set flag_omit_frame_pointer to true, but in
     aarch64_frame_pointer_required we return false only for leaf functions.

     PR 70044: We have to be careful about being called multiple times for the
     same function.  Once we have decided to set flag_omit_frame_pointer just
     so that we can omit leaf frame pointers, we must then not interpret a
     second call as meaning that all frame pointer generation should be
     omitted.  We do this by setting flag_omit_frame_pointer to a special,
     reserved value.  */
  if (opts->x_flag_omit_frame_pointer == 2)
    opts->x_flag_omit_frame_pointer = 0;

  if (opts->x_flag_omit_frame_pointer)
    opts->x_flag_omit_leaf_frame_pointer = false;
  else if (opts->x_flag_omit_leaf_frame_pointer)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_align_loops <= 0)
	opts->x_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_align_jumps <= 0)
	opts->x_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_align_functions <= 0)
	opts->x_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* This is PR70113.  When building the Linux kernel with
     CONFIG_ARM64_ERRATUM_843419, support for relocations
     R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
     removed from the kernel to avoid loading objects with possibly
     offending sequences.  Without -mpc-relative-literal-loads we would
     generate such relocations, preventing the kernel build from
     succeeding.  */
  if (opts->x_pcrelative_literal_loads == 2
      && TARGET_FIX_ERR_A53_843419)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' up the internal tuning structs and update the options
    in OPTS.  The caller must have set up selected_tune and selected_arch
    as all the other target-specific codegen decisions are
    derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
      case tune_params::AUTOPREFETCHER_OFF:
	queue_depth = -1;
	break;
      case tune_params::AUTOPREFETCHER_WEAK:
	queue_depth = 0;
	break;
      case tune_params::AUTOPREFETCHER_STRONG:
	queue_depth = max_insn_queue_index + 1;
	break;
      default:
	gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
			 queue_depth,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
			   aarch64_tune_params.prefetch->num_slots,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_line_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l2_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  aarch64_override_options_after_change_1 (opts);
}
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

static void
aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
{
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);
  char *s;
  const char *hint = candidates_list_and_hint (str, s, candidates);
  inform (input_location, "valid arguments are: %s;"
	  " did you mean %qs?", s, hint);
}

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mcpu=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mcpu", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-mcpu=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}

/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in %<-march=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -march", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-march=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mtune=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mtune", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }
  return false;
}
9078 /* Return the CPU corresponding to the enum CPU.
9079 If it doesn't specify a cpu, return the default. */
9081 static const struct processor
*
9082 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
9084 if (cpu
!= aarch64_none
)
9085 return &all_cores
[cpu
];
9087 /* The & 0x3f is to extract the bottom 6 bits that encode the
9088 default cpu as selected by the --with-cpu GCC configure option
9090 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9091 flags mechanism should be reworked to make it more sane. */
9092 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9095 /* Return the architecture corresponding to the enum ARCH.
9096 If it doesn't specify a valid architecture, return the default. */
9098 static const struct processor
*
9099 aarch64_get_arch (enum aarch64_arch arch
)
9101 if (arch
!= aarch64_no_arch
)
9102 return &all_architectures
[arch
];
9104 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9106 return &all_architectures
[cpu
->arch
];
9109 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9110 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9111 tuning structs. In particular it must set selected_tune and
9112 aarch64_isa_flags that define the available ISA features and tuning
9113 decisions. It must also set selected_arch as this will be used to
9114 output the .arch asm tags for each function. */
9117 aarch64_override_options (void)
9119 unsigned long cpu_isa
= 0;
9120 unsigned long arch_isa
= 0;
9121 aarch64_isa_flags
= 0;
9123 bool valid_cpu
= true;
9124 bool valid_tune
= true;
9125 bool valid_arch
= true;
9127 selected_cpu
= NULL
;
9128 selected_arch
= NULL
;
9129 selected_tune
= NULL
;
9131 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9132 If either of -march or -mtune is given, they override their
9133 respective component of -mcpu. */
9134 if (aarch64_cpu_string
)
9135 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
9138 if (aarch64_arch_string
)
9139 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
9142 if (aarch64_tune_string
)
9143 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
9145 /* If the user did not specify a processor, choose the default
9146 one for them. This will be the CPU set during configuration using
9147 --with-cpu, otherwise it is "generic". */
9152 selected_cpu
= &all_cores
[selected_arch
->ident
];
9153 aarch64_isa_flags
= arch_isa
;
9154 explicit_arch
= selected_arch
->arch
;
9158 /* Get default configure-time CPU. */
9159 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
9160 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
9164 explicit_tune_core
= selected_tune
->ident
;
9166 /* If both -mcpu and -march are specified check that they are architecturally
9167 compatible, warn if they're not and prefer the -march ISA flags. */
9168 else if (selected_arch
)
9170 if (selected_arch
->arch
!= selected_cpu
->arch
)
9172 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9173 all_architectures
[selected_cpu
->arch
].name
,
9174 selected_arch
->name
);
9176 aarch64_isa_flags
= arch_isa
;
9177 explicit_arch
= selected_arch
->arch
;
9178 explicit_tune_core
= selected_tune
? selected_tune
->ident
9179 : selected_cpu
->ident
;
9183 /* -mcpu but no -march. */
9184 aarch64_isa_flags
= cpu_isa
;
9185 explicit_tune_core
= selected_tune
? selected_tune
->ident
9186 : selected_cpu
->ident
;
9187 gcc_assert (selected_cpu
);
9188 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9189 explicit_arch
= selected_arch
->arch
;
9192 /* Set the arch as well as we will need it when outputing
9193 the .arch directive in assembly. */
9196 gcc_assert (selected_cpu
);
9197 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9201 selected_tune
= selected_cpu
;
9203 #ifndef HAVE_AS_MABI_OPTION
9204 /* The compiler may have been configured with 2.23.* binutils, which does
9205 not have support for ILP32. */
9207 error ("Assembler does not support -mabi=ilp32");
9210 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
9211 sorry ("Return address signing is only supported for -mabi=lp64");
9213 /* Make sure we properly set up the explicit options. */
9214 if ((aarch64_cpu_string
&& valid_cpu
)
9215 || (aarch64_tune_string
&& valid_tune
))
9216 gcc_assert (explicit_tune_core
!= aarch64_none
);
9218 if ((aarch64_cpu_string
&& valid_cpu
)
9219 || (aarch64_arch_string
&& valid_arch
))
9220 gcc_assert (explicit_arch
!= aarch64_no_arch
);
9222 aarch64_override_options_internal (&global_options
);
9224 /* Save these options as the default ones in case we push and pop them later
9225 while processing functions with potential target attributes. */
9226 target_option_default_node
= target_option_current_node
9227 = build_target_option_node (&global_options
);
9230 /* Implement targetm.override_options_after_change. */
9233 aarch64_override_options_after_change (void)
9235 aarch64_override_options_after_change_1 (&global_options
);
9238 static struct machine_function
*
9239 aarch64_init_machine_status (void)
9241 struct machine_function
*machine
;
9242 machine
= ggc_cleared_alloc
<machine_function
> ();
9247 aarch64_init_expanders (void)
9249 init_machine_status
= aarch64_init_machine_status
;
9252 /* A checking mechanism for the implementation of the various code models. */
9254 initialize_aarch64_code_model (struct gcc_options
*opts
)
9256 if (opts
->x_flag_pic
)
9258 switch (opts
->x_aarch64_cmodel_var
)
9260 case AARCH64_CMODEL_TINY
:
9261 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
9263 case AARCH64_CMODEL_SMALL
:
9264 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9265 aarch64_cmodel
= (flag_pic
== 2
9266 ? AARCH64_CMODEL_SMALL_PIC
9267 : AARCH64_CMODEL_SMALL_SPIC
);
9269 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
9272 case AARCH64_CMODEL_LARGE
:
9273 sorry ("code model %qs with -f%s", "large",
9274 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
9281 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
9284 /* Implement TARGET_OPTION_SAVE. */
9287 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
9289 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
9292 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9293 using the information saved in PTR. */
9296 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
9298 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
9299 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9300 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
9301 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9302 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
9304 aarch64_override_options_internal (opts
);
9307 /* Implement TARGET_OPTION_PRINT. */
9310 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
9312 const struct processor
*cpu
9313 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9314 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
9315 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9316 std::string extension
9317 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
9319 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
9320 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
9321 arch
->name
, extension
.c_str ());
9324 static GTY(()) tree aarch64_previous_fndecl
;
9327 aarch64_reset_previous_fndecl (void)
9329 aarch64_previous_fndecl
= NULL
;
9332 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9333 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9334 make sure optab availability predicates are recomputed when necessary. */
9337 aarch64_save_restore_target_globals (tree new_tree
)
9339 if (TREE_TARGET_GLOBALS (new_tree
))
9340 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
9341 else if (new_tree
== target_option_default_node
)
9342 restore_target_globals (&default_target_globals
);
9344 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
9347 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9348 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9349 of the function, if such exists. This function may be called multiple
9350 times on a single function so use aarch64_previous_fndecl to avoid
9351 setting up identical state. */
9354 aarch64_set_current_function (tree fndecl
)
9356 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
9359 tree old_tree
= (aarch64_previous_fndecl
9360 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
9363 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9365 /* If current function has no attributes but the previous one did,
9366 use the default node. */
9367 if (!new_tree
&& old_tree
)
9368 new_tree
= target_option_default_node
;
9370 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9371 the default have been handled by aarch64_save_restore_target_globals from
9372 aarch64_pragma_target_parse. */
9373 if (old_tree
== new_tree
)
9376 aarch64_previous_fndecl
= fndecl
;
9378 /* First set the target options. */
9379 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
9381 aarch64_save_restore_target_globals (new_tree
);
9384 /* Enum describing the various ways we can handle attributes.
9385 In many cases we can reuse the generic option handling machinery. */
9387 enum aarch64_attr_opt_type
9389 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
9390 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
9391 aarch64_attr_enum
, /* Attribute sets an enum variable. */
9392 aarch64_attr_custom
/* Attribute requires a custom handling function. */
9395 /* All the information needed to handle a target attribute.
9396 NAME is the name of the attribute.
9397 ATTR_TYPE specifies the type of behavior of the attribute as described
9398 in the definition of enum aarch64_attr_opt_type.
9399 ALLOW_NEG is true if the attribute supports a "no-" form.
9400 HANDLER is the function that takes the attribute string and whether
9401 it is a pragma or attribute and handles the option. It is needed only
9402 when the ATTR_TYPE is aarch64_attr_custom.
9403 OPT_NUM is the enum specifying the option that the attribute modifies.
9404 This is needed for attributes that mirror the behavior of a command-line
9405 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9406 aarch64_attr_enum. */
9408 struct aarch64_attribute_info
9411 enum aarch64_attr_opt_type attr_type
;
9413 bool (*handler
) (const char *, const char *);
9414 enum opt_code opt_num
;
9417 /* Handle the ARCH_STR argument to the arch= target attribute.
9418 PRAGMA_OR_ATTR is used in potential error messages. */
9421 aarch64_handle_attr_arch (const char *str
, const char *pragma_or_attr
)
9423 const struct processor
*tmp_arch
= NULL
;
9424 enum aarch64_parse_opt_result parse_res
9425 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
9427 if (parse_res
== AARCH64_PARSE_OK
)
9429 gcc_assert (tmp_arch
);
9430 selected_arch
= tmp_arch
;
9431 explicit_arch
= selected_arch
->arch
;
9437 case AARCH64_PARSE_MISSING_ARG
:
9438 error ("missing architecture name in 'arch' target %s", pragma_or_attr
);
9440 case AARCH64_PARSE_INVALID_ARG
:
9441 error ("unknown value %qs for 'arch' target %s", str
, pragma_or_attr
);
9442 aarch64_print_hint_for_arch (str
);
9444 case AARCH64_PARSE_INVALID_FEATURE
:
9445 error ("invalid feature modifier %qs for 'arch' target %s",
9446 str
, pragma_or_attr
);
9455 /* Handle the argument CPU_STR to the cpu= target attribute.
9456 PRAGMA_OR_ATTR is used in potential error messages. */
9459 aarch64_handle_attr_cpu (const char *str
, const char *pragma_or_attr
)
9461 const struct processor
*tmp_cpu
= NULL
;
9462 enum aarch64_parse_opt_result parse_res
9463 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
9465 if (parse_res
== AARCH64_PARSE_OK
)
9467 gcc_assert (tmp_cpu
);
9468 selected_tune
= tmp_cpu
;
9469 explicit_tune_core
= selected_tune
->ident
;
9471 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
9472 explicit_arch
= selected_arch
->arch
;
9478 case AARCH64_PARSE_MISSING_ARG
:
9479 error ("missing cpu name in 'cpu' target %s", pragma_or_attr
);
9481 case AARCH64_PARSE_INVALID_ARG
:
9482 error ("unknown value %qs for 'cpu' target %s", str
, pragma_or_attr
);
9483 aarch64_print_hint_for_core (str
);
9485 case AARCH64_PARSE_INVALID_FEATURE
:
9486 error ("invalid feature modifier %qs for 'cpu' target %s",
9487 str
, pragma_or_attr
);
9496 /* Handle the argument STR to the tune= target attribute.
9497 PRAGMA_OR_ATTR is used in potential error messages. */
9500 aarch64_handle_attr_tune (const char *str
, const char *pragma_or_attr
)
9502 const struct processor
*tmp_tune
= NULL
;
9503 enum aarch64_parse_opt_result parse_res
9504 = aarch64_parse_tune (str
, &tmp_tune
);
9506 if (parse_res
== AARCH64_PARSE_OK
)
9508 gcc_assert (tmp_tune
);
9509 selected_tune
= tmp_tune
;
9510 explicit_tune_core
= selected_tune
->ident
;
9516 case AARCH64_PARSE_INVALID_ARG
:
9517 error ("unknown value %qs for 'tune' target %s", str
, pragma_or_attr
);
9518 aarch64_print_hint_for_core (str
);
9527 /* Parse an architecture extensions target attribute string specified in STR.
9528 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9529 if successful. Update aarch64_isa_flags to reflect the ISA features
9531 PRAGMA_OR_ATTR is used in potential error messages. */
9534 aarch64_handle_attr_isa_flags (char *str
, const char *pragma_or_attr
)
9536 enum aarch64_parse_opt_result parse_res
;
9537 unsigned long isa_flags
= aarch64_isa_flags
;
9539 /* We allow "+nothing" in the beginning to clear out all architectural
9540 features if the user wants to handpick specific features. */
9541 if (strncmp ("+nothing", str
, 8) == 0)
9547 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
9549 if (parse_res
== AARCH64_PARSE_OK
)
9551 aarch64_isa_flags
= isa_flags
;
9557 case AARCH64_PARSE_MISSING_ARG
:
9558 error ("missing feature modifier in target %s %qs",
9559 pragma_or_attr
, str
);
9562 case AARCH64_PARSE_INVALID_FEATURE
:
9563 error ("invalid feature modifier in target %s %qs",
9564 pragma_or_attr
, str
);
9574 /* The target attributes that we support. On top of these we also support just
9575 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9576 handled explicitly in aarch64_process_one_target_attr. */
9578 static const struct aarch64_attribute_info aarch64_attributes
[] =
9580 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
9581 OPT_mgeneral_regs_only
},
9582 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
9583 OPT_mfix_cortex_a53_835769
},
9584 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
9585 OPT_mfix_cortex_a53_843419
},
9586 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
9587 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
9588 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
9589 OPT_momit_leaf_frame_pointer
},
9590 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
9591 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
9593 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
9594 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
9596 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
9597 OPT_msign_return_address_
},
9598 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
9601 /* Parse ARG_STR which contains the definition of one target attribute.
9602 Show appropriate errors if any or return true if the attribute is valid.
9603 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9604 we're processing a target attribute or pragma. */
9607 aarch64_process_one_target_attr (char *arg_str
, const char* pragma_or_attr
)
9609 bool invert
= false;
9611 size_t len
= strlen (arg_str
);
9615 error ("malformed target %s", pragma_or_attr
);
9619 char *str_to_check
= (char *) alloca (len
+ 1);
9620 strcpy (str_to_check
, arg_str
);
9622 /* Skip leading whitespace. */
9623 while (*str_to_check
== ' ' || *str_to_check
== '\t')
9626 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9627 It is easier to detect and handle it explicitly here rather than going
9628 through the machinery for the rest of the target attributes in this
9630 if (*str_to_check
== '+')
9631 return aarch64_handle_attr_isa_flags (str_to_check
, pragma_or_attr
);
9633 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
9638 char *arg
= strchr (str_to_check
, '=');
9640 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9641 and point ARG to "foo". */
9647 const struct aarch64_attribute_info
*p_attr
;
9649 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
9651 /* If the names don't match up, or the user has given an argument
9652 to an attribute that doesn't accept one, or didn't give an argument
9653 to an attribute that expects one, fail to match. */
9654 if (strcmp (str_to_check
, p_attr
->name
) != 0)
9658 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
9659 || p_attr
->attr_type
== aarch64_attr_enum
;
9661 if (attr_need_arg_p
^ (arg
!= NULL
))
9663 error ("target %s %qs does not accept an argument",
9664 pragma_or_attr
, str_to_check
);
9668 /* If the name matches but the attribute does not allow "no-" versions
9669 then we can't match. */
9670 if (invert
&& !p_attr
->allow_neg
)
9672 error ("target %s %qs does not allow a negated form",
9673 pragma_or_attr
, str_to_check
);
9677 switch (p_attr
->attr_type
)
9679 /* Has a custom handler registered.
9680 For example, cpu=, arch=, tune=. */
9681 case aarch64_attr_custom
:
9682 gcc_assert (p_attr
->handler
);
9683 if (!p_attr
->handler (arg
, pragma_or_attr
))
9687 /* Either set or unset a boolean option. */
9688 case aarch64_attr_bool
:
9690 struct cl_decoded_option decoded
;
9692 generate_option (p_attr
->opt_num
, NULL
, !invert
,
9693 CL_TARGET
, &decoded
);
9694 aarch64_handle_option (&global_options
, &global_options_set
,
9695 &decoded
, input_location
);
9698 /* Set or unset a bit in the target_flags. aarch64_handle_option
9699 should know what mask to apply given the option number. */
9700 case aarch64_attr_mask
:
9702 struct cl_decoded_option decoded
;
9703 /* We only need to specify the option number.
9704 aarch64_handle_option will know which mask to apply. */
9705 decoded
.opt_index
= p_attr
->opt_num
;
9706 decoded
.value
= !invert
;
9707 aarch64_handle_option (&global_options
, &global_options_set
,
9708 &decoded
, input_location
);
9711 /* Use the option setting machinery to set an option to an enum. */
9712 case aarch64_attr_enum
:
9717 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
9721 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
9722 NULL
, DK_UNSPECIFIED
, input_location
,
9727 error ("target %s %s=%s is not valid",
9728 pragma_or_attr
, str_to_check
, arg
);
9737 /* If we reached here we either have found an attribute and validated
9738 it or didn't match any. If we matched an attribute but its arguments
9739 were malformed we will have returned false already. */
9743 /* Count how many times the character C appears in
9744 NULL-terminated string STR. */
9747 num_occurences_in_str (char c
, char *str
)
9749 unsigned int res
= 0;
9750 while (*str
!= '\0')
9761 /* Parse the tree in ARGS that contains the target attribute information
9762 and update the global target options space. PRAGMA_OR_ATTR is a string
9763 to be used in error messages, specifying whether this is processing
9764 a target attribute or a target pragma. */
9767 aarch64_process_target_attr (tree args
, const char* pragma_or_attr
)
9769 if (TREE_CODE (args
) == TREE_LIST
)
9773 tree head
= TREE_VALUE (args
);
9776 if (!aarch64_process_target_attr (head
, pragma_or_attr
))
9779 args
= TREE_CHAIN (args
);
9785 if (TREE_CODE (args
) != STRING_CST
)
9787 error ("attribute %<target%> argument not a string");
9791 size_t len
= strlen (TREE_STRING_POINTER (args
));
9792 char *str_to_check
= (char *) alloca (len
+ 1);
9793 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
9797 error ("malformed target %s value", pragma_or_attr
);
9801 /* Used to catch empty spaces between commas i.e.
9802 attribute ((target ("attr1,,attr2"))). */
9803 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
9805 /* Handle multiple target attributes separated by ','. */
9806 char *token
= strtok (str_to_check
, ",");
9808 unsigned int num_attrs
= 0;
9812 if (!aarch64_process_one_target_attr (token
, pragma_or_attr
))
9814 error ("target %s %qs is invalid", pragma_or_attr
, token
);
9818 token
= strtok (NULL
, ",");
9821 if (num_attrs
!= num_commas
+ 1)
9823 error ("malformed target %s list %qs",
9824 pragma_or_attr
, TREE_STRING_POINTER (args
));
9831 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9832 process attribute ((target ("..."))). */
9835 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
9837 struct cl_target_option cur_target
;
9840 tree new_target
, new_optimize
;
9841 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9843 /* If what we're processing is the current pragma string then the
9844 target option node is already stored in target_option_current_node
9845 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9846 having to re-parse the string. This is especially useful to keep
9847 arm_neon.h compile times down since that header contains a lot
9848 of intrinsics enclosed in pragmas. */
9849 if (!existing_target
&& args
== current_target_pragma
)
9851 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
9854 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9856 old_optimize
= build_optimization_node (&global_options
);
9857 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9859 /* If the function changed the optimization levels as well as setting
9860 target options, start with the optimizations specified. */
9861 if (func_optimize
&& func_optimize
!= old_optimize
)
9862 cl_optimization_restore (&global_options
,
9863 TREE_OPTIMIZATION (func_optimize
));
9865 /* Save the current target options to restore at the end. */
9866 cl_target_option_save (&cur_target
, &global_options
);
9868 /* If fndecl already has some target attributes applied to it, unpack
9869 them so that we add this attribute on top of them, rather than
9870 overwriting them. */
9871 if (existing_target
)
9873 struct cl_target_option
*existing_options
9874 = TREE_TARGET_OPTION (existing_target
);
9876 if (existing_options
)
9877 cl_target_option_restore (&global_options
, existing_options
);
9880 cl_target_option_restore (&global_options
,
9881 TREE_TARGET_OPTION (target_option_current_node
));
9884 ret
= aarch64_process_target_attr (args
, "attribute");
9886 /* Set up any additional state. */
9889 aarch64_override_options_internal (&global_options
);
9890 /* Initialize SIMD builtins if we haven't already.
9891 Set current_target_pragma to NULL for the duration so that
9892 the builtin initialization code doesn't try to tag the functions
9893 being built with the attributes specified by any current pragma, thus
9894 going into an infinite recursion. */
9897 tree saved_current_target_pragma
= current_target_pragma
;
9898 current_target_pragma
= NULL
;
9899 aarch64_init_simd_builtins ();
9900 current_target_pragma
= saved_current_target_pragma
;
9902 new_target
= build_target_option_node (&global_options
);
9907 new_optimize
= build_optimization_node (&global_options
);
9911 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
9913 if (old_optimize
!= new_optimize
)
9914 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
9917 cl_target_option_restore (&global_options
, &cur_target
);
9919 if (old_optimize
!= new_optimize
)
9920 cl_optimization_restore (&global_options
,
9921 TREE_OPTIMIZATION (old_optimize
));
9925 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9926 tri-bool options (yes, no, don't care) and the default value is
9927 DEF, determine whether to reject inlining. */
9930 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
9931 int dont_care
, int def
)
9933 /* If the callee doesn't care, always allow inlining. */
9934 if (callee
== dont_care
)
9937 /* If the caller doesn't care, always allow inlining. */
9938 if (caller
== dont_care
)
9941 /* Otherwise, allow inlining if either the callee and caller values
9942 agree, or if the callee is using the default value. */
9943 return (callee
== caller
|| callee
== def
);
9946 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9947 to inline CALLEE into CALLER based on target-specific info.
9948 Make sure that the caller and callee have compatible architectural
9949 features. Then go through the other possible target attributes
9950 and see if they can block inlining. Try not to reject always_inline
9951 callees unless they are incompatible architecturally. */
9954 aarch64_can_inline_p (tree caller
, tree callee
)
9956 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
9957 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
9959 /* If callee has no option attributes, then it is ok to inline. */
9963 struct cl_target_option
*caller_opts
9964 = TREE_TARGET_OPTION (caller_tree
? caller_tree
9965 : target_option_default_node
);
9967 struct cl_target_option
*callee_opts
= TREE_TARGET_OPTION (callee_tree
);
9970 /* Callee's ISA flags should be a subset of the caller's. */
9971 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
9972 != callee_opts
->x_aarch64_isa_flags
)
9975 /* Allow non-strict aligned functions inlining into strict
9977 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
9978 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
9979 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
9980 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
9983 bool always_inline
= lookup_attribute ("always_inline",
9984 DECL_ATTRIBUTES (callee
));
9986 /* If the architectural features match up and the callee is always_inline
9987 then the other attributes don't matter. */
9991 if (caller_opts
->x_aarch64_cmodel_var
9992 != callee_opts
->x_aarch64_cmodel_var
)
9995 if (caller_opts
->x_aarch64_tls_dialect
9996 != callee_opts
->x_aarch64_tls_dialect
)
9999 /* Honour explicit requests to workaround errata. */
10000 if (!aarch64_tribools_ok_for_inlining_p (
10001 caller_opts
->x_aarch64_fix_a53_err835769
,
10002 callee_opts
->x_aarch64_fix_a53_err835769
,
10003 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
10006 if (!aarch64_tribools_ok_for_inlining_p (
10007 caller_opts
->x_aarch64_fix_a53_err843419
,
10008 callee_opts
->x_aarch64_fix_a53_err843419
,
10009 2, TARGET_FIX_ERR_A53_843419
))
10012 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10013 caller and calle and they don't match up, reject inlining. */
10014 if (!aarch64_tribools_ok_for_inlining_p (
10015 caller_opts
->x_flag_omit_leaf_frame_pointer
,
10016 callee_opts
->x_flag_omit_leaf_frame_pointer
,
10020 /* If the callee has specific tuning overrides, respect them. */
10021 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
10022 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
10025 /* If the user specified tuning override strings for the
10026 caller and callee and they don't match up, reject inlining.
10027 We just do a string compare here, we don't analyze the meaning
10028 of the string, as it would be too costly for little gain. */
10029 if (callee_opts
->x_aarch64_override_tune_string
10030 && caller_opts
->x_aarch64_override_tune_string
10031 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
10032 caller_opts
->x_aarch64_override_tune_string
) != 0))
10038 /* Return true if SYMBOL_REF X binds locally. */
10041 aarch64_symbol_binds_local_p (const_rtx x
)
10043 return (SYMBOL_REF_DECL (x
)
10044 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
10045 : SYMBOL_REF_LOCAL_P (x
));
10048 /* Return true if SYMBOL_REF X is thread local */
10050 aarch64_tls_symbol_p (rtx x
)
10052 if (! TARGET_HAVE_TLS
)
10055 if (GET_CODE (x
) != SYMBOL_REF
)
10058 return SYMBOL_REF_TLS_MODEL (x
) != 0;
10061 /* Classify a TLS symbol into one of the TLS kinds. */
10062 enum aarch64_symbol_type
10063 aarch64_classify_tls_symbol (rtx x
)
10065 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
10069 case TLS_MODEL_GLOBAL_DYNAMIC
:
10070 case TLS_MODEL_LOCAL_DYNAMIC
:
10071 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
10073 case TLS_MODEL_INITIAL_EXEC
:
10074 switch (aarch64_cmodel
)
10076 case AARCH64_CMODEL_TINY
:
10077 case AARCH64_CMODEL_TINY_PIC
:
10078 return SYMBOL_TINY_TLSIE
;
10080 return SYMBOL_SMALL_TLSIE
;
10083 case TLS_MODEL_LOCAL_EXEC
:
10084 if (aarch64_tls_size
== 12)
10085 return SYMBOL_TLSLE12
;
10086 else if (aarch64_tls_size
== 24)
10087 return SYMBOL_TLSLE24
;
10088 else if (aarch64_tls_size
== 32)
10089 return SYMBOL_TLSLE32
;
10090 else if (aarch64_tls_size
== 48)
10091 return SYMBOL_TLSLE48
;
10093 gcc_unreachable ();
10095 case TLS_MODEL_EMULATED
:
10096 case TLS_MODEL_NONE
:
10097 return SYMBOL_FORCE_TO_MEM
;
10100 gcc_unreachable ();
10104 /* Return the method that should be used to access SYMBOL_REF or
10107 enum aarch64_symbol_type
10108 aarch64_classify_symbol (rtx x
, rtx offset
)
10110 if (GET_CODE (x
) == LABEL_REF
)
10112 switch (aarch64_cmodel
)
10114 case AARCH64_CMODEL_LARGE
:
10115 return SYMBOL_FORCE_TO_MEM
;
10117 case AARCH64_CMODEL_TINY_PIC
:
10118 case AARCH64_CMODEL_TINY
:
10119 return SYMBOL_TINY_ABSOLUTE
;
10121 case AARCH64_CMODEL_SMALL_SPIC
:
10122 case AARCH64_CMODEL_SMALL_PIC
:
10123 case AARCH64_CMODEL_SMALL
:
10124 return SYMBOL_SMALL_ABSOLUTE
;
10127 gcc_unreachable ();
10131 if (GET_CODE (x
) == SYMBOL_REF
)
10133 if (aarch64_tls_symbol_p (x
))
10134 return aarch64_classify_tls_symbol (x
);
10136 switch (aarch64_cmodel
)
10138 case AARCH64_CMODEL_TINY
:
10139 /* When we retrieve symbol + offset address, we have to make sure
10140 the offset does not cause overflow of the final address. But
10141 we have no way of knowing the address of symbol at compile time
10142 so we can't accurately say if the distance between the PC and
10143 symbol + offset is outside the addressible range of +/-1M in the
10144 TINY code model. So we rely on images not being greater than
10145 1M and cap the offset at 1M and anything beyond 1M will have to
10146 be loaded using an alternative mechanism. Furthermore if the
10147 symbol is a weak reference to something that isn't known to
10148 resolve to a symbol in this module, then force to memory. */
10149 if ((SYMBOL_REF_WEAK (x
)
10150 && !aarch64_symbol_binds_local_p (x
))
10151 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
10152 return SYMBOL_FORCE_TO_MEM
;
10153 return SYMBOL_TINY_ABSOLUTE
;
10155 case AARCH64_CMODEL_SMALL
:
10156 /* Same reasoning as the tiny code model, but the offset cap here is
10158 if ((SYMBOL_REF_WEAK (x
)
10159 && !aarch64_symbol_binds_local_p (x
))
10160 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
10161 HOST_WIDE_INT_C (4294967264)))
10162 return SYMBOL_FORCE_TO_MEM
;
10163 return SYMBOL_SMALL_ABSOLUTE
;
10165 case AARCH64_CMODEL_TINY_PIC
:
10166 if (!aarch64_symbol_binds_local_p (x
))
10167 return SYMBOL_TINY_GOT
;
10168 return SYMBOL_TINY_ABSOLUTE
;
10170 case AARCH64_CMODEL_SMALL_SPIC
:
10171 case AARCH64_CMODEL_SMALL_PIC
:
10172 if (!aarch64_symbol_binds_local_p (x
))
10173 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
10174 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
10175 return SYMBOL_SMALL_ABSOLUTE
;
10177 case AARCH64_CMODEL_LARGE
:
10178 /* This is alright even in PIC code as the constant
10179 pool reference is always PC relative and within
10180 the same translation unit. */
10181 if (CONSTANT_POOL_ADDRESS_P (x
))
10182 return SYMBOL_SMALL_ABSOLUTE
;
10184 return SYMBOL_FORCE_TO_MEM
;
10187 gcc_unreachable ();
10191 /* By default push everything into the constant pool. */
10192 return SYMBOL_FORCE_TO_MEM
;
10196 aarch64_constant_address_p (rtx x
)
10198 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
10202 aarch64_legitimate_pic_operand_p (rtx x
)
10204 if (GET_CODE (x
) == SYMBOL_REF
10205 || (GET_CODE (x
) == CONST
10206 && GET_CODE (XEXP (x
, 0)) == PLUS
10207 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
10213 /* Return true if X holds either a quarter-precision or
10214 floating-point +0.0 constant. */
10216 aarch64_valid_floating_const (machine_mode mode
, rtx x
)
10218 if (!CONST_DOUBLE_P (x
))
10221 if (aarch64_float_const_zero_rtx_p (x
))
10224 /* We only handle moving 0.0 to a TFmode register. */
10225 if (!(mode
== SFmode
|| mode
== DFmode
))
10228 return aarch64_float_const_representable_p (x
);
10232 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
10234 /* Do not allow vector struct mode constants. We could support
10235 0 and -1 easily, but they need support in aarch64-simd.md. */
10236 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
10239 /* This could probably go away because
10240 we now decompose CONST_INTs according to expand_mov_immediate. */
10241 if ((GET_CODE (x
) == CONST_VECTOR
10242 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
10243 || CONST_INT_P (x
) || aarch64_valid_floating_const (mode
, x
))
10244 return !targetm
.cannot_force_const_mem (mode
, x
);
10246 if (GET_CODE (x
) == HIGH
10247 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
10250 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10251 so spilling them is better than rematerialization. */
10252 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
10255 return aarch64_constant_address_p (x
);
10259 aarch64_load_tp (rtx target
)
10262 || GET_MODE (target
) != Pmode
10263 || !register_operand (target
, Pmode
))
10264 target
= gen_reg_rtx (Pmode
);
10266 /* Can return in any reg. */
10267 emit_insn (gen_aarch64_load_tp_hard (target
));
10271 /* On AAPCS systems, this is the "struct __va_list". */
10272 static GTY(()) tree va_list_type
;
10274 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10275 Return the type to use as __builtin_va_list.
10277 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10289 aarch64_build_builtin_va_list (void)
10292 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10294 /* Create the type. */
10295 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
10296 /* Give it the required name. */
10297 va_list_name
= build_decl (BUILTINS_LOCATION
,
10299 get_identifier ("__va_list"),
10301 DECL_ARTIFICIAL (va_list_name
) = 1;
10302 TYPE_NAME (va_list_type
) = va_list_name
;
10303 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
10305 /* Create the fields. */
10306 f_stack
= build_decl (BUILTINS_LOCATION
,
10307 FIELD_DECL
, get_identifier ("__stack"),
10309 f_grtop
= build_decl (BUILTINS_LOCATION
,
10310 FIELD_DECL
, get_identifier ("__gr_top"),
10312 f_vrtop
= build_decl (BUILTINS_LOCATION
,
10313 FIELD_DECL
, get_identifier ("__vr_top"),
10315 f_groff
= build_decl (BUILTINS_LOCATION
,
10316 FIELD_DECL
, get_identifier ("__gr_offs"),
10317 integer_type_node
);
10318 f_vroff
= build_decl (BUILTINS_LOCATION
,
10319 FIELD_DECL
, get_identifier ("__vr_offs"),
10320 integer_type_node
);
10322 /* Tell tree-stdarg pass about our internal offset fields.
10323 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10324 purpose to identify whether the code is updating va_list internal
10325 offset fields through irregular way. */
10326 va_list_gpr_counter_field
= f_groff
;
10327 va_list_fpr_counter_field
= f_vroff
;
10329 DECL_ARTIFICIAL (f_stack
) = 1;
10330 DECL_ARTIFICIAL (f_grtop
) = 1;
10331 DECL_ARTIFICIAL (f_vrtop
) = 1;
10332 DECL_ARTIFICIAL (f_groff
) = 1;
10333 DECL_ARTIFICIAL (f_vroff
) = 1;
10335 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
10336 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
10337 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
10338 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
10339 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
10341 TYPE_FIELDS (va_list_type
) = f_stack
;
10342 DECL_CHAIN (f_stack
) = f_grtop
;
10343 DECL_CHAIN (f_grtop
) = f_vrtop
;
10344 DECL_CHAIN (f_vrtop
) = f_groff
;
10345 DECL_CHAIN (f_groff
) = f_vroff
;
10347 /* Compute its layout. */
10348 layout_type (va_list_type
);
10350 return va_list_type
;
10353 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10355 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
10357 const CUMULATIVE_ARGS
*cum
;
10358 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10359 tree stack
, grtop
, vrtop
, groff
, vroff
;
10361 int gr_save_area_size
= cfun
->va_list_gpr_size
;
10362 int vr_save_area_size
= cfun
->va_list_fpr_size
;
10365 cum
= &crtl
->args
.info
;
10366 if (cfun
->va_list_gpr_size
)
10367 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
10368 cfun
->va_list_gpr_size
);
10369 if (cfun
->va_list_fpr_size
)
10370 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
10371 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
10375 gcc_assert (cum
->aapcs_nvrn
== 0);
10376 vr_save_area_size
= 0;
10379 f_stack
= TYPE_FIELDS (va_list_type_node
);
10380 f_grtop
= DECL_CHAIN (f_stack
);
10381 f_vrtop
= DECL_CHAIN (f_grtop
);
10382 f_groff
= DECL_CHAIN (f_vrtop
);
10383 f_vroff
= DECL_CHAIN (f_groff
);
10385 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
10387 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
10389 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
10391 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
10393 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
10396 /* Emit code to initialize STACK, which points to the next varargs stack
10397 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10398 by named arguments. STACK is 8-byte aligned. */
10399 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
10400 if (cum
->aapcs_stack_size
> 0)
10401 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
10402 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
10403 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10405 /* Emit code to initialize GRTOP, the top of the GR save area.
10406 virtual_incoming_args_rtx should have been 16 byte aligned. */
10407 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
10408 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
10409 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10411 /* Emit code to initialize VRTOP, the top of the VR save area.
10412 This address is gr_save_area_bytes below GRTOP, rounded
10413 down to the next 16-byte boundary. */
10414 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
10415 vr_offset
= ROUND_UP (gr_save_area_size
,
10416 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10419 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
10420 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
10421 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10423 /* Emit code to initialize GROFF, the offset from GRTOP of the
10424 next GPR argument. */
10425 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
10426 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
10427 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10429 /* Likewise emit code to initialize VROFF, the offset from FTOP
10430 of the next VR argument. */
10431 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
10432 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
10433 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10436 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10439 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
10440 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
10444 bool is_ha
; /* is HFA or HVA. */
10445 bool dw_align
; /* double-word align. */
10446 machine_mode ag_mode
= VOIDmode
;
10450 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10451 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
10452 HOST_WIDE_INT size
, rsize
, adjust
, align
;
10453 tree t
, u
, cond1
, cond2
;
10455 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
10457 type
= build_pointer_type (type
);
10459 mode
= TYPE_MODE (type
);
10461 f_stack
= TYPE_FIELDS (va_list_type_node
);
10462 f_grtop
= DECL_CHAIN (f_stack
);
10463 f_vrtop
= DECL_CHAIN (f_grtop
);
10464 f_groff
= DECL_CHAIN (f_vrtop
);
10465 f_vroff
= DECL_CHAIN (f_groff
);
10467 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
10468 f_stack
, NULL_TREE
);
10469 size
= int_size_in_bytes (type
);
10470 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
10474 if (aarch64_vfp_is_call_or_return_candidate (mode
,
10480 /* TYPE passed in fp/simd registers. */
10482 aarch64_err_no_fpadvsimd (mode
, "varargs");
10484 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
10485 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
10486 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
10487 unshare_expr (valist
), f_vroff
, NULL_TREE
);
10489 rsize
= nregs
* UNITS_PER_VREG
;
10493 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
10494 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
10496 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10497 && size
< UNITS_PER_VREG
)
10499 adjust
= UNITS_PER_VREG
- size
;
10504 /* TYPE passed in general registers. */
10505 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
10506 unshare_expr (valist
), f_grtop
, NULL_TREE
);
10507 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
10508 unshare_expr (valist
), f_groff
, NULL_TREE
);
10509 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
10510 nregs
= rsize
/ UNITS_PER_WORD
;
10515 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10516 && size
< UNITS_PER_WORD
)
10518 adjust
= UNITS_PER_WORD
- size
;
10522 /* Get a local temporary for the field value. */
10523 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
10525 /* Emit code to branch if off >= 0. */
10526 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
10527 build_int_cst (TREE_TYPE (off
), 0));
10528 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
10532 /* Emit: offs = (offs + 15) & -16. */
10533 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10534 build_int_cst (TREE_TYPE (off
), 15));
10535 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
10536 build_int_cst (TREE_TYPE (off
), -16));
10537 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
10542 /* Update ap.__[g|v]r_offs */
10543 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10544 build_int_cst (TREE_TYPE (off
), rsize
));
10545 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
10549 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10551 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10552 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
10553 build_int_cst (TREE_TYPE (f_off
), 0));
10554 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
10556 /* String up: make sure the assignment happens before the use. */
10557 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
10558 COND_EXPR_ELSE (cond1
) = t
;
10560 /* Prepare the trees handling the argument that is passed on the stack;
10561 the top level node will store in ON_STACK. */
10562 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
10565 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10566 t
= fold_convert (intDI_type_node
, arg
);
10567 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10568 build_int_cst (TREE_TYPE (t
), 15));
10569 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10570 build_int_cst (TREE_TYPE (t
), -16));
10571 t
= fold_convert (TREE_TYPE (arg
), t
);
10572 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
10576 /* Advance ap.__stack */
10577 t
= fold_convert (intDI_type_node
, arg
);
10578 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10579 build_int_cst (TREE_TYPE (t
), size
+ 7));
10580 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10581 build_int_cst (TREE_TYPE (t
), -8));
10582 t
= fold_convert (TREE_TYPE (arg
), t
);
10583 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
10584 /* String up roundup and advance. */
10586 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10587 /* String up with arg */
10588 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
10589 /* Big-endianness related address adjustment. */
10590 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10591 && size
< UNITS_PER_WORD
)
10593 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
10594 size_int (UNITS_PER_WORD
- size
));
10595 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
10598 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
10599 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
10601 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10604 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
10605 build_int_cst (TREE_TYPE (off
), adjust
));
10607 t
= fold_convert (sizetype
, t
);
10608 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
10612 /* type ha; // treat as "struct {ftype field[n];}"
10613 ... [computing offs]
10614 for (i = 0; i <nregs; ++i, offs += 16)
10615 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10618 tree tmp_ha
, field_t
, field_ptr_t
;
10620 /* Declare a local variable. */
10621 tmp_ha
= create_tmp_var_raw (type
, "ha");
10622 gimple_add_tmp_var (tmp_ha
);
10624 /* Establish the base type. */
10628 field_t
= float_type_node
;
10629 field_ptr_t
= float_ptr_type_node
;
10632 field_t
= double_type_node
;
10633 field_ptr_t
= double_ptr_type_node
;
10636 field_t
= long_double_type_node
;
10637 field_ptr_t
= long_double_ptr_type_node
;
10640 field_t
= aarch64_fp16_type_node
;
10641 field_ptr_t
= aarch64_fp16_ptr_type_node
;
10646 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
10647 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
10648 field_ptr_t
= build_pointer_type (field_t
);
10655 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10656 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
10658 t
= fold_convert (field_ptr_t
, addr
);
10659 t
= build2 (MODIFY_EXPR
, field_t
,
10660 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
10661 build1 (INDIRECT_REF
, field_t
, t
));
10663 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10664 for (i
= 1; i
< nregs
; ++i
)
10666 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
10667 u
= fold_convert (field_ptr_t
, addr
);
10668 u
= build2 (MODIFY_EXPR
, field_t
,
10669 build2 (MEM_REF
, field_t
, tmp_ha
,
10670 build_int_cst (field_ptr_t
,
10672 int_size_in_bytes (field_t
)))),
10673 build1 (INDIRECT_REF
, field_t
, u
));
10674 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
10677 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
10678 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
10681 COND_EXPR_ELSE (cond2
) = t
;
10682 addr
= fold_convert (build_pointer_type (type
), cond1
);
10683 addr
= build_va_arg_indirect_ref (addr
);
10686 addr
= build_va_arg_indirect_ref (addr
);
10691 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10694 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
10695 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
10698 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
10699 CUMULATIVE_ARGS local_cum
;
10700 int gr_saved
= cfun
->va_list_gpr_size
;
10701 int vr_saved
= cfun
->va_list_fpr_size
;
10703 /* The caller has advanced CUM up to, but not beyond, the last named
10704 argument. Advance a local copy of CUM past the last "real" named
10705 argument, to find out how many registers are left over. */
10707 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
10709 /* Found out how many registers we need to save.
10710 Honor tree-stdvar analysis results. */
10711 if (cfun
->va_list_gpr_size
)
10712 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
10713 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
10714 if (cfun
->va_list_fpr_size
)
10715 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
10716 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
10720 gcc_assert (local_cum
.aapcs_nvrn
== 0);
10730 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10731 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
10732 - gr_saved
* UNITS_PER_WORD
);
10733 mem
= gen_frame_mem (BLKmode
, ptr
);
10734 set_mem_alias_set (mem
, get_varargs_alias_set ());
10736 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
10741 /* We can't use move_block_from_reg, because it will use
10742 the wrong mode, storing D regs only. */
10743 machine_mode mode
= TImode
;
10744 int off
, i
, vr_start
;
10746 /* Set OFF to the offset from virtual_incoming_args_rtx of
10747 the first vector register. The VR save area lies below
10748 the GR one, and is aligned to 16 bytes. */
10749 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10750 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10751 off
-= vr_saved
* UNITS_PER_VREG
;
10753 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
10754 for (i
= 0; i
< vr_saved
; ++i
)
10758 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
10759 mem
= gen_frame_mem (mode
, ptr
);
10760 set_mem_alias_set (mem
, get_varargs_alias_set ());
10761 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
10762 off
+= UNITS_PER_VREG
;
10767 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10768 any complication of having crtl->args.pretend_args_size changed. */
10769 cfun
->machine
->frame
.saved_varargs_size
10770 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10771 STACK_BOUNDARY
/ BITS_PER_UNIT
)
10772 + vr_saved
* UNITS_PER_VREG
);
10776 aarch64_conditional_register_usage (void)
10781 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
10784 call_used_regs
[i
] = 1;
10789 /* Walk down the type tree of TYPE counting consecutive base elements.
10790 If *MODEP is VOIDmode, then set it to the first valid floating point
10791 type. If a non-floating point type is found, or if a floating point
10792 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10793 otherwise return the count in the sub-tree. */
10795 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
10798 HOST_WIDE_INT size
;
10800 switch (TREE_CODE (type
))
10803 mode
= TYPE_MODE (type
);
10804 if (mode
!= DFmode
&& mode
!= SFmode
10805 && mode
!= TFmode
&& mode
!= HFmode
)
10808 if (*modep
== VOIDmode
)
10811 if (*modep
== mode
)
10817 mode
= TYPE_MODE (TREE_TYPE (type
));
10818 if (mode
!= DFmode
&& mode
!= SFmode
10819 && mode
!= TFmode
&& mode
!= HFmode
)
10822 if (*modep
== VOIDmode
)
10825 if (*modep
== mode
)
10831 /* Use V2SImode and V4SImode as representatives of all 64-bit
10832 and 128-bit vector types. */
10833 size
= int_size_in_bytes (type
);
10846 if (*modep
== VOIDmode
)
10849 /* Vector modes are considered to be opaque: two vectors are
10850 equivalent for the purposes of being homogeneous aggregates
10851 if they are the same size. */
10852 if (*modep
== mode
)
10860 tree index
= TYPE_DOMAIN (type
);
10862 /* Can't handle incomplete types nor sizes that are not
10864 if (!COMPLETE_TYPE_P (type
)
10865 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10868 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
10871 || !TYPE_MAX_VALUE (index
)
10872 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
10873 || !TYPE_MIN_VALUE (index
)
10874 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
10878 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
10879 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
10881 /* There must be no padding. */
10882 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10894 /* Can't handle incomplete types nor sizes that are not
10896 if (!COMPLETE_TYPE_P (type
)
10897 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10900 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10902 if (TREE_CODE (field
) != FIELD_DECL
)
10905 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10908 count
+= sub_count
;
10911 /* There must be no padding. */
10912 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10919 case QUAL_UNION_TYPE
:
10921 /* These aren't very interesting except in a degenerate case. */
10926 /* Can't handle incomplete types nor sizes that are not
10928 if (!COMPLETE_TYPE_P (type
)
10929 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10932 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10934 if (TREE_CODE (field
) != FIELD_DECL
)
10937 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10940 count
= count
> sub_count
? count
: sub_count
;
10943 /* There must be no padding. */
10944 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10957 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10958 type as described in AAPCS64 \S 4.1.2.
10960 See the comment above aarch64_composite_type_p for the notes on MODE. */
10963 aarch64_short_vector_p (const_tree type
,
10966 HOST_WIDE_INT size
= -1;
10968 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
10969 size
= int_size_in_bytes (type
);
10970 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
10971 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
10972 size
= GET_MODE_SIZE (mode
);
10974 return (size
== 8 || size
== 16);
10977 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10978 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10979 array types. The C99 floating-point complex types are also considered
10980 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10981 types, which are GCC extensions and out of the scope of AAPCS64, are
10982 treated as composite types here as well.
10984 Note that MODE itself is not sufficient in determining whether a type
10985 is such a composite type or not. This is because
10986 stor-layout.c:compute_record_mode may have already changed the MODE
10987 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10988 structure with only one field may have its MODE set to the mode of the
10989 field. Also an integer mode whose size matches the size of the
10990 RECORD_TYPE type may be used to substitute the original mode
10991 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10992 solely relied on. */
10995 aarch64_composite_type_p (const_tree type
,
10998 if (aarch64_short_vector_p (type
, mode
))
11001 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
11004 if (mode
== BLKmode
11005 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
11006 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */
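/* Illustrative sketch (an assumption added for exposition, not from the
   original source): for an argument of type

     struct hfa { float x; float y; float z; };

   this function would return true with *count == 3, *base_mode == SFmode
   and *is_ha == true, so the aggregate can be passed in three consecutive
   S registers when they are available.  */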
static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}
/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  if (TARGET_SIMD
      && (mode == V4SImode  || mode == V8HImode
	  || mode == V16QImode || mode == V2DImode
	  || mode == V2SImode  || mode == V4HImode
	  || mode == V8QImode || mode == V2SFmode
	  || mode == V4SFmode || mode == V2DFmode
	  || mode == V4HFmode || mode == V8HFmode
	  || mode == V1DFmode))
    return true;

  return false;
}
11090 /* Return appropriate SIMD container
11091 for MODE within a vector of WIDTH bits. */
11092 static machine_mode
11093 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
11095 gcc_assert (width
== 64 || width
== 128);
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (machine_mode mode)
{
  return aarch64_simd_container_mode (mode, 128);
}

/* Return the bitmask of possible vector sizes for the vectorizer
   to iterate over.  */
static unsigned int
aarch64_autovectorize_vector_sizes (void)
{
  return (16 | 8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn* insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}
/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}

/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Return true iff x is a uniform vector of floating-point
   constants, and the constant can be represented in
   quarter-precision form.  Note, as aarch64_float_const_representable
   rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
static bool
aarch64_vect_float_const_representable_p (rtx x)
{
  rtx elt;
  return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
	  && const_vec_duplicate_p (x, &elt)
	  && aarch64_float_const_representable_p (elt));
}
11323 /* Return true for valid and false for invalid. */
11325 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
11326 struct simd_immediate_info
*info
)
11328 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11330 for (i = 0; i < idx; i += (STRIDE)) \
11335 immtype = (CLASS); \
11336 elsize = (ELSIZE); \
11337 eshift = (SHIFT); \
11342 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
11343 unsigned int innersize
= GET_MODE_UNIT_SIZE (mode
);
11344 unsigned char bytes
[16];
11345 int immtype
= -1, matches
;
11346 unsigned int invmask
= inverse
? 0xff : 0;
11349 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11351 if (! (aarch64_simd_imm_zero_p (op
, mode
)
11352 || aarch64_vect_float_const_representable_p (op
)))
11357 info
->value
= CONST_VECTOR_ELT (op
, 0);
11358 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
11366 /* Splat vector constant out into a byte vector. */
11367 for (i
= 0; i
< n_elts
; i
++)
11369 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11370 it must be laid out in the vector register in reverse order. */
11371 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
11372 unsigned HOST_WIDE_INT elpart
;
11374 gcc_assert (CONST_INT_P (el
));
11375 elpart
= INTVAL (el
);
11377 for (unsigned int byte
= 0; byte
< innersize
; byte
++)
11379 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
11380 elpart
>>= BITS_PER_UNIT
;
11385 /* Sanity check. */
11386 gcc_assert (idx
== GET_MODE_SIZE (mode
));
11390 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
11391 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
11393 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11394 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11396 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11397 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11399 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11400 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
11402 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
11404 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
11406 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
11407 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
11409 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11410 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11412 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11413 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11415 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11416 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
11418 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
11420 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
11422 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11423 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11425 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11426 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11428 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11429 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11431 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11432 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11434 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
11436 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
11437 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
11446 info
->element_width
= elsize
;
11447 info
->mvn
= emvn
!= 0;
11448 info
->shift
= eshift
;
11450 unsigned HOST_WIDE_INT imm
= 0;
11452 if (immtype
>= 12 && immtype
<= 15)
11455 /* Un-invert bytes of recognized vector, if necessary. */
11457 for (i
= 0; i
< idx
; i
++)
11458 bytes
[i
] ^= invmask
;
11462 /* FIXME: Broken on 32-bit H_W_I hosts. */
11463 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
11465 for (i
= 0; i
< 8; i
++)
11466 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
11467 << (i
* BITS_PER_UNIT
);
11470 info
->value
= GEN_INT (imm
);
11474 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
11475 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
11477 /* Construct 'abcdefgh' because the assembler cannot handle
11478 generic constants. */
11481 imm
= (imm
>> info
->shift
) & 0xff;
11482 info
->value
= GEN_INT (imm
);
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}

/* Return true if X is a uniform vector where all elements
   are either the floating-point constant 0.0 or the
   integer constant 0.  */
bool
aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
{
  return x == CONST0_RTX (mode);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */
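/* Worked example (illustrative, not from the original source): for a zero
   extract of WIDTH = 8 bits at POS = 16, the computation below yields
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. a mask covering exactly the byte
   being extracted.  */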
rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
11526 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
11528 HOST_WIDE_INT imm
= INTVAL (x
);
11531 for (i
= 0; i
< 8; i
++)
11533 unsigned int byte
= imm
& 0xff;
11534 if (byte
!= 0xff && byte
!= 0)
11543 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
11545 if (GET_CODE (x
) == HIGH
11546 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
11549 if (CONST_INT_P (x
))
11552 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
11555 return aarch64_classify_symbolic_expression (x
)
11556 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  int nunits = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (nunits);
  int i;

  rtx cache = GEN_INT (val);

  for (i = 0; i < nunits; i++)
    RTVEC_ELT (v, i) = cache;

  return gen_rtx_CONST_VECTOR (mode, v);
}
/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
{
  machine_mode vmode;

  gcc_assert (!VECTOR_MODE_P (mode));
  vmode = aarch64_preferred_simd_mode (mode);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
}
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target must be described using
   a mask selecting GCC high-lanes.

                 Big-Endian             Little-Endian

GCC             0   1   2   3           3   2   1   0
              | x | x | x | x |       | x | x | x | x |
Architecture    3   2   1   0           3   2   1   0

Low Mask:         { 2, 3 }                { 0, 1 }
High Mask:        { 0, 1 }                { 2, 3 }
*/
rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
{
  int nunits = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
11631 /* Check OP for validity as a PARALLEL RTX vector with elements
11632 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11633 from the perspective of the architecture. See the diagram above
11634 aarch64_simd_vect_par_cnst_half for more details. */
11637 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
11640 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
11641 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
11642 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
11645 if (!VECTOR_MODE_P (mode
))
11648 if (count_op
!= count_ideal
)
11651 for (i
= 0; i
< count_ideal
; i
++)
11653 rtx elt_op
= XVECEXP (op
, 0, i
);
11654 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
11656 if (!CONST_INT_P (elt_op
)
11657 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
11663 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11664 HIGH (exclusive). */
11666 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
11669 HOST_WIDE_INT lane
;
11670 gcc_assert (CONST_INT_P (operand
));
11671 lane
= INTVAL (operand
);
11673 if (lane
< low
|| lane
>= high
)
11676 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
11678 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
11682 /* Return TRUE if OP is a valid vector addressing mode. */
11684 aarch64_simd_mem_operand_p (rtx op
)
11686 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
11687 || REG_P (XEXP (op
, 0)));
11690 /* Emit a register copy from operand to operand, taking care not to
11691 early-clobber source registers in the process.
11693 COUNT is the number of components into which the copy needs to be
11696 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
11697 unsigned int count
)
11700 int rdest
= REGNO (operands
[0]);
11701 int rsrc
= REGNO (operands
[1]);
11703 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
11705 for (i
= 0; i
< count
; i
++)
11706 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
11707 gen_rtx_REG (mode
, rsrc
+ i
));
11709 for (i
= 0; i
< count
; i
++)
11710 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
11711 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
11714 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11715 one of VSTRUCT modes: OI, CI, or XI. */
11717 aarch64_simd_attr_length_rglist (machine_mode mode
)
11719 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  */
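/* Example (added for exposition, not from the original source): a 64-bit
   vector type keeps its natural 64-bit alignment, while a 256-bit GNU
   vector type is capped at 128-bit alignment by the MIN below, as AAPCS64
   requires.  */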
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
  return MIN (align, 128);
}
11731 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11733 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
11738 /* We guarantee alignment for vectors up to 128-bits. */
11739 if (tree_int_cst_compare (TYPE_SIZE (type
),
11740 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
11743 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11747 /* Return true if the vector misalignment factor is supported by the
11750 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
11751 const_tree type
, int misalignment
,
11754 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
11756 /* Return if movmisalign pattern is not supported for this mode. */
11757 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
11760 if (misalignment
== -1)
11762 /* Misalignment factor is unknown at compile time but we know
11763 it's word aligned. */
11764 if (aarch64_simd_vector_alignment_reachable (type
, is_packed
))
11766 int element_size
= TREE_INT_CST_LOW (TYPE_SIZE (type
));
11768 if (element_size
!= 64)
11774 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
11778 /* If VALS is a vector constant that can be loaded into a register
11779 using DUP, generate instructions to do so and return an RTX to
11780 assign to the register. Otherwise return NULL_RTX. */
11782 aarch64_simd_dup_constant (rtx vals
)
11784 machine_mode mode
= GET_MODE (vals
);
11785 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11788 if (!const_vec_duplicate_p (vals
, &x
))
11791 /* We can load this constant by using DUP and a constant in a
11792 single ARM register. This will be cheaper than a vector
11794 x
= copy_to_mode_reg (inner_mode
, x
);
11795 return gen_rtx_VEC_DUPLICATE (mode
, x
);
/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
11804 aarch64_simd_make_constant (rtx vals
)
11806 machine_mode mode
= GET_MODE (vals
);
11808 rtx const_vec
= NULL_RTX
;
11809 int n_elts
= GET_MODE_NUNITS (mode
);
11813 if (GET_CODE (vals
) == CONST_VECTOR
)
11815 else if (GET_CODE (vals
) == PARALLEL
)
11817 /* A CONST_VECTOR must contain only CONST_INTs and
11818 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11819 Only store valid constants in a CONST_VECTOR. */
11820 for (i
= 0; i
< n_elts
; ++i
)
11822 rtx x
= XVECEXP (vals
, 0, i
);
11823 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11826 if (n_const
== n_elts
)
11827 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
11830 gcc_unreachable ();
  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We cannot take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We cannot construct an initializer.  */
    return NULL_RTX;
}
11849 /* Expand a vector initialisation sequence, such that TARGET is
11850 initialised to contain VALS. */
11853 aarch64_expand_vector_init (rtx target
, rtx vals
)
11855 machine_mode mode
= GET_MODE (target
);
11856 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11857 /* The number of vector elements. */
11858 int n_elts
= GET_MODE_NUNITS (mode
);
11859 /* The number of vector elements which are not constant. */
11861 rtx any_const
= NULL_RTX
;
11862 /* The first element of vals. */
11863 rtx v0
= XVECEXP (vals
, 0, 0);
11864 bool all_same
= true;
11866 /* Count the number of variable elements to initialise. */
11867 for (int i
= 0; i
< n_elts
; ++i
)
11869 rtx x
= XVECEXP (vals
, 0, i
);
11870 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
11875 all_same
&= rtx_equal_p (x
, v0
);
11878 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11879 how best to handle this. */
11882 rtx constant
= aarch64_simd_make_constant (vals
);
11883 if (constant
!= NULL_RTX
)
11885 emit_move_insn (target
, constant
);
11890 /* Splat a single non-constant element if we can. */
11893 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
11894 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
11898 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
11899 gcc_assert (icode
!= CODE_FOR_nothing
);
  /* If there are only variable elements, try to optimize
     the insertion using dup for the most common element
     followed by insertions.  */

  /* The algorithm will fill matches[*][0] with the earliest matching element,
     and matches[X][1] with the count of duplicate elements (if X is the
     earliest element which has duplicates).  */
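  /* Worked example (illustrative, not from the original source): for the
     V4SI initialiser { a, b, a, a } the loop below produces
     matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 } and
     matches[3] = { 0, 0 }, so element 0 ("a") is chosen as the DUP value
     and only element 1 ("b") needs an explicit lane insert.  */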
11909 if (n_var
== n_elts
&& n_elts
<= 16)
11911 int matches
[16][2] = {0};
11912 for (int i
= 0; i
< n_elts
; i
++)
11914 for (int j
= 0; j
<= i
; j
++)
11916 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
11924 int maxelement
= 0;
11926 for (int i
= 0; i
< n_elts
; i
++)
11927 if (matches
[i
][1] > maxv
)
11930 maxv
= matches
[i
][1];
11933 /* Create a duplicate of the most common element. */
11934 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
11935 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
11937 /* Insert the rest. */
11938 for (int i
= 0; i
< n_elts
; i
++)
11940 rtx x
= XVECEXP (vals
, 0, i
);
11941 if (matches
[i
][0] == maxelement
)
11943 x
= copy_to_mode_reg (inner_mode
, x
);
11944 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
11949 /* Initialise a vector which is part-variable. We want to first try
11950 to build those lanes which are constant in the most efficient way we
11952 if (n_var
!= n_elts
)
11954 rtx copy
= copy_rtx (vals
);
11956 /* Load constant part of vector. We really don't care what goes into the
11957 parts we will overwrite, but we're more likely to be able to load the
11958 constant efficiently if it has fewer, larger, repeating parts
11959 (see aarch64_simd_valid_immediate). */
11960 for (int i
= 0; i
< n_elts
; i
++)
11962 rtx x
= XVECEXP (vals
, 0, i
);
11963 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11965 rtx subst
= any_const
;
11966 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
11968 /* Look in the copied vector, as more elements are const. */
11969 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
11970 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
11976 XVECEXP (copy
, 0, i
) = subst
;
11978 aarch64_expand_vector_init (target
, copy
);
11981 /* Insert the variable lanes directly. */
11982 for (int i
= 0; i
< n_elts
; i
++)
11984 rtx x
= XVECEXP (vals
, 0, i
);
11985 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11987 x
= copy_to_mode_reg (inner_mode
, x
);
11988 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  return
    (!SHIFT_COUNT_TRUNCATED
     || aarch64_vector_mode_supported_p (mode)
     || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
}
12001 /* Select a format to encode pointers in exception handling data. */
12003 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
12006 switch (aarch64_cmodel
)
12008 case AARCH64_CMODEL_TINY
:
12009 case AARCH64_CMODEL_TINY_PIC
:
12010 case AARCH64_CMODEL_SMALL
:
12011 case AARCH64_CMODEL_SMALL_PIC
:
12012 case AARCH64_CMODEL_SMALL_SPIC
:
12013 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12015 type
= DW_EH_PE_sdata4
;
12018 /* No assumptions here. 8-byte relocs required. */
12019 type
= DW_EH_PE_sdata8
;
12022 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
12025 /* The last .arch and .tune assembly strings that we printed. */
12026 static std::string aarch64_last_printed_arch_string
;
12027 static std::string aarch64_last_printed_tune_string
;
12029 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12030 by the function fndecl. */
12033 aarch64_declare_function_name (FILE *stream
, const char* name
,
12036 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12038 struct cl_target_option
*targ_options
;
12040 targ_options
= TREE_TARGET_OPTION (target_parts
);
12042 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
12043 gcc_assert (targ_options
);
12045 const struct processor
*this_arch
12046 = aarch64_get_arch (targ_options
->x_explicit_arch
);
12048 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
12049 std::string extension
12050 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
12052 /* Only update the assembler .arch string if it is distinct from the last
12053 such string we printed. */
12054 std::string to_print
= this_arch
->name
+ extension
;
12055 if (to_print
!= aarch64_last_printed_arch_string
)
12057 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
12058 aarch64_last_printed_arch_string
= to_print
;
12061 /* Print the cpu name we're tuning for in the comments, might be
12062 useful to readers of the generated asm. Do it only when it changes
12063 from function to function and verbose assembly is requested. */
12064 const struct processor
*this_tune
12065 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
12067 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
12069 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
12071 aarch64_last_printed_tune_string
= this_tune
->name
;
12074 /* Don't forget the type directive for ELF. */
12075 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
12076 ASM_OUTPUT_LABEL (stream
, name
);
12079 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12082 aarch64_start_file (void)
12084 struct cl_target_option
*default_options
12085 = TREE_TARGET_OPTION (target_option_default_node
);
12087 const struct processor
*default_arch
12088 = aarch64_get_arch (default_options
->x_explicit_arch
);
12089 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
12090 std::string extension
12091 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
12092 default_arch
->flags
);
12094 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
12095 aarch64_last_printed_tune_string
= "";
12096 asm_fprintf (asm_out_file
, "\t.arch %s\n",
12097 aarch64_last_printed_arch_string
.c_str ());
12099 default_file_start ();
/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_load_exclusiveqi; break;
    case HImode: gen = gen_aarch64_load_exclusivehi; break;
    case SImode: gen = gen_aarch64_load_exclusivesi; break;
    case DImode: gen = gen_aarch64_load_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (rval, mem, model_rtx));
}
/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx rval, rtx mem, rtx model_rtx)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_store_exclusiveqi; break;
    case HImode: gen = gen_aarch64_store_exclusivehi; break;
    case SImode: gen = gen_aarch64_store_exclusivesi; break;
    case DImode: gen = gen_aarch64_store_exclusivedi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (bval, rval, mem, model_rtx));
}
/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  int very_unlikely = REG_BR_PROB_BASE / 100 - 1;

  rtx_insn *jump = emit_jump_insn (insn);
  add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
}
12155 /* Expand a compare and swap pattern. */
12158 aarch64_expand_compare_and_swap (rtx operands
[])
12160 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
12161 machine_mode mode
, cmp_mode
;
12162 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12165 const gen_cas_fn split_cas
[] =
12167 gen_aarch64_compare_and_swapqi
,
12168 gen_aarch64_compare_and_swaphi
,
12169 gen_aarch64_compare_and_swapsi
,
12170 gen_aarch64_compare_and_swapdi
12172 const gen_cas_fn atomic_cas
[] =
12174 gen_aarch64_compare_and_swapqi_lse
,
12175 gen_aarch64_compare_and_swaphi_lse
,
12176 gen_aarch64_compare_and_swapsi_lse
,
12177 gen_aarch64_compare_and_swapdi_lse
12180 bval
= operands
[0];
12181 rval
= operands
[1];
12183 oldval
= operands
[3];
12184 newval
= operands
[4];
12185 is_weak
= operands
[5];
12186 mod_s
= operands
[6];
12187 mod_f
= operands
[7];
12188 mode
= GET_MODE (mem
);
12191 /* Normally the succ memory model must be stronger than fail, but in the
12192 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12193 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12195 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
12196 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
12197 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
12203 /* For short modes, we're going to perform the comparison in SImode,
12204 so do the zero-extension now. */
12206 rval
= gen_reg_rtx (SImode
);
12207 oldval
= convert_modes (SImode
, mode
, oldval
, true);
12208 /* Fall through. */
12212 /* Force the value into a register if needed. */
12213 if (!aarch64_plus_operand (oldval
, mode
))
12214 oldval
= force_reg (cmp_mode
, oldval
);
12218 gcc_unreachable ();
12223 case QImode
: idx
= 0; break;
12224 case HImode
: idx
= 1; break;
12225 case SImode
: idx
= 2; break;
12226 case DImode
: idx
= 3; break;
12228 gcc_unreachable ();
12231 gen
= atomic_cas
[idx
];
12233 gen
= split_cas
[idx
];
12235 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
12237 if (mode
== QImode
|| mode
== HImode
)
12238 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
12240 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12241 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
12242 emit_insn (gen_rtx_SET (bval
, x
));
/* Test whether the target supports using an atomic load-operate instruction.
   CODE is the operation and AFTER is TRUE if the data in memory after the
   operation should be returned and FALSE if the data before the operation
   should be returned.  Returns FALSE if the operation isn't supported by the
   architecture.  */

bool
aarch64_atomic_ldop_supported_p (enum rtx_code code)
/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}
12288 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12289 for the data in memory. EXPECTED is the value expected to be in memory.
12290 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12291 is the memory ordering to use. */
12294 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
12295 rtx expected
, rtx desired
,
12298 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12301 mode
= GET_MODE (mem
);
12305 case QImode
: gen
= gen_aarch64_atomic_casqi
; break;
12306 case HImode
: gen
= gen_aarch64_atomic_cashi
; break;
12307 case SImode
: gen
= gen_aarch64_atomic_cassi
; break;
12308 case DImode
: gen
= gen_aarch64_atomic_casdi
; break;
12310 gcc_unreachable ();
12313 /* Move the expected value into the CAS destination register. */
12314 emit_insn (gen_rtx_SET (rval
, expected
));
12316 /* Emit the CAS. */
12317 emit_insn (gen (rval
, mem
, desired
, model
));
12319 /* Compare the expected value with the value loaded by the CAS, to establish
12320 whether the swap was made. */
12321 aarch64_gen_compare_reg (EQ
, rval
, expected
);
12324 /* Split a compare and swap pattern. */
12327 aarch64_split_compare_and_swap (rtx operands
[])
12329 rtx rval
, mem
, oldval
, newval
, scratch
;
12332 rtx_code_label
*label1
, *label2
;
12334 enum memmodel model
;
12337 rval
= operands
[0];
12339 oldval
= operands
[2];
12340 newval
= operands
[3];
12341 is_weak
= (operands
[4] != const0_rtx
);
12342 model_rtx
= operands
[5];
12343 scratch
= operands
[7];
12344 mode
= GET_MODE (mem
);
12345 model
= memmodel_from_int (INTVAL (model_rtx
));
12347 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12350 LD[A]XR rval, [mem]
12352 ST[L]XR scratch, newval, [mem]
12353 CBNZ scratch, .label1
12356 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
12361 label1
= gen_label_rtx ();
12362 emit_label (label1
);
12364 label2
= gen_label_rtx ();
12366 /* The initial load can be relaxed for a __sync operation since a final
12367 barrier will be emitted to stop code hoisting. */
12368 if (is_mm_sync (model
))
12369 aarch64_emit_load_exclusive (mode
, rval
, mem
,
12370 GEN_INT (MEMMODEL_RELAXED
));
12372 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
12376 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
12377 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12378 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12379 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12383 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
12384 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12385 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12386 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12387 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12390 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
12394 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
12395 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12396 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
12397 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12401 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12402 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
12403 emit_insn (gen_rtx_SET (cond
, x
));
12406 emit_label (label2
);
12407 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12408 to set the condition flags. If this is not used it will be removed by
12412 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12413 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
12414 emit_insn (gen_rtx_SET (cond
, x
));
12416 /* Emit any final barrier needed for a __sync operation. */
12417 if (is_mm_sync (model
))
12418 aarch64_emit_post_barrier (model
);
/* Emit a BIC instruction.  */

static void
aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
{
  rtx shift_rtx = GEN_INT (shift);
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
    case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, s2, shift_rtx, s1));
}
/* Emit an atomic swap.  */

static void
aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
			  rtx mem, rtx model)
{
  rtx (*gen) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case QImode: gen = gen_aarch64_atomic_swpqi; break;
    case HImode: gen = gen_aarch64_atomic_swphi; break;
    case SImode: gen = gen_aarch64_atomic_swpsi; break;
    case DImode: gen = gen_aarch64_atomic_swpdi; break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, mem, value, model));
}
12461 /* Operations supported by aarch64_emit_atomic_load_op. */
12463 enum aarch64_atomic_load_op_code
12465 AARCH64_LDOP_PLUS
, /* A + B */
12466 AARCH64_LDOP_XOR
, /* A ^ B */
12467 AARCH64_LDOP_OR
, /* A | B */
12468 AARCH64_LDOP_BIC
/* A & ~B */
12471 /* Emit an atomic load-operate. */
12474 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
12475 machine_mode mode
, rtx dst
, rtx src
,
12476 rtx mem
, rtx model
)
12478 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
12479 const aarch64_atomic_load_op_fn plus
[] =
12481 gen_aarch64_atomic_loadaddqi
,
12482 gen_aarch64_atomic_loadaddhi
,
12483 gen_aarch64_atomic_loadaddsi
,
12484 gen_aarch64_atomic_loadadddi
12486 const aarch64_atomic_load_op_fn eor
[] =
12488 gen_aarch64_atomic_loadeorqi
,
12489 gen_aarch64_atomic_loadeorhi
,
12490 gen_aarch64_atomic_loadeorsi
,
12491 gen_aarch64_atomic_loadeordi
12493 const aarch64_atomic_load_op_fn ior
[] =
12495 gen_aarch64_atomic_loadsetqi
,
12496 gen_aarch64_atomic_loadsethi
,
12497 gen_aarch64_atomic_loadsetsi
,
12498 gen_aarch64_atomic_loadsetdi
12500 const aarch64_atomic_load_op_fn bic
[] =
12502 gen_aarch64_atomic_loadclrqi
,
12503 gen_aarch64_atomic_loadclrhi
,
12504 gen_aarch64_atomic_loadclrsi
,
12505 gen_aarch64_atomic_loadclrdi
12507 aarch64_atomic_load_op_fn gen
;
12512 case QImode
: idx
= 0; break;
12513 case HImode
: idx
= 1; break;
12514 case SImode
: idx
= 2; break;
12515 case DImode
: idx
= 3; break;
12517 gcc_unreachable ();
12522 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
12523 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
12524 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
12525 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
12527 gcc_unreachable ();
12530 emit_insn (gen (dst
, mem
, src
, model
));
12533 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12534 location to store the data read from memory. OUT_RESULT is the location to
12535 store the result of the operation. MEM is the memory location to read and
12536 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12537 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12541 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
12542 rtx mem
, rtx value
, rtx model_rtx
)
12544 machine_mode mode
= GET_MODE (mem
);
12545 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12546 const bool short_mode
= (mode
< SImode
);
12547 aarch64_atomic_load_op_code ldop_code
;
12552 out_data
= gen_lowpart (mode
, out_data
);
12555 out_result
= gen_lowpart (mode
, out_result
);
12557 /* Make sure the value is in a register, putting it into a destination
12558 register if it needs to be manipulated. */
12559 if (!register_operand (value
, mode
)
12560 || code
== AND
|| code
== MINUS
)
12562 src
= out_result
? out_result
: out_data
;
12563 emit_move_insn (src
, gen_lowpart (mode
, value
));
12567 gcc_assert (register_operand (src
, mode
));
12569 /* Preprocess the data for the operation as necessary. If the operation is
12570 a SET then emit a swap instruction and finish. */
12574 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
12578 /* Negate the value and treat it as a PLUS. */
12582 /* Resize the value if necessary. */
12584 src
= gen_lowpart (wmode
, src
);
12586 neg_src
= gen_rtx_NEG (wmode
, src
);
12587 emit_insn (gen_rtx_SET (src
, neg_src
));
12590 src
= gen_lowpart (mode
, src
);
12592 /* Fall-through. */
12594 ldop_code
= AARCH64_LDOP_PLUS
;
12598 ldop_code
= AARCH64_LDOP_OR
;
12602 ldop_code
= AARCH64_LDOP_XOR
;
12609 /* Resize the value if necessary. */
12611 src
= gen_lowpart (wmode
, src
);
12613 not_src
= gen_rtx_NOT (wmode
, src
);
12614 emit_insn (gen_rtx_SET (src
, not_src
));
12617 src
= gen_lowpart (mode
, src
);
12619 ldop_code
= AARCH64_LDOP_BIC
;
12623 /* The operation can't be done with atomic instructions. */
12624 gcc_unreachable ();
12627 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
12629 /* If necessary, calculate the data in memory after the update by redoing the
12630 operation from values in registers. */
12636 src
= gen_lowpart (wmode
, src
);
12637 out_data
= gen_lowpart (wmode
, out_data
);
12638 out_result
= gen_lowpart (wmode
, out_result
);
12647 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
12650 x
= gen_rtx_IOR (wmode
, out_data
, src
);
12653 x
= gen_rtx_XOR (wmode
, out_data
, src
);
12656 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
12659 gcc_unreachable ();
12662 emit_set_insn (out_result
, x
);
12667 /* Split an atomic operation. */
12670 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
12671 rtx value
, rtx model_rtx
, rtx cond
)
12673 machine_mode mode
= GET_MODE (mem
);
12674 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12675 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
12676 const bool is_sync
= is_mm_sync (model
);
12677 rtx_code_label
*label
;
12680 /* Split the atomic operation into a sequence. */
12681 label
= gen_label_rtx ();
12682 emit_label (label
);
12685 new_out
= gen_lowpart (wmode
, new_out
);
12687 old_out
= gen_lowpart (wmode
, old_out
);
12690 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
12692 /* The initial load can be relaxed for a __sync operation since a final
12693 barrier will be emitted to stop code hoisting. */
12695 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
12696 GEN_INT (MEMMODEL_RELAXED
));
12698 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
12707 x
= gen_rtx_AND (wmode
, old_out
, value
);
12708 emit_insn (gen_rtx_SET (new_out
, x
));
12709 x
= gen_rtx_NOT (wmode
, new_out
);
12710 emit_insn (gen_rtx_SET (new_out
, x
));
12714 if (CONST_INT_P (value
))
12716 value
= GEN_INT (-INTVAL (value
));
12719 /* Fall through. */
12722 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
12723 emit_insn (gen_rtx_SET (new_out
, x
));
12727 aarch64_emit_store_exclusive (mode
, cond
, mem
,
12728 gen_lowpart (mode
, new_out
), model_rtx
);
12730 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12731 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12732 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
12733 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12735 /* Emit any final barrier needed for a __sync operation. */
12737 aarch64_emit_post_barrier (model
);
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
12767 /* Target hook for c_mode_for_suffix. */
12768 static machine_mode
12769 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
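/* Worked example (added for exposition, not from the original source):
   1.75 can be written as (-1)^0 * (28/16) * 2^0, with s = 0, n = 28 and
   r = 0 all in range, so it is representable and can be materialised with
   an FMOV immediate (roughly "fmov d0, #1.75"); a value like 0.1 has no
   such decomposition and must be loaded some other way.  */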
12789 /* Return true iff X can be represented by a quarter-precision
12790 floating point immediate operand X. Note, we cannot represent 0.0. */
12792 aarch64_float_const_representable_p (rtx x
)
12794 /* This represents our current view of how many bits
12795 make up the mantissa. */
12796 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
12798 unsigned HOST_WIDE_INT mantissa
, mask
;
12799 REAL_VALUE_TYPE r
, m
;
12802 if (!CONST_DOUBLE_P (x
))
12805 /* We don't support HFmode constants yet. */
12806 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
12809 r
= *CONST_DOUBLE_REAL_VALUE (x
);
12811 /* We cannot represent infinities, NaNs or +/-zero. We won't
12812 know if we have +zero until we analyse the mantissa, but we
12813 can reject the other invalid values. */
12814 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
12815 || REAL_VALUE_MINUS_ZERO (r
))
12818 /* Extract exponent. */
12819 r
= real_value_abs (&r
);
12820 exponent
= REAL_EXP (&r
);
12822 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12823 highest (sign) bit, with a fixed binary point at bit point_pos.
12824 m1 holds the low part of the mantissa, m2 the high part.
12825 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12826 bits for the mantissa, this can fail (low bits will be lost). */
12827 real_ldexp (&m
, &r
, point_pos
- exponent
);
12828 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
12830 /* If the low part of the mantissa has bits set we cannot represent
12832 if (w
.ulow () != 0)
12834 /* We have rejected the lower HOST_WIDE_INT, so update our
12835 understanding of how many bits lie in the mantissa and
12836 look only at the high HOST_WIDE_INT. */
12837 mantissa
= w
.elt (1);
12838 point_pos
-= HOST_BITS_PER_WIDE_INT
;
12840 /* We can only represent values with a mantissa of the form 1.xxxx. */
12841 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
12842 if ((mantissa
& mask
) != 0)
12845 /* Having filtered unrepresentable values, we may now remove all
12846 but the highest 5 bits. */
12847 mantissa
>>= point_pos
- 5;
12849 /* We cannot represent the value 0.0, so reject it. This is handled
12854 /* Then, as bit 4 is always set, we can mask it off, leaving
12855 the mantissa in the range [0, 15]. */
12856 mantissa
&= ~(1 << 4);
12857 gcc_assert (mantissa
<= 15);
12859 /* GCC internally does not use IEEE754-like encoding (where normalized
12860 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12861 Our mantissa values are shifted 4 places to the left relative to
12862 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12863 by 5 places to correct for GCC's representation. */
12864 exponent
= 5 - exponent
;
12866 return (exponent
>= 0 && exponent
<= 7);
12870 aarch64_output_simd_mov_immediate (rtx const_vector
,
12875 static char templ
[40];
12876 const char *mnemonic
;
12877 const char *shift_op
;
12878 unsigned int lane_count
= 0;
12881 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
  /* This will return true to show const_vector is legal for use as either
     an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
     also update INFO to show how the immediate should be generated.  */
12886 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
12887 gcc_assert (is_valid
);
12889 element_char
= sizetochar (info
.element_width
);
12890 lane_count
= width
/ info
.element_width
;
12892 mode
= GET_MODE_INNER (mode
);
12893 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12895 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
12896 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12897 move immediate path. */
12898 if (aarch64_float_const_zero_rtx_p (info
.value
))
12899 info
.value
= GEN_INT (0);
12902 const unsigned int buf_size
= 20;
12903 char float_buf
[buf_size
] = {'\0'};
12904 real_to_decimal_for_mode (float_buf
,
12905 CONST_DOUBLE_REAL_VALUE (info
.value
),
12906 buf_size
, buf_size
, 1, mode
);
12908 if (lane_count
== 1)
12909 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
12911 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
12912 lane_count
, element_char
, float_buf
);
12917 mnemonic
= info
.mvn
? "mvni" : "movi";
12918 shift_op
= info
.msl
? "msl" : "lsl";
12920 gcc_assert (CONST_INT_P (info
.value
));
12921 if (lane_count
== 1)
12922 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
12923 mnemonic
, UINTVAL (info
.value
));
12924 else if (info
.shift
)
12925 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12926 ", %s %d", mnemonic
, lane_count
, element_char
,
12927 UINTVAL (info
.value
), shift_op
, info
.shift
);
12929 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
12930 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate,
					  machine_mode mode)
{
  machine_mode vmode;

  gcc_assert (!VECTOR_MODE_P (mode));
  vmode = aarch64_simd_container_mode (mode, 64);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
}
12946 /* Split operands into moves from op[1] + op[2] into op[0]. */
12949 aarch64_split_combinev16qi (rtx operands
[3])
12951 unsigned int dest
= REGNO (operands
[0]);
12952 unsigned int src1
= REGNO (operands
[1]);
12953 unsigned int src2
= REGNO (operands
[2]);
12954 machine_mode halfmode
= GET_MODE (operands
[1]);
12955 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
12956 rtx destlo
, desthi
;
12958 gcc_assert (halfmode
== V16QImode
);
12960 if (src1
== dest
&& src2
== dest
+ halfregs
)
12962 /* No-op move. Can't split to nothing; emit something. */
12963 emit_note (NOTE_INSN_DELETED
);
12967 /* Preserve register attributes for variable tracking. */
12968 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
12969 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
12970 GET_MODE_SIZE (halfmode
));
12972 /* Special case of reversed high/low parts. */
12973 if (reg_overlap_mentioned_p (operands
[2], destlo
)
12974 && reg_overlap_mentioned_p (operands
[1], desthi
))
12976 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12977 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
12978 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12980 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
12982 /* Try to avoid unnecessary moves if part of the result
12983 is in the right place already. */
12985 emit_move_insn (destlo
, operands
[1]);
12986 if (src2
!= dest
+ halfregs
)
12987 emit_move_insn (desthi
, operands
[2]);
12991 if (src2
!= dest
+ halfregs
)
12992 emit_move_insn (desthi
, operands
[2]);
12994 emit_move_insn (destlo
, operands
[1]);
/* vec_perm support.  */

#define MAX_VECT_LEN 16

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  unsigned char perm[MAX_VECT_LEN];
  machine_mode vmode;
  unsigned char nelt;
  bool one_vector_p;
  bool testing_p;
};
13012 /* Generate a variable permutation. */
13015 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13017 machine_mode vmode
= GET_MODE (target
);
13018 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13020 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
13021 gcc_checking_assert (GET_MODE (op0
) == vmode
);
13022 gcc_checking_assert (GET_MODE (op1
) == vmode
);
13023 gcc_checking_assert (GET_MODE (sel
) == vmode
);
13024 gcc_checking_assert (TARGET_SIMD
);
13028 if (vmode
== V8QImode
)
13030 /* Expand the argument to a V16QI mode by duplicating it. */
13031 rtx pair
= gen_reg_rtx (V16QImode
);
13032 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
13033 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13037 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
13044 if (vmode
== V8QImode
)
13046 pair
= gen_reg_rtx (V16QImode
);
13047 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
13048 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13052 pair
= gen_reg_rtx (OImode
);
13053 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
13054 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
13060 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13062 machine_mode vmode
= GET_MODE (target
);
13063 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
13064 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13067 /* The TBL instruction does not use a modulo index, so we must take care
13068 of that ourselves. */
13069 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
13070 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13071 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
13073 /* For big-endian, we also need to reverse the index within the vector
13074 (but not which vector). */
13075 if (BYTES_BIG_ENDIAN
)
13077 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13079 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
13080 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
13081 NULL
, 0, OPTAB_LIB_WIDEN
);
13083 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
/* Recognize patterns suitable for the TRN instructions.  */
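/* Example (illustrative, not from the original source): for V4SI inputs
   op0 = { a0, a1, a2, a3 } and op1 = { b0, b1, b2, b3 }, TRN1 selects
   { a0, b0, a2, b2 } (perm indices { 0, 4, 2, 6 }) and TRN2 selects
   { a1, b1, a3, b3 } (perm indices { 1, 5, 3, 7 }), which is the shape
   the checks below look for.  */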
13088 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
13090 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
13091 rtx out
, in0
, in1
, x
;
13092 rtx (*gen
) (rtx
, rtx
, rtx
);
13093 machine_mode vmode
= d
->vmode
;
13095 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13098 /* Note that these are little-endian tests.
13099 We correct for big-endian later. */
13100 if (d
->perm
[0] == 0)
13102 else if (d
->perm
[0] == 1)
13106 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13108 for (i
= 0; i
< nelt
; i
+= 2)
13110 if (d
->perm
[i
] != i
+ odd
)
13112 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
13122 if (BYTES_BIG_ENDIAN
)
13124 x
= in0
, in0
= in1
, in1
= x
;
13133 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
13134 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
13135 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
13136 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
13137 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
13138 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
13139 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
13140 case V4HFmode
: gen
= gen_aarch64_trn2v4hf
; break;
13141 case V8HFmode
: gen
= gen_aarch64_trn2v8hf
; break;
13142 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
13143 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
13144 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
13153 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
13154 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
13155 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
13156 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
13157 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
13158 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
13159 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
13160 case V4HFmode
: gen
= gen_aarch64_trn1v4hf
; break;
13161 case V8HFmode
: gen
= gen_aarch64_trn1v8hf
; break;
13162 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
13163 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
13164 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
13170 emit_insn (gen (out
, in0
, in1
));
/* Recognize patterns suitable for the UZP instructions.  */
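/* Example (illustrative, not from the original source): for V4SI inputs
   op0 = { a0, a1, a2, a3 } and op1 = { b0, b1, b2, b3 }, UZP1 gathers the
   even-numbered lanes { a0, a2, b0, b2 } (perm indices { 0, 2, 4, 6 }) and
   UZP2 the odd-numbered lanes { a1, a3, b1, b3 } (perm indices
   { 1, 3, 5, 7 }).  */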
13176 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
13178 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
13179 rtx out
, in0
, in1
, x
;
13180 rtx (*gen
) (rtx
, rtx
, rtx
);
13181 machine_mode vmode
= d
->vmode
;
13183 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13186 /* Note that these are little-endian tests.
13187 We correct for big-endian later. */
13188 if (d
->perm
[0] == 0)
13190 else if (d
->perm
[0] == 1)
13194 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13196 for (i
= 0; i
< nelt
; i
++)
13198 unsigned elt
= (i
* 2 + odd
) & mask
;
13199 if (d
->perm
[i
] != elt
)
13209 if (BYTES_BIG_ENDIAN
)
13211 x
= in0
, in0
= in1
, in1
= x
;
13220 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
13221 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
13222 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
13223 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
13224 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
13225 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
13226 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
13227 case V4HFmode
: gen
= gen_aarch64_uzp2v4hf
; break;
13228 case V8HFmode
: gen
= gen_aarch64_uzp2v8hf
; break;
13229 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
13230 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
13231 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
13240 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
13241 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
13242 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
13243 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
13244 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
13245 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
13246 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
13247 case V4HFmode
: gen
= gen_aarch64_uzp1v4hf
; break;
13248 case V8HFmode
: gen
= gen_aarch64_uzp1v8hf
; break;
13249 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
13250 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
13251 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
13257 emit_insn (gen (out
, in0
, in1
));
/* Recognize patterns suitable for the ZIP instructions.  */

static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  unsigned int i, high, mask, nelt = d->nelt;
  rtx out, in0, in1, x;
  rtx (*gen) (rtx, rtx, rtx);
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  high = nelt / 2;
  if (d->perm[0] == high)
    /* Success!  */ ;
  else if (d->perm[0] == 0)
    high = 0;
  else
    return false;
  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);

  for (i = 0; i < nelt / 2; i++)
    {
      unsigned elt = (i + high) & mask;
      if (d->perm[i * 2] != elt)
	return false;
      elt = (elt + nelt) & mask;
      if (d->perm[i * 2 + 1] != elt)
	return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  if (BYTES_BIG_ENDIAN)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }
  out = d->target;

  if (high)
    switch (vmode)
      {
      case V16QImode: gen = gen_aarch64_zip2v16qi; break;
      case V8QImode: gen = gen_aarch64_zip2v8qi; break;
      case V8HImode: gen = gen_aarch64_zip2v8hi; break;
      case V4HImode: gen = gen_aarch64_zip2v4hi; break;
      case V4SImode: gen = gen_aarch64_zip2v4si; break;
      case V2SImode: gen = gen_aarch64_zip2v2si; break;
      case V2DImode: gen = gen_aarch64_zip2v2di; break;
      case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
      case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
      case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
      case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
      case V2DFmode: gen = gen_aarch64_zip2v2df; break;
      default:
	return false;
      }
  else
    switch (vmode)
      {
      case V16QImode: gen = gen_aarch64_zip1v16qi; break;
      case V8QImode: gen = gen_aarch64_zip1v8qi; break;
      case V8HImode: gen = gen_aarch64_zip1v8hi; break;
      case V4HImode: gen = gen_aarch64_zip1v4hi; break;
      case V4SImode: gen = gen_aarch64_zip1v4si; break;
      case V2SImode: gen = gen_aarch64_zip1v2si; break;
      case V2DImode: gen = gen_aarch64_zip1v2di; break;
      case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
      case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
      case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
      case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
      case V2DFmode: gen = gen_aarch64_zip1v2df; break;
      default:
	return false;
      }

  emit_insn (gen (out, in0, in1));
  return true;
}
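/* For intuition: with inputs { a0, a1, a2, a3 } and { b0, b1, b2, b3 },
   UZP1/UZP2 select the even/odd numbered elements of the concatenation
   ({ a0, a2, b0, b2 } and { a1, a3, b1, b3 }), while ZIP1/ZIP2 interleave
   the low/high halves ({ a0, b0, a1, b1 } and { a2, b2, a3, b3 }).  The
   index checks in the two recognizers above encode exactly these shapes.  */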
/* Recognize patterns for the EXT insn.  */

static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  unsigned int i, nelt = d->nelt;
  rtx (*gen) (rtx, rtx, rtx, rtx);
  rtx offset;

  unsigned int location = d->perm[0]; /* Always < nelt.  */

  /* Check if the extracted indices are increasing by one.  */
  for (i = 1; i < nelt; i++)
    {
      unsigned int required = location + i;
      if (d->one_vector_p)
	{
	  /* We'll pass the same vector in twice, so allow indices to wrap.  */
	  required &= (nelt - 1);
	}
      if (d->perm[i] != required)
	return false;
    }

  switch (d->vmode)
    {
    case V16QImode: gen = gen_aarch64_extv16qi; break;
    case V8QImode: gen = gen_aarch64_extv8qi; break;
    case V4HImode: gen = gen_aarch64_extv4hi; break;
    case V8HImode: gen = gen_aarch64_extv8hi; break;
    case V2SImode: gen = gen_aarch64_extv2si; break;
    case V4SImode: gen = gen_aarch64_extv4si; break;
    case V4HFmode: gen = gen_aarch64_extv4hf; break;
    case V8HFmode: gen = gen_aarch64_extv8hf; break;
    case V2SFmode: gen = gen_aarch64_extv2sf; break;
    case V4SFmode: gen = gen_aarch64_extv4sf; break;
    case V2DImode: gen = gen_aarch64_extv2di; break;
    case V2DFmode: gen = gen_aarch64_extv2df; break;
    default:
      return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.  */

  if (BYTES_BIG_ENDIAN && (location != 0))
    {
      /* After setup, we want the high elements of the first vector (stored
	 at the LSB end of the register), and the low elements of the second
	 vector (stored at the MSB end of the register).  So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
      location = nelt - location;
    }

  offset = GEN_INT (location);
  emit_insn (gen (d->target, d->op0, d->op1, offset));
  return true;
}
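/* For intuition: EXT with element offset N returns elements N, N+1, ... of
   the concatenation op0:op1.  For V4SImode inputs { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, location == 1 selects { a1, a2, a3, b0 }, i.e.
   perm == { 1, 2, 3, 4 }, which is the "increasing by one" pattern tested
   above.  */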
/* Recognize patterns for the REV insns.  */

static bool
aarch64_evpc_rev (struct expand_vec_perm_d *d)
{
  unsigned int i, j, diff, nelt = d->nelt;
  rtx (*gen) (rtx, rtx);

  if (!d->one_vector_p)
    return false;

  diff = d->perm[0];
  switch (diff)
    {
    case 7:
      switch (d->vmode)
	{
	case V16QImode: gen = gen_aarch64_rev64v16qi; break;
	case V8QImode: gen = gen_aarch64_rev64v8qi; break;
	default:
	  return false;
	}
      break;
    case 3:
      switch (d->vmode)
	{
	case V16QImode: gen = gen_aarch64_rev32v16qi; break;
	case V8QImode: gen = gen_aarch64_rev32v8qi; break;
	case V8HImode: gen = gen_aarch64_rev64v8hi; break;
	case V4HImode: gen = gen_aarch64_rev64v4hi; break;
	default:
	  return false;
	}
      break;
    case 1:
      switch (d->vmode)
	{
	case V16QImode: gen = gen_aarch64_rev16v16qi; break;
	case V8QImode: gen = gen_aarch64_rev16v8qi; break;
	case V8HImode: gen = gen_aarch64_rev32v8hi; break;
	case V4HImode: gen = gen_aarch64_rev32v4hi; break;
	case V4SImode: gen = gen_aarch64_rev64v4si; break;
	case V2SImode: gen = gen_aarch64_rev64v2si; break;
	case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
	case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
	case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
	case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
	default:
	  return false;
	}
      break;
    default:
      return false;
    }

  for (i = 0; i < nelt; i += diff + 1)
    for (j = 0; j <= diff; j += 1)
      {
	/* This is guaranteed to be true as the value of diff
	   is 7, 3, 1 and we should have enough elements in the
	   queue to generate this.  Getting a vector mask with a
	   value of diff other than these values implies that
	   something is wrong by the time we get here.  */
	gcc_assert (i + j < nelt);
	if (d->perm[i + j] != i + diff - j)
	  return false;
      }

  /* Success!  */
  if (d->testing_p)
    return true;

  emit_insn (gen (d->target, d->op0));
  return true;
}
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx (*gen) (rtx, rtx, rtx);
  rtx out = d->target;
  rtx in0;
  machine_mode vmode = d->vmode;
  unsigned int i, elt, nelt = d->nelt;
  rtx lane;

  elt = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (elt != d->perm[i])
	return false;
    }

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  switch (vmode)
    {
    case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
    case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
    case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
    case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
    case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
    case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
    case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
    case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
    case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
    case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
    case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
    case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
    default:
      return false;
    }

  emit_insn (gen (out, in0, lane));
  return true;
}
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_VECT_LEN], sel;
  machine_mode vmode = d->vmode;
  unsigned int i, nelt = d->nelt;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  for (i = 0; i < nelt; ++i)
    {
      int nunits = GET_MODE_NUNITS (vmode);

      /* If big-endian and two vectors we end up with a weird mixed-endian
	 mode on NEON.  Reverse the index within each word but not the word
	 itself.  */
      rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
					   : d->perm[i]);
    }
  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  if (d->perm[0] >= d->nelt)
    {
      unsigned i, nelt = d->nelt;

      gcc_assert (nelt == (nelt & -nelt));
      for (i = 0; i < nelt; ++i)
	d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */

      std::swap (d->op0, d->op1);
    }

  if (TARGET_SIMD)
    {
      if (aarch64_evpc_rev (d))
	return true;
      else if (aarch64_evpc_ext (d))
	return true;
      else if (aarch64_evpc_dup (d))
	return true;
      else if (aarch64_evpc_zip (d))
	return true;
      else if (aarch64_evpc_uzp (d))
	return true;
      else if (aarch64_evpc_trn (d))
	return true;
      return aarch64_evpc_tbl (d);
    }

  return false;
}
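/* Worked example of the operand swap above: for nelt == 4 and
   perm == { 5, 6, 7, 4 }, every index lies in the second vector, so
   XORing each index with nelt gives { 1, 2, 3, 0 } and op0/op1 are
   swapped.  The evpc_* recognizers therefore only ever see sequences
   whose first index lies in the first operand.  */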
/* Expand a vec_perm_const pattern.  */

bool
aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
{
  struct expand_vec_perm_d d;
  int i, nelt, which;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = GET_MODE (target);
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = false;

  for (i = which = 0; i < nelt; ++i)
    {
      rtx e = XVECEXP (sel, 0, i);
      int ei = INTVAL (e) & (2 * nelt - 1);
      which |= (ei < nelt ? 1 : 2);
      d.perm[i] = ei;
    }

  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      d.one_vector_p = false;
      if (!rtx_equal_p (op0, op1))
	break;

      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* Fall Through.  */
    case 2:
      for (i = 0; i < nelt; ++i)
	d.perm[i] &= nelt - 1;
      d.op0 = op1;
      d.one_vector_p = true;
      break;

    case 1:
      d.op1 = op0;
      d.one_vector_p = true;
      break;
    }

  return aarch64_expand_vec_perm_const_1 (&d);
}
static bool
aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
				     const unsigned char *sel)
{
  struct expand_vec_perm_d d;
  unsigned int i, nelt, which;
  bool ret;

  d.vmode = vmode;
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = true;
  memcpy (d.perm, sel, nelt);

  /* Calculate whether all elements are in one vector.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = d.perm[i];
      gcc_assert (e < 2 * nelt);
      which |= (e < nelt ? 1 : 2);
    }

  /* If all elements are from the second vector, reindex as if from the
     first vector.  */
  if (which == 2)
    for (i = 0; i < nelt; ++i)
      d.perm[i] -= nelt;

  /* Check whether the mask can be applied to a single vector.  */
  d.one_vector_p = (which != 3);

  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
  if (!d.one_vector_p)
    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

  start_sequence ();
  ret = aarch64_expand_vec_perm_const_1 (&d);
  end_sequence ();

  return ret;
}
rtx
aarch64_reverse_mask (machine_mode mode)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  int i, j;
  int nunits = GET_MODE_NUNITS (mode);
  int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
/* Implement MODES_TIEABLE_P.  In principle we should always return true.
   However due to issues with register allocation it is preferable to avoid
   tying integer scalar and FP scalar modes.  Executing integer operations
   in general registers is better than treating them as scalar vector
   operations.  This reduces latency and avoids redundant int<->FP moves.
   So tie modes if they are either the same class, or vector modes with
   other vector modes, vector structs or any scalar mode.  */

bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  */
  if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, int amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));

  return aarch64_move_pointer (pointer, amount);
}
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
/* Expand movmem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_movmem (rtx *operands)
{
  unsigned int n;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  */
  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = UINTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For cases below 16 bytes we
     need to make at most two moves.  For cases above 16 bytes it will be one
     move for each 16 byte chunk, then at most two additional moves.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
     1-byte chunk.  */
  if (n < 4)
    {
      if (n >= 2)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
      if (n & 1)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
      return true;
    }

  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
     4-byte chunk, partially overlapping with the previously copied chunk.  */
  if (n < 8)
    {
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
      if (n > 4)
	{
	  int move = n - 8;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      return true;
    }

  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
     them, then (if applicable) an 8-byte chunk.  */
  while (n >= 8)
    {
      if (n / 16)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
	  n -= 16;
	}
      else
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	  n -= 8;
	}
    }

  /* Finish the final bytes of the copy.  We can always do this in one
     instruction.  We either copy the exact amount we need, or partially
     overlap with the previous chunk we copied and copy 8-bytes.  */
  if (n == 0)
    return true;
  else if (n == 1)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
  else if (n == 2)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
  else if (n == 4)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
  else if (n == 3)
    {
      src = aarch64_move_pointer (src, -1);
      dst = aarch64_move_pointer (dst, -1);
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
    }
  else
    {
      int move = n - 8;

      src = aarch64_move_pointer (src, move);
      dst = aarch64_move_pointer (dst, move);
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
    }

  return true;
}
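/* Worked example, following the scheme described in the comments above:
   a 6-byte copy is one 4-byte load/store of bytes 0-3 followed by a
   second 4-byte load/store of bytes 2-5, deliberately overlapping
   bytes 2-3 rather than issuing separate 2-byte and 1-byte copies.
   A 30-byte copy is a 16-byte chunk (bytes 0-15), an 8-byte chunk
   (bytes 16-23), then a final 8-byte chunk covering bytes 22-29 which
   again overlaps the tail.  */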
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x2]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x2]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
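/* Worked example: the constant from the comment above is
   0x0140c0da0140c0da, whose 32-bit halves are both 0x0140c0da, so one
   32-bit immediate build plus two adjacent SImode stores (fusable into
   an STP) replaces a four-instruction 64-bit immediate sequence plus an
   STR.  A constant such as 0x0000000100000002 has differing halves and
   is left alone.  */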
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  return (HOST_WIDE_INT_1 << 36);
}

static bool
aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
					unsigned int align,
					enum by_pieces_operation op,
					bool speed_p)
{
  /* STORE_BY_PIECES can be used when copying a constant string, but
     in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
     For now we always fail this and let the move_by_pieces code copy
     the string from read-only memory.  */
  if (op == STORE_BY_PIECES)
    return false;

  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
}
13997 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
13998 int code
, tree treeop0
, tree treeop1
)
14000 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14002 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14004 struct expand_operand ops
[4];
14007 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14009 op_mode
= GET_MODE (op0
);
14010 if (op_mode
== VOIDmode
)
14011 op_mode
= GET_MODE (op1
);
14019 icode
= CODE_FOR_cmpsi
;
14024 icode
= CODE_FOR_cmpdi
;
14029 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14030 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
14035 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14036 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
14044 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
14045 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
14051 *prep_seq
= get_insns ();
14054 create_fixed_operand (&ops
[0], op0
);
14055 create_fixed_operand (&ops
[1], op1
);
14058 if (!maybe_expand_insn (icode
, 2, ops
))
14063 *gen_seq
= get_insns ();
14066 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
14067 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
14071 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
14072 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
14074 rtx op0
, op1
, target
;
14075 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14076 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14078 struct expand_operand ops
[6];
14081 push_to_sequence (*prep_seq
);
14082 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14084 op_mode
= GET_MODE (op0
);
14085 if (op_mode
== VOIDmode
)
14086 op_mode
= GET_MODE (op1
);
14094 icode
= CODE_FOR_ccmpsi
;
14099 icode
= CODE_FOR_ccmpdi
;
14104 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14105 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
14110 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14111 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
14119 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
14120 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
14126 *prep_seq
= get_insns ();
14129 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
14130 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
14132 if (bit_code
!= AND
)
14134 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
14135 GET_MODE (XEXP (prev
, 0))),
14136 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
14137 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
14140 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
14141 create_fixed_operand (&ops
[1], target
);
14142 create_fixed_operand (&ops
[2], op0
);
14143 create_fixed_operand (&ops
[3], op1
);
14144 create_fixed_operand (&ops
[4], prev
);
14145 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
14147 push_to_sequence (*gen_seq
);
14148 if (!maybe_expand_insn (icode
, 6, ops
))
14154 *gen_seq
= get_insns ();
14157 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
14160 #undef TARGET_GEN_CCMP_FIRST
14161 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14163 #undef TARGET_GEN_CCMP_NEXT
14164 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14166 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14167 instruction fusion of some sort. */
14170 aarch64_macro_fusion_p (void)
14172 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
14176 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14177 should be kept together during scheduling. */
14180 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
14183 rtx prev_set
= single_set (prev
);
14184 rtx curr_set
= single_set (curr
);
14185 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14186 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
14188 if (!aarch64_macro_fusion_p ())
14191 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
14193 /* We are trying to match:
14194 prev (mov) == (set (reg r0) (const_int imm16))
14195 curr (movk) == (set (zero_extract (reg r0)
14198 (const_int imm16_1)) */
14200 set_dest
= SET_DEST (curr_set
);
14202 if (GET_CODE (set_dest
) == ZERO_EXTRACT
14203 && CONST_INT_P (SET_SRC (curr_set
))
14204 && CONST_INT_P (SET_SRC (prev_set
))
14205 && CONST_INT_P (XEXP (set_dest
, 2))
14206 && INTVAL (XEXP (set_dest
, 2)) == 16
14207 && REG_P (XEXP (set_dest
, 0))
14208 && REG_P (SET_DEST (prev_set
))
14209 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
14215 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
14218 /* We're trying to match:
14219 prev (adrp) == (set (reg r1)
14220 (high (symbol_ref ("SYM"))))
14221 curr (add) == (set (reg r0)
14223 (symbol_ref ("SYM"))))
14224 Note that r0 need not necessarily be the same as r1, especially
14225 during pre-regalloc scheduling. */
14227 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14228 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14230 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
14231 && REG_P (XEXP (SET_SRC (curr_set
), 0))
14232 && REGNO (XEXP (SET_SRC (curr_set
), 0))
14233 == REGNO (SET_DEST (prev_set
))
14234 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
14235 XEXP (SET_SRC (curr_set
), 1)))
14240 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
14243 /* We're trying to match:
14244 prev (movk) == (set (zero_extract (reg r0)
14247 (const_int imm16_1))
14248 curr (movk) == (set (zero_extract (reg r0)
14251 (const_int imm16_2)) */
14253 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
14254 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
14255 && REG_P (XEXP (SET_DEST (prev_set
), 0))
14256 && REG_P (XEXP (SET_DEST (curr_set
), 0))
14257 && REGNO (XEXP (SET_DEST (prev_set
), 0))
14258 == REGNO (XEXP (SET_DEST (curr_set
), 0))
14259 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
14260 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
14261 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
14262 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
14263 && CONST_INT_P (SET_SRC (prev_set
))
14264 && CONST_INT_P (SET_SRC (curr_set
)))
14268 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
14270 /* We're trying to match:
14271 prev (adrp) == (set (reg r0)
14272 (high (symbol_ref ("SYM"))))
14273 curr (ldr) == (set (reg r1)
14274 (mem (lo_sum (reg r0)
14275 (symbol_ref ("SYM")))))
14277 curr (ldr) == (set (reg r1)
14280 (symbol_ref ("SYM")))))) */
14281 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14282 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14284 rtx curr_src
= SET_SRC (curr_set
);
14286 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
14287 curr_src
= XEXP (curr_src
, 0);
14289 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
14290 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
14291 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
14292 == REGNO (SET_DEST (prev_set
))
14293 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
14294 XEXP (SET_SRC (prev_set
), 0)))
14299 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
14300 && aarch_crypto_can_dual_issue (prev
, curr
))
14303 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
14304 && any_condjump_p (curr
))
14306 enum attr_type prev_type
= get_attr_type (prev
);
14308 unsigned int condreg1
, condreg2
;
14310 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
14311 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
14313 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
14315 && modified_in_p (cc_reg_1
, prev
))
	  /* FIXME: this misses some instructions which are considered
	     simple arithmetic for ThunderX.  Simple shifts are missed
	     here.  */
14319 if (prev_type
== TYPE_ALUS_SREG
14320 || prev_type
== TYPE_ALUS_IMM
14321 || prev_type
== TYPE_LOGICS_REG
14322 || prev_type
== TYPE_LOGICS_IMM
)
14327 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
14328 && any_condjump_p (curr
))
14330 /* We're trying to match:
14331 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14332 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14334 (label_ref ("SYM"))
14336 if (SET_DEST (curr_set
) == (pc_rtx
)
14337 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
14338 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
14339 && REG_P (SET_DEST (prev_set
))
14340 && REGNO (SET_DEST (prev_set
))
14341 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
14343 /* Fuse ALU operations followed by conditional branch instruction. */
14344 switch (get_attr_type (prev
))
14347 case TYPE_ALU_SREG
:
14350 case TYPE_ADCS_REG
:
14351 case TYPE_ADCS_IMM
:
14352 case TYPE_LOGIC_REG
:
14353 case TYPE_LOGIC_IMM
:
14357 case TYPE_SHIFT_REG
:
14358 case TYPE_SHIFT_IMM
:
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
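/* In assembly terms, the fusible pairs handled above correspond to
   sequences such as:

     mov	x0, 0x1234
     movk	x0, 0x5678, lsl 16	(MOV/MOVK)

     adrp	x1, sym
     add	x1, x1, :lo12:sym	(ADRP/ADD)

     adrp	x2, sym
     ldr	x3, [x2, :lo12:sym]	(ADRP/LDR)

     cmp	w4, 5
     b.ne	target			(CMP/BRANCH)

   Cores that set the corresponding AARCH64_FUSE_* flags can issue such
   adjacent pairs as a single fused macro-op, which is why the scheduler
   keeps them together.  */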
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
14415 /* Types for scheduling fusion. */
14416 enum sched_fusion_type
14418 SCHED_FUSION_NONE
= 0,
14419 SCHED_FUSION_LD_SIGN_EXTEND
,
14420 SCHED_FUSION_LD_ZERO_EXTEND
,
14426 /* If INSN is a load or store of address in the form of [base+offset],
14427 extract the two parts and set to BASE and OFFSET. Return scheduling
14428 fusion type this INSN is. */
14430 static enum sched_fusion_type
14431 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
14434 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
14436 gcc_assert (INSN_P (insn
));
14437 x
= PATTERN (insn
);
14438 if (GET_CODE (x
) != SET
)
14439 return SCHED_FUSION_NONE
;
14442 dest
= SET_DEST (x
);
14444 machine_mode dest_mode
= GET_MODE (dest
);
14446 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
14447 return SCHED_FUSION_NONE
;
14449 if (GET_CODE (src
) == SIGN_EXTEND
)
14451 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
14452 src
= XEXP (src
, 0);
14453 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14454 return SCHED_FUSION_NONE
;
14456 else if (GET_CODE (src
) == ZERO_EXTEND
)
14458 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
14459 src
= XEXP (src
, 0);
14460 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14461 return SCHED_FUSION_NONE
;
14464 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
14465 extract_base_offset_in_addr (src
, base
, offset
);
14466 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
14468 fusion
= SCHED_FUSION_ST
;
14469 extract_base_offset_in_addr (dest
, base
, offset
);
14472 return SCHED_FUSION_NONE
;
14474 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
14475 fusion
= SCHED_FUSION_NONE
;
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
14490 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
14491 int *fusion_pri
, int *pri
)
14495 enum sched_fusion_type fusion
;
14497 gcc_assert (INSN_P (insn
));
14500 fusion
= fusion_load_store (insn
, &base
, &offset
);
14501 if (fusion
== SCHED_FUSION_NONE
)
14508 /* Set FUSION_PRI according to fusion type and base register. */
14509 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
14511 /* Calculate PRI. */
14514 /* INSN with smaller offset goes first. */
14515 off_val
= (int)(INTVAL (offset
));
14517 tmp
-= (off_val
& 0xfffff);
14519 tmp
+= ((- off_val
) & 0xfffff);
/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);

      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
	return priority + 10;
    }

  return priority;
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  msize = GET_MODE_SIZE (mode);
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
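/* Worked example: the two loads

     ldr	w0, [x2]
     ldr	w1, [x2, 4]

   satisfy the checks above (same base, consecutive 4-byte offsets, both
   destinations in GENERAL_REGS), so the peepholes can rewrite them as

     ldp	w0, w1, [x2]

   whereas a pair that mixes a W register with an S register, or whose
   offsets are not adjacent, is rejected.  */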
14635 /* Given OPERANDS of consecutive load/store, check if we can merge
14636 them into ldp/stp by adjusting the offset. LOAD is true if they
14637 are load instructions. MODE is the mode of memory operands.
14639 Given below consecutive stores:
14641 str w1, [xb, 0x100]
14642 str w1, [xb, 0x104]
14643 str w1, [xb, 0x108]
14644 str w1, [xb, 0x10c]
14646 Though the offsets are out of the range supported by stp, we can
14647 still pair them after adjusting the offset, like:
14649 add scratch, xb, 0x100
14650 stp w1, w1, [scratch]
14651 stp w1, w1, [scratch, 0x8]
14653 The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
14657 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
14660 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
14661 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
14662 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
14663 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
14667 reg_1
= operands
[0];
14668 mem_1
= operands
[1];
14669 reg_2
= operands
[2];
14670 mem_2
= operands
[3];
14671 reg_3
= operands
[4];
14672 mem_3
= operands
[5];
14673 reg_4
= operands
[6];
14674 mem_4
= operands
[7];
14675 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
14676 && REG_P (reg_3
) && REG_P (reg_4
));
14677 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
14682 mem_1
= operands
[0];
14683 reg_1
= operands
[1];
14684 mem_2
= operands
[2];
14685 reg_2
= operands
[3];
14686 mem_3
= operands
[4];
14687 reg_3
= operands
[5];
14688 mem_4
= operands
[6];
14689 reg_4
= operands
[7];
  /* Skip if memory operand is by itself valid for ldp/stp.  */
14692 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
14695 /* The mems cannot be volatile. */
14696 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
14697 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
14700 /* Check if the addresses are in the form of [base+offset]. */
14701 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
14702 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
14704 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
14705 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
14707 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
14708 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
14710 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
14711 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
14714 /* Check if the bases are same. */
14715 if (!rtx_equal_p (base_1
, base_2
)
14716 || !rtx_equal_p (base_2
, base_3
)
14717 || !rtx_equal_p (base_3
, base_4
))
14720 offval_1
= INTVAL (offset_1
);
14721 offval_2
= INTVAL (offset_2
);
14722 offval_3
= INTVAL (offset_3
);
14723 offval_4
= INTVAL (offset_4
);
14724 msize
= GET_MODE_SIZE (mode
);
14725 /* Check if the offsets are consecutive. */
14726 if ((offval_1
!= (offval_2
+ msize
)
14727 || offval_1
!= (offval_3
+ msize
* 2)
14728 || offval_1
!= (offval_4
+ msize
* 3))
14729 && (offval_4
!= (offval_3
+ msize
)
14730 || offval_4
!= (offval_2
+ msize
* 2)
14731 || offval_4
!= (offval_1
+ msize
* 3)))
14734 /* Check if the addresses are clobbered by load. */
14737 if (reg_mentioned_p (reg_1
, mem_1
)
14738 || reg_mentioned_p (reg_2
, mem_2
)
14739 || reg_mentioned_p (reg_3
, mem_3
))
14742 /* In increasing order, the last load can clobber the address. */
14743 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
14747 /* If we have SImode and slow unaligned ldp,
14748 check the alignment to be at least 8 byte. */
14750 && (aarch64_tune_params
.extra_tuning_flags
14751 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
14753 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
14756 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
14757 rclass_1
= FP_REGS
;
14759 rclass_1
= GENERAL_REGS
;
14761 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
14762 rclass_2
= FP_REGS
;
14764 rclass_2
= GENERAL_REGS
;
14766 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
14767 rclass_3
= FP_REGS
;
14769 rclass_3
= GENERAL_REGS
;
14771 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
14772 rclass_4
= FP_REGS
;
14774 rclass_4
= GENERAL_REGS
;
14776 /* Check if the registers are of same class. */
14777 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
14783 /* Given OPERANDS of consecutive load/store, this function pairs them
14784 into ldp/stp after adjusting the offset. It depends on the fact
14785 that addresses of load/store instructions are in increasing order.
14786 MODE is the mode of memory operands. CODE is the rtl operator
14787 which should be applied to all memory operands, it's SIGN_EXTEND,
14788 ZERO_EXTEND or UNKNOWN. */
14791 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
14792 machine_mode mode
, RTX_CODE code
)
14794 rtx base
, offset
, t1
, t2
;
14795 rtx mem_1
, mem_2
, mem_3
, mem_4
;
14796 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
14800 mem_1
= operands
[1];
14801 mem_2
= operands
[3];
14802 mem_3
= operands
[5];
14803 mem_4
= operands
[7];
14807 mem_1
= operands
[0];
14808 mem_2
= operands
[2];
14809 mem_3
= operands
[4];
14810 mem_4
= operands
[6];
14811 gcc_assert (code
== UNKNOWN
);
14814 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
14815 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
14817 /* Adjust offset thus it can fit in ldp/stp instruction. */
14818 msize
= GET_MODE_SIZE (mode
);
14819 stp_off_limit
= msize
* 0x40;
14820 off_val
= INTVAL (offset
);
14821 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
14822 new_off
= abs_off
% stp_off_limit
;
14823 adj_off
= abs_off
- new_off
;
14825 /* Further adjust to make sure all offsets are OK. */
14826 if ((new_off
+ msize
* 2) >= stp_off_limit
)
14828 adj_off
+= stp_off_limit
;
14829 new_off
-= stp_off_limit
;
14832 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14833 if (adj_off
>= 0x1000)
14838 adj_off
= -adj_off
;
14839 new_off
= -new_off
;
14842 /* Create new memory references. */
14843 mem_1
= change_address (mem_1
, VOIDmode
,
14844 plus_constant (DImode
, operands
[8], new_off
));
14846 /* Check if the adjusted address is OK for ldp/stp. */
14847 if (!aarch64_mem_pair_operand (mem_1
, mode
))
14850 msize
= GET_MODE_SIZE (mode
);
14851 mem_2
= change_address (mem_2
, VOIDmode
,
14852 plus_constant (DImode
,
14855 mem_3
= change_address (mem_3
, VOIDmode
,
14856 plus_constant (DImode
,
14858 new_off
+ msize
* 2));
14859 mem_4
= change_address (mem_4
, VOIDmode
,
14860 plus_constant (DImode
,
14862 new_off
+ msize
* 3));
14864 if (code
== ZERO_EXTEND
)
14866 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
14867 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
14868 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
14869 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
14871 else if (code
== SIGN_EXTEND
)
14873 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
14874 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
14875 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
14876 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
14881 operands
[1] = mem_1
;
14882 operands
[3] = mem_2
;
14883 operands
[5] = mem_3
;
14884 operands
[7] = mem_4
;
14888 operands
[0] = mem_1
;
14889 operands
[2] = mem_2
;
14890 operands
[4] = mem_3
;
14891 operands
[6] = mem_4
;
14894 /* Emit adjusting instruction. */
14895 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
14896 /* Emit ldp/stp instructions. */
14897 t1
= gen_rtx_SET (operands
[0], operands
[1]);
14898 t2
= gen_rtx_SET (operands
[2], operands
[3]);
14899 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
14900 t1
= gen_rtx_SET (operands
[4], operands
[5]);
14901 t2
= gen_rtx_SET (operands
[6], operands
[7]);
14902 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
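/* Examples: 4.0 yields 2 and 1.0 yields 0, while 6.0, 0.75 and -8.0 all
   yield -1 (not a positive power of two).  The vector variant requires
   every lane to hold the same such constant, e.g. a V2DF of
   { 16.0, 16.0 } yields 4.  */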
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (machine_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type:
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
	 32-bit range and precision.  Make that decision based on whether
	 we have native support for the ARMv8.2-A 16-bit floating-point
	 instructions or not.  */
      return (TARGET_FP_F16INST
	      ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
	      : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
    case TYPE_SDIV:
    case TYPE_UDIV:
    case TYPE_FDIVS:
    case TYPE_FDIVD:
    case TYPE_FSQRTS:
    case TYPE_FSQRTD:
    case TYPE_NEON_FP_SQRT_S:
    case TYPE_NEON_FP_SQRT_D:
    case TYPE_NEON_FP_SQRT_S_Q:
    case TYPE_NEON_FP_SQRT_D_Q:
    case TYPE_NEON_FP_DIV_S:
    case TYPE_NEON_FP_DIV_D:
    case TYPE_NEON_FP_DIV_S_Q:
    case TYPE_NEON_FP_DIV_D_Q:
      return false;
    default:
      return true;
    }
}
15104 /* Target-specific selftests. */
15108 namespace selftest
{
15110 /* Selftest for the RTL loader.
15111 Verify that the RTL loader copes with a dump from
15112 print_rtx_function. This is essentially just a test that class
15113 function_reader can handle a real dump, but it also verifies
15114 that lookup_reg_by_dump_name correctly handles hard regs.
15115 The presence of hard reg names in the dump means that the test is
15116 target-specific, hence it is in this file. */
15119 aarch64_test_loading_full_dump ()
15121 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
15123 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
15125 rtx_insn
*insn_1
= get_insn_by_uid (1);
15126 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
15128 rtx_insn
*insn_15
= get_insn_by_uid (15);
15129 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
15130 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
15132 /* Verify crtl->return_rtx. */
15133 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
15134 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
15135 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
15138 /* Run all target-specific selftests. */
15141 aarch64_run_selftests (void)
15143 aarch64_test_loading_full_dump ();
15146 } // namespace selftest
15148 #endif /* #if CHECKING_P */
15150 #undef TARGET_ADDRESS_COST
15151 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
15157 #undef TARGET_ALIGN_ANON_BITFIELD
15158 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15160 #undef TARGET_ASM_ALIGNED_DI_OP
15161 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15163 #undef TARGET_ASM_ALIGNED_HI_OP
15164 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15166 #undef TARGET_ASM_ALIGNED_SI_OP
15167 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15169 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15170 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15171 hook_bool_const_tree_hwi_hwi_const_tree_true
15173 #undef TARGET_ASM_FILE_START
15174 #define TARGET_ASM_FILE_START aarch64_start_file
15176 #undef TARGET_ASM_OUTPUT_MI_THUNK
15177 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15179 #undef TARGET_ASM_SELECT_RTX_SECTION
15180 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15182 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15183 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15185 #undef TARGET_BUILD_BUILTIN_VA_LIST
15186 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15188 #undef TARGET_CALLEE_COPIES
15189 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15191 #undef TARGET_CAN_ELIMINATE
15192 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15194 #undef TARGET_CAN_INLINE_P
15195 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15197 #undef TARGET_CANNOT_FORCE_CONST_MEM
15198 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15200 #undef TARGET_CASE_VALUES_THRESHOLD
15201 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15203 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15204 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15206 /* Only the least significant bit is used for initialization guard
15208 #undef TARGET_CXX_GUARD_MASK_BIT
15209 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15211 #undef TARGET_C_MODE_FOR_SUFFIX
15212 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15214 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15215 #undef TARGET_DEFAULT_TARGET_FLAGS
15216 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15219 #undef TARGET_CLASS_MAX_NREGS
15220 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15222 #undef TARGET_BUILTIN_DECL
15223 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15225 #undef TARGET_BUILTIN_RECIPROCAL
15226 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15228 #undef TARGET_C_EXCESS_PRECISION
15229 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15231 #undef TARGET_EXPAND_BUILTIN
15232 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15234 #undef TARGET_EXPAND_BUILTIN_VA_START
15235 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15237 #undef TARGET_FOLD_BUILTIN
15238 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15240 #undef TARGET_FUNCTION_ARG
15241 #define TARGET_FUNCTION_ARG aarch64_function_arg
15243 #undef TARGET_FUNCTION_ARG_ADVANCE
15244 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15246 #undef TARGET_FUNCTION_ARG_BOUNDARY
15247 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15249 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15250 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15252 #undef TARGET_FUNCTION_VALUE
15253 #define TARGET_FUNCTION_VALUE aarch64_function_value
15255 #undef TARGET_FUNCTION_VALUE_REGNO_P
15256 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15258 #undef TARGET_FRAME_POINTER_REQUIRED
15259 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15261 #undef TARGET_GIMPLE_FOLD_BUILTIN
15262 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15264 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15265 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15267 #undef TARGET_INIT_BUILTINS
15268 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15270 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15271 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15272 aarch64_ira_change_pseudo_allocno_class
15274 #undef TARGET_LEGITIMATE_ADDRESS_P
15275 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15277 #undef TARGET_LEGITIMATE_CONSTANT_P
15278 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15280 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15281 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15282 aarch64_legitimize_address_displacement
15284 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15285 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15287 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15288 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15289 aarch64_libgcc_floating_mode_supported_p
15291 #undef TARGET_MANGLE_TYPE
15292 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15294 #undef TARGET_MEMORY_MOVE_COST
15295 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15297 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15298 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15300 #undef TARGET_MUST_PASS_IN_STACK
15301 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15303 /* This target hook should return true if accesses to volatile bitfields
15304 should use the narrowest mode possible. It should return false if these
15305 accesses should use the bitfield container type. */
15306 #undef TARGET_NARROW_VOLATILE_BITFIELD
15307 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv
/* Section anchor support. */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned. */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
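/* Illustrative only: a section anchor lets several nearby objects be
   addressed as small offsets from one shared symbol, so the compiler can
   materialise the anchor address once and reuse it.  Schematically
   (hand-written, not compiler output; ANCHOR and OFFSET are placeholders),
   with OFFSET constrained to the [-256, 4095] window set by the two
   macros above:

	adrp	x0, ANCHOR		// page containing the anchor
	add	x0, x0, :lo12:ANCHOR	// full anchor address
	ldr	w1, [x0, OFFSET]	// object at ANCHOR + OFFSET  */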
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support. */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
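/* A minimal illustrative sketch, not used by the port and kept out of
   the build with #if 0: the value 4 (1 << 2) above means bit 2 of a
   function "pointer" is the marker the generic descriptor machinery
   tests at run time to tell a descriptor for a nested function from an
   ordinary code address.  The helper name below is made up for this
   example. */
#if 0
static int
points_to_descriptor_example (unsigned long fn_addr)
{
  return (fn_addr & 4) != 0;	/* Bit 2 set: FN_ADDR names a descriptor. */
}
#endif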
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"