1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
69
70 /* This file should be included last. */
71 #include "target-def.h"
72
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
75
76 /* Classifies an address.
77
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
80
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
83
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
86
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
89
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
92
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
95
 96 ADDRESS_SYMBOLIC
 97 A constant symbolic address, held in the pc-relative literal pool. */
98
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
107 };
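/* Purely illustrative sketch (not part of the original source): typical
   AArch64 assembly forms corresponding to each class, using x0/x1/w2 as
   arbitrary example registers:

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]   (paired with an adrp)
     ADDRESS_SYMBOLIC   ldr x0, .Lliteral          (pc-relative literal load)  */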
108
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
115 };
116
117 struct simd_immediate_info
118 {
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
124 };
125
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
128
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
133
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
150
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
153
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
156
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
159
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
162
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
166 {
167 const char* name;
168 unsigned int flag;
169 };
170
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
174 {
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
179 };
180
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
184 {
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
189 };
190
191 /* Tuning parameters. */
192
193 static const struct cpu_addrcost_table generic_addrcost_table =
194 {
195 {
196 1, /* hi */
197 0, /* si */
198 0, /* di */
199 1, /* ti */
200 },
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
207 };
208
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 {
211 {
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
216 },
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
223 };
224
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
226 {
227 {
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
232 },
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
239 };
240
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
242 {
243 {
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
248 },
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
255 };
256
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
274 {
275 {
276 1, /* hi */
277 1, /* si */
278 1, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_regmove_cost generic_regmove_cost =
290 {
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
297 };
298
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
300 {
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
307 };
308
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
310 {
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
317 };
318
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
320 {
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
 323 their cost higher than memmov_cost (the actual costs are 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
327 };
328
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
330 {
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
335 };
336
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
338 {
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
345 };
346
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
348 {
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
354 };
355
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
357 {
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
363 };
364
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
367 {
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
383 };
384
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost =
387 {
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
403 };
404
405 /* Generic costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost =
407 {
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
423 };
424
425 static const struct cpu_vector_cost exynosm1_vector_cost =
426 {
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
442 };
443
444 /* Generic costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost =
446 {
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
462 };
463
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
466 {
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
482 };
483
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost =
486 {
487 1, /* Predictable. */
488 3 /* Unpredictable. */
489 };
490
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost =
493 {
494 1, /* Predictable. */
495 3 /* Unpredictable. */
496 };
497
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
500 {
501 1, /* Predictable. */
502 3 /* Unpredictable. */
503 };
504
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes =
507 {
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_NONE /* recip_sqrt */
511 };
512
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes =
515 {
516 AARCH64_APPROX_NONE, /* division */
517 AARCH64_APPROX_ALL, /* sqrt */
518 AARCH64_APPROX_ALL /* recip_sqrt */
519 };
520
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes =
523 {
524 AARCH64_APPROX_NONE, /* division */
525 AARCH64_APPROX_NONE, /* sqrt */
526 AARCH64_APPROX_ALL /* recip_sqrt */
527 };
528
529 /* Generic prefetch settings (which disable prefetch). */
530 static const cpu_prefetch_tune generic_prefetch_tune =
531 {
532 0, /* num_slots */
533 -1, /* l1_cache_size */
534 -1, /* l1_cache_line_size */
535 -1, /* l2_cache_size */
536 -1 /* default_opt_level */
537 };
538
539 static const cpu_prefetch_tune exynosm1_prefetch_tune =
540 {
541 0, /* num_slots */
542 -1, /* l1_cache_size */
543 64, /* l1_cache_line_size */
544 -1, /* l2_cache_size */
545 -1 /* default_opt_level */
546 };
547
548 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
549 {
550 4, /* num_slots */
551 32, /* l1_cache_size */
552 64, /* l1_cache_line_size */
553 1024, /* l2_cache_size */
554 3 /* default_opt_level */
555 };
556
557 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
558 {
559 0, /* num_slots */
560 -1, /* l1_cache_size */
561 64, /* l1_cache_line_size */
562 -1, /* l2_cache_size */
563 -1 /* default_opt_level */
564 };
565
566 static const struct tune_params generic_tunings =
567 {
568 &cortexa57_extra_costs,
569 &generic_addrcost_table,
570 &generic_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 2, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
577 8, /* function_align. */
578 4, /* jump_align. */
579 8, /* loop_align. */
580 2, /* int_reassoc_width. */
581 4, /* fp_reassoc_width. */
582 1, /* vec_reassoc_width. */
583 2, /* min_div_recip_mul_sf. */
584 2, /* min_div_recip_mul_df. */
585 0, /* max_case_values. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
588 &generic_prefetch_tune
589 };
590
591 static const struct tune_params cortexa35_tunings =
592 {
593 &cortexa53_extra_costs,
594 &generic_addrcost_table,
595 &cortexa53_regmove_cost,
596 &generic_vector_cost,
597 &cortexa57_branch_cost,
598 &generic_approx_modes,
599 4, /* memmov_cost */
600 1, /* issue_rate */
601 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
602 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
603 16, /* function_align. */
604 4, /* jump_align. */
605 8, /* loop_align. */
606 2, /* int_reassoc_width. */
607 4, /* fp_reassoc_width. */
608 1, /* vec_reassoc_width. */
609 2, /* min_div_recip_mul_sf. */
610 2, /* min_div_recip_mul_df. */
611 0, /* max_case_values. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
614 &generic_prefetch_tune
615 };
616
617 static const struct tune_params cortexa53_tunings =
618 {
619 &cortexa53_extra_costs,
620 &generic_addrcost_table,
621 &cortexa53_regmove_cost,
622 &generic_vector_cost,
623 &cortexa57_branch_cost,
624 &generic_approx_modes,
625 4, /* memmov_cost */
626 2, /* issue_rate */
627 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
628 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
629 16, /* function_align. */
630 4, /* jump_align. */
631 8, /* loop_align. */
632 2, /* int_reassoc_width. */
633 4, /* fp_reassoc_width. */
634 1, /* vec_reassoc_width. */
635 2, /* min_div_recip_mul_sf. */
636 2, /* min_div_recip_mul_df. */
637 0, /* max_case_values. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
640 &generic_prefetch_tune
641 };
642
643 static const struct tune_params cortexa57_tunings =
644 {
645 &cortexa57_extra_costs,
646 &cortexa57_addrcost_table,
647 &cortexa57_regmove_cost,
648 &cortexa57_vector_cost,
649 &cortexa57_branch_cost,
650 &generic_approx_modes,
651 4, /* memmov_cost */
652 3, /* issue_rate */
653 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
654 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
655 16, /* function_align. */
656 4, /* jump_align. */
657 8, /* loop_align. */
658 2, /* int_reassoc_width. */
659 4, /* fp_reassoc_width. */
660 1, /* vec_reassoc_width. */
661 2, /* min_div_recip_mul_sf. */
662 2, /* min_div_recip_mul_df. */
663 0, /* max_case_values. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
666 &generic_prefetch_tune
667 };
668
669 static const struct tune_params cortexa72_tunings =
670 {
671 &cortexa57_extra_costs,
672 &cortexa57_addrcost_table,
673 &cortexa57_regmove_cost,
674 &cortexa57_vector_cost,
675 &cortexa57_branch_cost,
676 &generic_approx_modes,
677 4, /* memmov_cost */
678 3, /* issue_rate */
679 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
680 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
681 16, /* function_align. */
682 4, /* jump_align. */
683 8, /* loop_align. */
684 2, /* int_reassoc_width. */
685 4, /* fp_reassoc_width. */
686 1, /* vec_reassoc_width. */
687 2, /* min_div_recip_mul_sf. */
688 2, /* min_div_recip_mul_df. */
689 0, /* max_case_values. */
690 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
691 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
692 &generic_prefetch_tune
693 };
694
695 static const struct tune_params cortexa73_tunings =
696 {
697 &cortexa57_extra_costs,
698 &cortexa57_addrcost_table,
699 &cortexa57_regmove_cost,
700 &cortexa57_vector_cost,
701 &cortexa57_branch_cost,
702 &generic_approx_modes,
703 4, /* memmov_cost. */
704 2, /* issue_rate. */
705 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
706 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
707 16, /* function_align. */
708 4, /* jump_align. */
709 8, /* loop_align. */
710 2, /* int_reassoc_width. */
711 4, /* fp_reassoc_width. */
712 1, /* vec_reassoc_width. */
713 2, /* min_div_recip_mul_sf. */
714 2, /* min_div_recip_mul_df. */
715 0, /* max_case_values. */
716 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
717 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
718 &generic_prefetch_tune
719 };
720
721
722
723 static const struct tune_params exynosm1_tunings =
724 {
725 &exynosm1_extra_costs,
726 &exynosm1_addrcost_table,
727 &exynosm1_regmove_cost,
728 &exynosm1_vector_cost,
729 &generic_branch_cost,
730 &exynosm1_approx_modes,
731 4, /* memmov_cost */
732 3, /* issue_rate */
733 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
734 4, /* function_align. */
735 4, /* jump_align. */
736 4, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 48, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
745 &exynosm1_prefetch_tune
746 };
747
748 static const struct tune_params thunderx_tunings =
749 {
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
770 &generic_prefetch_tune
771 };
772
773 static const struct tune_params xgene1_tunings =
774 {
775 &xgene1_extra_costs,
776 &xgene1_addrcost_table,
777 &xgene1_regmove_cost,
778 &xgene1_vector_cost,
779 &generic_branch_cost,
780 &xgene1_approx_modes,
781 6, /* memmov_cost */
782 4, /* issue_rate */
783 AARCH64_FUSE_NOTHING, /* fusible_ops */
784 16, /* function_align. */
785 8, /* jump_align. */
786 16, /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params qdf24xx_tunings =
799 {
800 &qdf24xx_extra_costs,
801 &qdf24xx_addrcost_table,
802 &qdf24xx_regmove_cost,
803 &generic_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 4, /* memmov_cost */
807 4, /* issue_rate */
808 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 16, /* function_align. */
811 8, /* jump_align. */
812 16, /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
821 &qdf24xx_prefetch_tune
822 };
823
824 static const struct tune_params thunderx2t99_tunings =
825 {
826 &thunderx2t99_extra_costs,
827 &thunderx2t99_addrcost_table,
828 &thunderx2t99_regmove_cost,
829 &thunderx2t99_vector_cost,
830 &thunderx2t99_branch_cost,
831 &generic_approx_modes,
832 4, /* memmov_cost. */
833 4, /* issue_rate. */
834 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops */
835 16, /* function_align. */
836 8, /* jump_align. */
837 16, /* loop_align. */
838 3, /* int_reassoc_width. */
839 2, /* fp_reassoc_width. */
840 2, /* vec_reassoc_width. */
841 2, /* min_div_recip_mul_sf. */
842 2, /* min_div_recip_mul_df. */
843 0, /* max_case_values. */
844 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
845 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
846 &thunderx2t99_prefetch_tune
847 };
848
849 /* Support for fine-grained override of the tuning structures. */
850 struct aarch64_tuning_override_function
851 {
852 const char* name;
853 void (*parse_override)(const char*, struct tune_params*);
854 };
855
856 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
857 static void aarch64_parse_tune_string (const char*, struct tune_params*);
858
859 static const struct aarch64_tuning_override_function
860 aarch64_tuning_override_functions[] =
861 {
862 { "fuse", aarch64_parse_fuse_string },
863 { "tune", aarch64_parse_tune_string },
864 { NULL, NULL }
865 };
866
867 /* A processor implementing AArch64. */
868 struct processor
869 {
870 const char *const name;
871 enum aarch64_processor ident;
872 enum aarch64_processor sched_core;
873 enum aarch64_arch arch;
874 unsigned architecture_version;
875 const unsigned long flags;
876 const struct tune_params *const tune;
877 };
878
879 /* Architectures implementing AArch64. */
880 static const struct processor all_architectures[] =
881 {
882 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
883 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
884 #include "aarch64-arches.def"
885 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
886 };
887
888 /* Processor cores implementing AArch64. */
889 static const struct processor all_cores[] =
890 {
891 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
892 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
893 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
894 FLAGS, &COSTS##_tunings},
895 #include "aarch64-cores.def"
896 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
897 AARCH64_FL_FOR_ARCH8, &generic_tunings},
898 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
899 };
900
901
902 /* Target specification. These are populated by the -march, -mtune, -mcpu
903 handling code or by target attributes. */
904 static const struct processor *selected_arch;
905 static const struct processor *selected_cpu;
906 static const struct processor *selected_tune;
907
908 /* The current tuning set. */
909 struct tune_params aarch64_tune_params = generic_tunings;
910
911 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
912
913 /* An ISA extension in the co-processor and main instruction set space. */
914 struct aarch64_option_extension
915 {
916 const char *const name;
917 const unsigned long flags_on;
918 const unsigned long flags_off;
919 };
920
921 typedef enum aarch64_cond_code
922 {
923 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
924 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
925 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
926 }
927 aarch64_cc;
928
929 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
930
931 /* The condition codes of the processor, and the inverse function. */
932 static const char * const aarch64_condition_codes[] =
933 {
934 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
935 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
936 };
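/* Illustrative note (not from the original source): the conditions are
   encoded so that each one sits next to its inverse, hence XOR-ing the low
   bit in AARCH64_INVERSE_CONDITION_CODE flips the condition, e.g.

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   ("eq" -> "ne")
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT   ("ge" -> "lt")  */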
937
938 /* Generate code to enable conditional branches in functions over 1 MiB. */
939 const char *
940 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
941 const char * branch_format)
942 {
943 rtx_code_label * tmp_label = gen_label_rtx ();
944 char label_buf[256];
945 char buffer[128];
946 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
947 CODE_LABEL_NUMBER (tmp_label));
948 const char *label_ptr = targetm.strip_name_encoding (label_buf);
949 rtx dest_label = operands[pos_label];
950 operands[pos_label] = tmp_label;
951
952 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
953 output_asm_insn (buffer, operands);
954
955 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
956 operands[pos_label] = dest_label;
957 output_asm_insn (buffer, operands);
958 return "";
959 }
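/* A hedged sketch of what the routine above emits; the label names and the
   particular condition are made up for illustration, and the caller is
   assumed to pass BRANCH_FORMAT with the condition already inverted.  A far
   "b.eq target" then becomes a short inverted branch over an unconditional
   branch, which has a +/-128 MiB range instead of b.cond's +/-1 MiB:

     b.ne  .Ltmp0        // inverted short branch skips the far jump
     b     target        // unconditional branch reaches the real label
     .Ltmp0:
*/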
960
961 void
962 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
963 {
964 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
965 if (TARGET_GENERAL_REGS_ONLY)
966 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
967 else
968 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
969 }
970
971 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
972 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
973 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
974 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
975 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 976 irrespective of its cost results in bad allocations with many redundant
977 int<->FP moves which are expensive on various cores.
978 To avoid this we don't allow ALL_REGS as the allocno class, but force a
979 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
980 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
981 Otherwise set the allocno class depending on the mode.
982 The result of this is that it is no longer inefficient to have a higher
983 memory move cost than the register move cost.
984 */
985
986 static reg_class_t
987 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
988 reg_class_t best_class)
989 {
990 enum machine_mode mode;
991
992 if (allocno_class != ALL_REGS)
993 return allocno_class;
994
995 if (best_class != ALL_REGS)
996 return best_class;
997
998 mode = PSEUDO_REGNO_MODE (regno);
999 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1000 }
1001
1002 static unsigned int
1003 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
1004 {
1005 if (GET_MODE_UNIT_SIZE (mode) == 4)
1006 return aarch64_tune_params.min_div_recip_mul_sf;
1007 return aarch64_tune_params.min_div_recip_mul_df;
1008 }
1009
1010 static int
1011 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1012 enum machine_mode mode)
1013 {
1014 if (VECTOR_MODE_P (mode))
1015 return aarch64_tune_params.vec_reassoc_width;
1016 if (INTEGRAL_MODE_P (mode))
1017 return aarch64_tune_params.int_reassoc_width;
1018 if (FLOAT_MODE_P (mode))
1019 return aarch64_tune_params.fp_reassoc_width;
1020 return 1;
1021 }
1022
1023 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1024 unsigned
1025 aarch64_dbx_register_number (unsigned regno)
1026 {
1027 if (GP_REGNUM_P (regno))
1028 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1029 else if (regno == SP_REGNUM)
1030 return AARCH64_DWARF_SP;
1031 else if (FP_REGNUM_P (regno))
1032 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1033
1034 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1035 equivalent DWARF register. */
1036 return DWARF_FRAME_REGISTERS;
1037 }
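/* Example mapping (illustrative; the AARCH64_DWARF_* bases follow the
   AArch64 DWARF ABI): x0 -> 0, x29 -> 29, sp -> 31, v0 -> 64, v31 -> 95.
   Anything else (e.g. the condition flags) maps to DWARF_FRAME_REGISTERS,
   meaning "no DWARF equivalent".  */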
1038
1039 /* Return TRUE if MODE is any of the large INT modes. */
1040 static bool
1041 aarch64_vect_struct_mode_p (machine_mode mode)
1042 {
1043 return mode == OImode || mode == CImode || mode == XImode;
1044 }
1045
1046 /* Return TRUE if MODE is any of the vector modes. */
1047 static bool
1048 aarch64_vector_mode_p (machine_mode mode)
1049 {
1050 return aarch64_vector_mode_supported_p (mode)
1051 || aarch64_vect_struct_mode_p (mode);
1052 }
1053
1054 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1055 static bool
1056 aarch64_array_mode_supported_p (machine_mode mode,
1057 unsigned HOST_WIDE_INT nelems)
1058 {
1059 if (TARGET_SIMD
1060 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1061 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1062 && (nelems >= 2 && nelems <= 4))
1063 return true;
1064
1065 return false;
1066 }
1067
1068 /* Implement HARD_REGNO_NREGS. */
1069
1070 int
1071 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1072 {
1073 switch (aarch64_regno_regclass (regno))
1074 {
1075 case FP_REGS:
1076 case FP_LO_REGS:
1077 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1078 default:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1080 }
1081 gcc_unreachable ();
1082 }
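/* Worked examples (illustrative, not from the original source), assuming
   UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8:
     - TImode (16 bytes) in a general register pair needs 2 registers;
     - V4SImode (16 bytes) in an FP/SIMD register needs 1 register;
     - OImode (32 bytes, a two-vector tuple) in FP/SIMD registers needs 2.  */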
1083
1084 /* Implement HARD_REGNO_MODE_OK. */
1085
1086 int
1087 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1088 {
1089 if (GET_MODE_CLASS (mode) == MODE_CC)
1090 return regno == CC_REGNUM;
1091
1092 if (regno == SP_REGNUM)
1093 /* The purpose of comparing with ptr_mode is to support the
1094 global register variable associated with the stack pointer
1095 register via the syntax of asm ("wsp") in ILP32. */
1096 return mode == Pmode || mode == ptr_mode;
1097
1098 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1099 return mode == Pmode;
1100
1101 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1102 return 1;
1103
1104 if (FP_REGNUM_P (regno))
1105 {
1106 if (aarch64_vect_struct_mode_p (mode))
1107 return
1108 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1109 else
1110 return 1;
1111 }
1112
1113 return 0;
1114 }
1115
1116 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1117 machine_mode
1118 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1119 machine_mode mode)
1120 {
1121 /* Handle modes that fit within single registers. */
1122 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1123 {
1124 if (GET_MODE_SIZE (mode) >= 4)
1125 return mode;
1126 else
1127 return SImode;
1128 }
1129 /* Fall back to generic for multi-reg and very large modes. */
1130 else
1131 return choose_hard_reg_mode (regno, nregs, false);
1132 }
1133
1134 /* Return true if calls to DECL should be treated as
1135 long-calls (ie called via a register). */
1136 static bool
1137 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1138 {
1139 return false;
1140 }
1141
1142 /* Return true if calls to symbol-ref SYM should be treated as
1143 long-calls (ie called via a register). */
1144 bool
1145 aarch64_is_long_call_p (rtx sym)
1146 {
1147 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1148 }
1149
1150 /* Return true if calls to symbol-ref SYM should not go through
1151 plt stubs. */
1152
1153 bool
1154 aarch64_is_noplt_call_p (rtx sym)
1155 {
1156 const_tree decl = SYMBOL_REF_DECL (sym);
1157
1158 if (flag_pic
1159 && decl
1160 && (!flag_plt
1161 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1162 && !targetm.binds_local_p (decl))
1163 return true;
1164
1165 return false;
1166 }
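/* Usage note (a sketch, not from the original source): when compiling with
   -fpic and either -fno-plt or a declaration such as

     extern void ext_fn (void) __attribute__ ((noplt));

   a call to a non-locally-binding ext_fn is reported as a no-PLT call, so it
   can be emitted as an indirect call through the GOT entry rather than
   through a PLT stub.  ext_fn is a made-up name used only for illustration.  */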
1167
1168 /* Return true if the offsets to a zero/sign-extract operation
1169 represent an expression that matches an extend operation. The
 1170 operands represent the parameters from
1171
1172 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1173 bool
1174 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1175 rtx extract_imm)
1176 {
1177 HOST_WIDE_INT mult_val, extract_val;
1178
1179 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1180 return false;
1181
1182 mult_val = INTVAL (mult_imm);
1183 extract_val = INTVAL (extract_imm);
1184
1185 if (extract_val > 8
1186 && extract_val < GET_MODE_BITSIZE (mode)
1187 && exact_log2 (extract_val & ~7) > 0
1188 && (extract_val & 7) <= 4
1189 && mult_val == (1 << (extract_val & 7)))
1190 return true;
1191
1192 return false;
1193 }
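/* Worked example (illustrative): EXTRACT_IMM == 34 and MULT_IMM == 4 satisfy
   every test above, since 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4
   and 1 << 2 == 4.  So

     (extract:DI (mult (reg) (const_int 4)) (const_int 34) (const_int 0))

   is recognised as an extend (uxtw/sxtw style) of a 32-bit value scaled
   by 4.  */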
1194
1195 /* Emit an insn that's a simple single-set. Both the operands must be
1196 known to be valid. */
1197 inline static rtx_insn *
1198 emit_set_insn (rtx x, rtx y)
1199 {
1200 return emit_insn (gen_rtx_SET (x, y));
1201 }
1202
1203 /* X and Y are two things to compare using CODE. Emit the compare insn and
1204 return the rtx for register 0 in the proper mode. */
1205 rtx
1206 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1207 {
1208 machine_mode mode = SELECT_CC_MODE (code, x, y);
1209 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1210
1211 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1212 return cc_reg;
1213 }
1214
1215 /* Build the SYMBOL_REF for __tls_get_addr. */
1216
1217 static GTY(()) rtx tls_get_addr_libfunc;
1218
1219 rtx
1220 aarch64_tls_get_addr (void)
1221 {
1222 if (!tls_get_addr_libfunc)
1223 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1224 return tls_get_addr_libfunc;
1225 }
1226
1227 /* Return the TLS model to use for ADDR. */
1228
1229 static enum tls_model
1230 tls_symbolic_operand_type (rtx addr)
1231 {
1232 enum tls_model tls_kind = TLS_MODEL_NONE;
1233 rtx sym, addend;
1234
1235 if (GET_CODE (addr) == CONST)
1236 {
1237 split_const (addr, &sym, &addend);
1238 if (GET_CODE (sym) == SYMBOL_REF)
1239 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1240 }
1241 else if (GET_CODE (addr) == SYMBOL_REF)
1242 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1243
1244 return tls_kind;
1245 }
1246
 1247 /* We allow lo_sum expressions in our legitimate addresses so that
 1248 combine can take care of combining addresses where necessary, but
 1249 for generation purposes we generate the address
 1250 as:
1251 RTL Absolute
1252 tmp = hi (symbol_ref); adrp x1, foo
1253 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1254 nop
1255
1256 PIC TLS
1257 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1258 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1259 bl __tls_get_addr
1260 nop
1261
1262 Load TLS symbol, depending on TLS mechanism and TLS access model.
1263
1264 Global Dynamic - Traditional TLS:
1265 adrp tmp, :tlsgd:imm
1266 add dest, tmp, #:tlsgd_lo12:imm
1267 bl __tls_get_addr
1268
1269 Global Dynamic - TLS Descriptors:
1270 adrp dest, :tlsdesc:imm
1271 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1272 add dest, dest, #:tlsdesc_lo12:imm
1273 blr tmp
1274 mrs tp, tpidr_el0
1275 add dest, dest, tp
1276
1277 Initial Exec:
1278 mrs tp, tpidr_el0
1279 adrp tmp, :gottprel:imm
1280 ldr dest, [tmp, #:gottprel_lo12:imm]
1281 add dest, dest, tp
1282
1283 Local Exec:
1284 mrs tp, tpidr_el0
1285 add t0, tp, #:tprel_hi12:imm, lsl #12
1286 add t0, t0, #:tprel_lo12_nc:imm
1287 */
1288
1289 static void
1290 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1291 enum aarch64_symbol_type type)
1292 {
1293 switch (type)
1294 {
1295 case SYMBOL_SMALL_ABSOLUTE:
1296 {
1297 /* In ILP32, the mode of dest can be either SImode or DImode. */
1298 rtx tmp_reg = dest;
1299 machine_mode mode = GET_MODE (dest);
1300
1301 gcc_assert (mode == Pmode || mode == ptr_mode);
1302
1303 if (can_create_pseudo_p ())
1304 tmp_reg = gen_reg_rtx (mode);
1305
1306 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1307 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1308 return;
1309 }
1310
1311 case SYMBOL_TINY_ABSOLUTE:
1312 emit_insn (gen_rtx_SET (dest, imm));
1313 return;
1314
1315 case SYMBOL_SMALL_GOT_28K:
1316 {
1317 machine_mode mode = GET_MODE (dest);
1318 rtx gp_rtx = pic_offset_table_rtx;
1319 rtx insn;
1320 rtx mem;
1321
 1322 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
 1323 here before RTL expansion. Tree IVOPTS generates RTL patterns to
 1324 decide rtx costs, in which case pic_offset_table_rtx is not
 1325 initialized. In that case there is no need to generate the first
 1326 adrp instruction, as the final cost for global variable access is
 1327 one instruction. */
1328 if (gp_rtx != NULL)
1329 {
 1330 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 1331 use the page base as the GOT base, the first page may be wasted;
 1332 in the worst case only 28K of GOT space is available).
 1333 
 1334 The generated instruction sequence for accessing a global variable
 1335 is:
 1336 
 1337 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
 1338 
 1339 Only one instruction is needed. But we must initialize
 1340 pic_offset_table_rtx properly. We generate an initialization insn for
 1341 every global access and rely on CSE to remove the redundant ones.
 1342 
 1343 The final instruction sequence for accessing multiple global
 1344 variables will then look like the following.
1345
1346 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1347
1348 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1350 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1351 ... */
1352
1353 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1354 crtl->uses_pic_offset_table = 1;
1355 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1356
1357 if (mode != GET_MODE (gp_rtx))
1358 gp_rtx = gen_lowpart (mode, gp_rtx);
1359
1360 }
1361
1362 if (mode == ptr_mode)
1363 {
1364 if (mode == DImode)
1365 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1366 else
1367 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1368
1369 mem = XVECEXP (SET_SRC (insn), 0, 0);
1370 }
1371 else
1372 {
1373 gcc_assert (mode == Pmode);
1374
1375 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1376 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1377 }
1378
 1379 /* The operand is expected to be a MEM. Whenever the related insn
 1380 pattern changes, the above code which calculates MEM should be
 1381 updated. */
1382 gcc_assert (GET_CODE (mem) == MEM);
1383 MEM_READONLY_P (mem) = 1;
1384 MEM_NOTRAP_P (mem) = 1;
1385 emit_insn (insn);
1386 return;
1387 }
1388
1389 case SYMBOL_SMALL_GOT_4G:
1390 {
1391 /* In ILP32, the mode of dest can be either SImode or DImode,
1392 while the got entry is always of SImode size. The mode of
1393 dest depends on how dest is used: if dest is assigned to a
 1394 pointer (e.g. stored in memory), it has SImode; it may have
 1395 DImode if dest is dereferenced to access the memory.
1396 This is why we have to handle three different ldr_got_small
1397 patterns here (two patterns for ILP32). */
1398
1399 rtx insn;
1400 rtx mem;
1401 rtx tmp_reg = dest;
1402 machine_mode mode = GET_MODE (dest);
1403
1404 if (can_create_pseudo_p ())
1405 tmp_reg = gen_reg_rtx (mode);
1406
1407 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1408 if (mode == ptr_mode)
1409 {
1410 if (mode == DImode)
1411 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1412 else
1413 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1414
1415 mem = XVECEXP (SET_SRC (insn), 0, 0);
1416 }
1417 else
1418 {
1419 gcc_assert (mode == Pmode);
1420
1421 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1422 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1423 }
1424
1425 gcc_assert (GET_CODE (mem) == MEM);
1426 MEM_READONLY_P (mem) = 1;
1427 MEM_NOTRAP_P (mem) = 1;
1428 emit_insn (insn);
1429 return;
1430 }
1431
1432 case SYMBOL_SMALL_TLSGD:
1433 {
1434 rtx_insn *insns;
1435 machine_mode mode = GET_MODE (dest);
1436 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1437
1438 start_sequence ();
1439 if (TARGET_ILP32)
1440 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1441 else
1442 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1443 insns = get_insns ();
1444 end_sequence ();
1445
1446 RTL_CONST_CALL_P (insns) = 1;
1447 emit_libcall_block (insns, dest, result, imm);
1448 return;
1449 }
1450
1451 case SYMBOL_SMALL_TLSDESC:
1452 {
1453 machine_mode mode = GET_MODE (dest);
1454 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1455 rtx tp;
1456
1457 gcc_assert (mode == Pmode || mode == ptr_mode);
1458
1459 /* In ILP32, the got entry is always of SImode size. Unlike
1460 small GOT, the dest is fixed at reg 0. */
1461 if (TARGET_ILP32)
1462 emit_insn (gen_tlsdesc_small_si (imm));
1463 else
1464 emit_insn (gen_tlsdesc_small_di (imm));
1465 tp = aarch64_load_tp (NULL);
1466
1467 if (mode != Pmode)
1468 tp = gen_lowpart (mode, tp);
1469
1470 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1471 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1472 return;
1473 }
1474
1475 case SYMBOL_SMALL_TLSIE:
1476 {
1477 /* In ILP32, the mode of dest can be either SImode or DImode,
1478 while the got entry is always of SImode size. The mode of
1479 dest depends on how dest is used: if dest is assigned to a
 1480 pointer (e.g. stored in memory), it has SImode; it may have
 1481 DImode if dest is dereferenced to access the memory.
1482 This is why we have to handle three different tlsie_small
1483 patterns here (two patterns for ILP32). */
1484 machine_mode mode = GET_MODE (dest);
1485 rtx tmp_reg = gen_reg_rtx (mode);
1486 rtx tp = aarch64_load_tp (NULL);
1487
1488 if (mode == ptr_mode)
1489 {
1490 if (mode == DImode)
1491 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1492 else
1493 {
1494 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1495 tp = gen_lowpart (mode, tp);
1496 }
1497 }
1498 else
1499 {
1500 gcc_assert (mode == Pmode);
1501 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1502 }
1503
1504 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1505 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1506 return;
1507 }
1508
1509 case SYMBOL_TLSLE12:
1510 case SYMBOL_TLSLE24:
1511 case SYMBOL_TLSLE32:
1512 case SYMBOL_TLSLE48:
1513 {
1514 machine_mode mode = GET_MODE (dest);
1515 rtx tp = aarch64_load_tp (NULL);
1516
1517 if (mode != Pmode)
1518 tp = gen_lowpart (mode, tp);
1519
1520 switch (type)
1521 {
1522 case SYMBOL_TLSLE12:
1523 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1524 (dest, tp, imm));
1525 break;
1526 case SYMBOL_TLSLE24:
1527 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1528 (dest, tp, imm));
1529 break;
1530 case SYMBOL_TLSLE32:
1531 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1532 (dest, imm));
1533 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1534 (dest, dest, tp));
1535 break;
1536 case SYMBOL_TLSLE48:
1537 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1538 (dest, imm));
1539 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1540 (dest, dest, tp));
1541 break;
1542 default:
1543 gcc_unreachable ();
1544 }
1545
1546 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1547 return;
1548 }
1549
1550 case SYMBOL_TINY_GOT:
1551 emit_insn (gen_ldr_got_tiny (dest, imm));
1552 return;
1553
1554 case SYMBOL_TINY_TLSIE:
1555 {
1556 machine_mode mode = GET_MODE (dest);
1557 rtx tp = aarch64_load_tp (NULL);
1558
1559 if (mode == ptr_mode)
1560 {
1561 if (mode == DImode)
1562 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1563 else
1564 {
1565 tp = gen_lowpart (mode, tp);
1566 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1567 }
1568 }
1569 else
1570 {
1571 gcc_assert (mode == Pmode);
1572 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1573 }
1574
1575 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1576 return;
1577 }
1578
1579 default:
1580 gcc_unreachable ();
1581 }
1582 }
1583
1584 /* Emit a move from SRC to DEST. Assume that the move expanders can
1585 handle all moves if !can_create_pseudo_p (). The distinction is
1586 important because, unlike emit_move_insn, the move expanders know
1587 how to force Pmode objects into the constant pool even when the
1588 constant pool address is not itself legitimate. */
1589 static rtx
1590 aarch64_emit_move (rtx dest, rtx src)
1591 {
1592 return (can_create_pseudo_p ()
1593 ? emit_move_insn (dest, src)
1594 : emit_move_insn_1 (dest, src));
1595 }
1596
1597 /* Split a 128-bit move operation into two 64-bit move operations,
1598 taking care to handle partial overlap of register to register
1599 copies. Special cases are needed when moving between GP regs and
1600 FP regs. SRC can be a register, constant or memory; DST a register
1601 or memory. If either operand is memory it must not have any side
1602 effects. */
1603 void
1604 aarch64_split_128bit_move (rtx dst, rtx src)
1605 {
1606 rtx dst_lo, dst_hi;
1607 rtx src_lo, src_hi;
1608
1609 machine_mode mode = GET_MODE (dst);
1610
1611 gcc_assert (mode == TImode || mode == TFmode);
1612 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1613 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1614
1615 if (REG_P (dst) && REG_P (src))
1616 {
1617 int src_regno = REGNO (src);
1618 int dst_regno = REGNO (dst);
1619
1620 /* Handle FP <-> GP regs. */
1621 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1622 {
1623 src_lo = gen_lowpart (word_mode, src);
1624 src_hi = gen_highpart (word_mode, src);
1625
1626 if (mode == TImode)
1627 {
1628 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1629 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1630 }
1631 else
1632 {
1633 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1634 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1635 }
1636 return;
1637 }
1638 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1639 {
1640 dst_lo = gen_lowpart (word_mode, dst);
1641 dst_hi = gen_highpart (word_mode, dst);
1642
1643 if (mode == TImode)
1644 {
1645 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1646 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1647 }
1648 else
1649 {
1650 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1651 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1652 }
1653 return;
1654 }
1655 }
1656
1657 dst_lo = gen_lowpart (word_mode, dst);
1658 dst_hi = gen_highpart (word_mode, dst);
1659 src_lo = gen_lowpart (word_mode, src);
1660 src_hi = gen_highpart_mode (word_mode, mode, src);
1661
1662 /* At most one pairing may overlap. */
1663 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1664 {
1665 aarch64_emit_move (dst_hi, src_hi);
1666 aarch64_emit_move (dst_lo, src_lo);
1667 }
1668 else
1669 {
1670 aarch64_emit_move (dst_lo, src_lo);
1671 aarch64_emit_move (dst_hi, src_hi);
1672 }
1673 }
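/* Overlap example (illustrative register choices): splitting a TImode copy
   whose source lives in {x1,x2} and whose destination is {x2,x3} must move
   the high halves first, since x2 is both the destination low half and the
   source high half; the reg_overlap_mentioned_p test above selects that
   ordering.  */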
1674
1675 bool
1676 aarch64_split_128bit_move_p (rtx dst, rtx src)
1677 {
1678 return (! REG_P (src)
1679 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1680 }
1681
1682 /* Split a complex SIMD combine. */
1683
1684 void
1685 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1686 {
1687 machine_mode src_mode = GET_MODE (src1);
1688 machine_mode dst_mode = GET_MODE (dst);
1689
1690 gcc_assert (VECTOR_MODE_P (dst_mode));
1691
1692 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1693 {
1694 rtx (*gen) (rtx, rtx, rtx);
1695
1696 switch (src_mode)
1697 {
1698 case V8QImode:
1699 gen = gen_aarch64_simd_combinev8qi;
1700 break;
1701 case V4HImode:
1702 gen = gen_aarch64_simd_combinev4hi;
1703 break;
1704 case V2SImode:
1705 gen = gen_aarch64_simd_combinev2si;
1706 break;
1707 case V4HFmode:
1708 gen = gen_aarch64_simd_combinev4hf;
1709 break;
1710 case V2SFmode:
1711 gen = gen_aarch64_simd_combinev2sf;
1712 break;
1713 case DImode:
1714 gen = gen_aarch64_simd_combinedi;
1715 break;
1716 case DFmode:
1717 gen = gen_aarch64_simd_combinedf;
1718 break;
1719 default:
1720 gcc_unreachable ();
1721 }
1722
1723 emit_insn (gen (dst, src1, src2));
1724 return;
1725 }
1726 }
1727
1728 /* Split a complex SIMD move. */
1729
1730 void
1731 aarch64_split_simd_move (rtx dst, rtx src)
1732 {
1733 machine_mode src_mode = GET_MODE (src);
1734 machine_mode dst_mode = GET_MODE (dst);
1735
1736 gcc_assert (VECTOR_MODE_P (dst_mode));
1737
1738 if (REG_P (dst) && REG_P (src))
1739 {
1740 rtx (*gen) (rtx, rtx);
1741
1742 gcc_assert (VECTOR_MODE_P (src_mode));
1743
1744 switch (src_mode)
1745 {
1746 case V16QImode:
1747 gen = gen_aarch64_split_simd_movv16qi;
1748 break;
1749 case V8HImode:
1750 gen = gen_aarch64_split_simd_movv8hi;
1751 break;
1752 case V4SImode:
1753 gen = gen_aarch64_split_simd_movv4si;
1754 break;
1755 case V2DImode:
1756 gen = gen_aarch64_split_simd_movv2di;
1757 break;
1758 case V8HFmode:
1759 gen = gen_aarch64_split_simd_movv8hf;
1760 break;
1761 case V4SFmode:
1762 gen = gen_aarch64_split_simd_movv4sf;
1763 break;
1764 case V2DFmode:
1765 gen = gen_aarch64_split_simd_movv2df;
1766 break;
1767 default:
1768 gcc_unreachable ();
1769 }
1770
1771 emit_insn (gen (dst, src));
1772 return;
1773 }
1774 }
1775
1776 bool
1777 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1778 machine_mode ymode, rtx y)
1779 {
1780 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1781 gcc_assert (r != NULL);
1782 return rtx_equal_p (x, r);
1783 }
1784
1785
1786 static rtx
1787 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1788 {
1789 if (can_create_pseudo_p ())
1790 return force_reg (mode, value);
1791 else
1792 {
1793 x = aarch64_emit_move (x, value);
1794 return x;
1795 }
1796 }
1797
1798
1799 static rtx
1800 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1801 {
1802 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1803 {
1804 rtx high;
1805 /* Load the full offset into a register. This
1806 might be improvable in the future. */
1807 high = GEN_INT (offset);
1808 offset = 0;
1809 high = aarch64_force_temporary (mode, temp, high);
1810 reg = aarch64_force_temporary (mode, temp,
1811 gen_rtx_PLUS (mode, high, reg));
1812 }
1813 return plus_constant (mode, reg, offset);
1814 }
1815
1816 static int
1817 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1818 machine_mode mode)
1819 {
1820 int i;
1821 unsigned HOST_WIDE_INT val, val2, mask;
1822 int one_match, zero_match;
1823 int num_insns;
1824
1825 val = INTVAL (imm);
1826
1827 if (aarch64_move_imm (val, mode))
1828 {
1829 if (generate)
1830 emit_insn (gen_rtx_SET (dest, imm));
1831 return 1;
1832 }
1833
1834 if ((val >> 32) == 0 || mode == SImode)
1835 {
1836 if (generate)
1837 {
1838 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1839 if (mode == SImode)
1840 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1841 GEN_INT ((val >> 16) & 0xffff)));
1842 else
1843 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1844 GEN_INT ((val >> 16) & 0xffff)));
1845 }
1846 return 2;
1847 }
1848
1849 /* Remaining cases are all for DImode. */
1850
1851 mask = 0xffff;
1852 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1853 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1854 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1855 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1856
1857 if (zero_match != 2 && one_match != 2)
1858 {
1859 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1860 For a 64-bit bitmask try whether changing 16 bits to all ones or
1861 zeroes creates a valid bitmask. To check any repeated bitmask,
1862 try using 16 bits from the other 32-bit half of val. */
1863
1864 for (i = 0; i < 64; i += 16, mask <<= 16)
1865 {
1866 val2 = val & ~mask;
1867 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1868 break;
1869 val2 = val | mask;
1870 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1871 break;
1872 val2 = val2 & ~mask;
1873 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1874 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1875 break;
1876 }
1877 if (i != 64)
1878 {
1879 if (generate)
1880 {
1881 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1882 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1883 GEN_INT ((val >> i) & 0xffff)));
1884 }
1885 return 2;
1886 }
1887 }
1888
1889 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1890 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1891 otherwise skip zero bits. */
1892
1893 num_insns = 1;
1894 mask = 0xffff;
1895 val2 = one_match > zero_match ? ~val : val;
1896 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1897
1898 if (generate)
1899 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1900 ? (val | ~(mask << i))
1901 : (val & (mask << i)))));
1902 for (i += 16; i < 64; i += 16)
1903 {
1904 if ((val2 & (mask << i)) == 0)
1905 continue;
1906 if (generate)
1907 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1908 GEN_INT ((val >> i) & 0xffff)));
 1909 num_insns++;
1910 }
1911
1912 return num_insns;
1913 }
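/* Illustrative example (w0 is an arbitrary register choice, not from the
   original source): the SImode constant 0x12345678 is neither a MOV-wide nor
   a bitmask immediate, so the SImode path above emits two instructions:

     mov  w0, #0x5678
     movk w0, #0x1234, lsl #16
*/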
1914
1915
1916 void
1917 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1918 {
1919 machine_mode mode = GET_MODE (dest);
1920
1921 gcc_assert (mode == SImode || mode == DImode);
1922
1923 /* Check on what type of symbol it is. */
1924 if (GET_CODE (imm) == SYMBOL_REF
1925 || GET_CODE (imm) == LABEL_REF
1926 || GET_CODE (imm) == CONST)
1927 {
1928 rtx mem, base, offset;
1929 enum aarch64_symbol_type sty;
1930
1931 /* If we have (const (plus symbol offset)), separate out the offset
1932 before we start classifying the symbol. */
1933 split_const (imm, &base, &offset);
1934
1935 sty = aarch64_classify_symbol (base, offset);
1936 switch (sty)
1937 {
1938 case SYMBOL_FORCE_TO_MEM:
1939 if (offset != const0_rtx
1940 && targetm.cannot_force_const_mem (mode, imm))
1941 {
1942 gcc_assert (can_create_pseudo_p ());
1943 base = aarch64_force_temporary (mode, dest, base);
1944 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1945 aarch64_emit_move (dest, base);
1946 return;
1947 }
1948
1949 mem = force_const_mem (ptr_mode, imm);
1950 gcc_assert (mem);
1951
1952 /* If we aren't generating PC relative literals, then
1953 we need to expand the literal pool access carefully.
1954 This is something that needs to be done in a number
1955 of places, so could well live as a separate function. */
1956 if (!aarch64_pcrelative_literal_loads)
1957 {
1958 gcc_assert (can_create_pseudo_p ());
1959 base = gen_reg_rtx (ptr_mode);
1960 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1961 mem = gen_rtx_MEM (ptr_mode, base);
1962 }
1963
1964 if (mode != ptr_mode)
1965 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1966
1967 emit_insn (gen_rtx_SET (dest, mem));
1968
1969 return;
1970
1971 case SYMBOL_SMALL_TLSGD:
1972 case SYMBOL_SMALL_TLSDESC:
1973 case SYMBOL_SMALL_TLSIE:
1974 case SYMBOL_SMALL_GOT_28K:
1975 case SYMBOL_SMALL_GOT_4G:
1976 case SYMBOL_TINY_GOT:
1977 case SYMBOL_TINY_TLSIE:
1978 if (offset != const0_rtx)
1979 {
1980 	      gcc_assert (can_create_pseudo_p ());
1981 base = aarch64_force_temporary (mode, dest, base);
1982 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1983 aarch64_emit_move (dest, base);
1984 return;
1985 }
1986 /* FALLTHRU */
1987
1988 case SYMBOL_SMALL_ABSOLUTE:
1989 case SYMBOL_TINY_ABSOLUTE:
1990 case SYMBOL_TLSLE12:
1991 case SYMBOL_TLSLE24:
1992 case SYMBOL_TLSLE32:
1993 case SYMBOL_TLSLE48:
1994 aarch64_load_symref_appropriately (dest, imm, sty);
1995 return;
1996
1997 default:
1998 gcc_unreachable ();
1999 }
2000 }
2001
2002 if (!CONST_INT_P (imm))
2003 {
2004 if (GET_CODE (imm) == HIGH)
2005 emit_insn (gen_rtx_SET (dest, imm));
2006 else
2007 {
2008 rtx mem = force_const_mem (mode, imm);
2009 gcc_assert (mem);
2010 emit_insn (gen_rtx_SET (dest, mem));
2011 }
2012
2013 return;
2014 }
2015
2016 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2017 }
2018
2019 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2020 temporary value if necessary. FRAME_RELATED_P should be true if
2021 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2022 to the generated instructions. If SCRATCHREG is known to hold
2023 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2024 immediate again.
2025
2026 Since this function may be used to adjust the stack pointer, we must
2027 ensure that it cannot cause transient stack deallocation (for example
2028 by first incrementing SP and then decrementing when adjusting by a
2029 large immediate). */
2030
2031 static void
2032 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2033 HOST_WIDE_INT delta, bool frame_related_p,
2034 bool emit_move_imm)
2035 {
2036 HOST_WIDE_INT mdelta = abs_hwi (delta);
2037 rtx this_rtx = gen_rtx_REG (mode, regnum);
2038 rtx_insn *insn;
2039
2040 if (!mdelta)
2041 return;
2042
2043 /* Single instruction adjustment. */
2044 if (aarch64_uimm12_shift (mdelta))
2045 {
2046 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2047 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2048 return;
2049 }
2050
2051   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2052      Only do this if mdelta is not representable as a move immediate,
2053      since adjusting with a single move is better.  */
2054 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2055 {
2056 HOST_WIDE_INT low_off = mdelta & 0xfff;
2057
2058 low_off = delta < 0 ? -low_off : low_off;
2059 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2060 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2061 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2062 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2063 return;
2064 }
2065
2066 /* Emit a move immediate if required and an addition/subtraction. */
2067 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2068 if (emit_move_imm)
2069 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2070 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2071 : gen_add2_insn (this_rtx, scratch_rtx));
2072 if (frame_related_p)
2073 {
2074 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2075 rtx adj = plus_constant (mode, this_rtx, delta);
2076       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2077 }
2078 }
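/* A worked example of the cases above: an adjustment of 0x123456 is
   neither a 12-bit shifted immediate nor a move immediate, so it takes
   the two-addition path and is split into
	add	reg, reg, #0x456
	add	reg, reg, #0x123000
   Larger deltas (or ones that are themselves move immediates) instead
   move abs (delta) into SCRATCHREG and perform a single register
   add or sub.  */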
2079
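/* Add DELTA to register REGNUM in mode MODE, using SCRATCHREG as a
   temporary if needed.  The adjustment is not marked frame-related.  */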
2080 static inline void
2081 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2082 HOST_WIDE_INT delta)
2083 {
2084 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2085 }
2086
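/* Add DELTA to the stack pointer, marking the instructions frame-related.
   SCRATCHREG and EMIT_MOVE_IMM are as for aarch64_add_constant_internal.  */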
2087 static inline void
2088 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2089 {
2090 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2091 true, emit_move_imm);
2092 }
2093
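/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  SCRATCHREG may be used to hold a
   temporary value.  */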
2094 static inline void
2095 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2096 {
2097 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2098 frame_related_p, true);
2099 }
2100
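/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */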
2101 static bool
2102 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2103 tree exp ATTRIBUTE_UNUSED)
2104 {
2105 /* Currently, always true. */
2106 return true;
2107 }
2108
2109 /* Implement TARGET_PASS_BY_REFERENCE. */
2110
2111 static bool
2112 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2113 machine_mode mode,
2114 const_tree type,
2115 bool named ATTRIBUTE_UNUSED)
2116 {
2117 HOST_WIDE_INT size;
2118 machine_mode dummymode;
2119 int nregs;
2120
2121 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2122 size = (mode == BLKmode && type)
2123 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2124
2125 /* Aggregates are passed by reference based on their size. */
2126 if (type && AGGREGATE_TYPE_P (type))
2127 {
2128 size = int_size_in_bytes (type);
2129 }
2130
2131   /* Variable sized arguments are always passed by reference.  */
2132 if (size < 0)
2133 return true;
2134
2135 /* Can this be a candidate to be passed in fp/simd register(s)? */
2136 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2137 &dummymode, &nregs,
2138 NULL))
2139 return false;
2140
2141 /* Arguments which are variable sized or larger than 2 registers are
2142      passed by reference unless they are a homogeneous floating-point
2143      aggregate.  */
2144 return size > 2 * UNITS_PER_WORD;
2145 }
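/* For example, a 24-byte structure of integers is passed by reference
   (it would need three registers), whereas a structure of three doubles,
   although also 24 bytes, is a homogeneous floating-point aggregate and
   is therefore a candidate for the SIMD/FP registers and passed by
   value.  */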
2146
2147 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2148 static bool
2149 aarch64_return_in_msb (const_tree valtype)
2150 {
2151 machine_mode dummy_mode;
2152 int dummy_int;
2153
2154 /* Never happens in little-endian mode. */
2155 if (!BYTES_BIG_ENDIAN)
2156 return false;
2157
2158 /* Only composite types smaller than or equal to 16 bytes can
2159 be potentially returned in registers. */
2160 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2161 || int_size_in_bytes (valtype) <= 0
2162 || int_size_in_bytes (valtype) > 16)
2163 return false;
2164
2165 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2166 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2167 is always passed/returned in the least significant bits of fp/simd
2168 register(s). */
2169 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2170 &dummy_mode, &dummy_int, NULL))
2171 return false;
2172
2173 return true;
2174 }
2175
2176 /* Implement TARGET_FUNCTION_VALUE.
2177 Define how to find the value returned by a function. */
2178
2179 static rtx
2180 aarch64_function_value (const_tree type, const_tree func,
2181 bool outgoing ATTRIBUTE_UNUSED)
2182 {
2183 machine_mode mode;
2184 int unsignedp;
2185 int count;
2186 machine_mode ag_mode;
2187
2188 mode = TYPE_MODE (type);
2189 if (INTEGRAL_TYPE_P (type))
2190 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2191
2192 if (aarch64_return_in_msb (type))
2193 {
2194 HOST_WIDE_INT size = int_size_in_bytes (type);
2195
2196 if (size % UNITS_PER_WORD != 0)
2197 {
2198 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2199 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2200 }
2201 }
2202
2203 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2204 &ag_mode, &count, NULL))
2205 {
2206 if (!aarch64_composite_type_p (type, mode))
2207 {
2208 gcc_assert (count == 1 && mode == ag_mode);
2209 return gen_rtx_REG (mode, V0_REGNUM);
2210 }
2211 else
2212 {
2213 int i;
2214 rtx par;
2215
2216 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2217 for (i = 0; i < count; i++)
2218 {
2219 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2220 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2221 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2222 XVECEXP (par, 0, i) = tmp;
2223 }
2224 return par;
2225 }
2226 }
2227 else
2228 return gen_rtx_REG (mode, R0_REGNUM);
2229 }
2230
2231 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2232    Return true if REGNO is the number of a hard register in which the values
2233    of a called function may come back.  */
2234
2235 static bool
2236 aarch64_function_value_regno_p (const unsigned int regno)
2237 {
2238 /* Maximum of 16 bytes can be returned in the general registers. Examples
2239 of 16-byte return values are: 128-bit integers and 16-byte small
2240 structures (excluding homogeneous floating-point aggregates). */
2241 if (regno == R0_REGNUM || regno == R1_REGNUM)
2242 return true;
2243
2244 /* Up to four fp/simd registers can return a function value, e.g. a
2245 homogeneous floating-point aggregate having four members. */
2246 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2247 return TARGET_FLOAT;
2248
2249 return false;
2250 }
2251
2252 /* Implement TARGET_RETURN_IN_MEMORY.
2253
2254 If the type T of the result of a function is such that
2255 void func (T arg)
2256 would require that arg be passed as a value in a register (or set of
2257 registers) according to the parameter passing rules, then the result
2258 is returned in the same registers as would be used for such an
2259 argument. */
2260
2261 static bool
2262 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2263 {
2264 HOST_WIDE_INT size;
2265 machine_mode ag_mode;
2266 int count;
2267
2268 if (!AGGREGATE_TYPE_P (type)
2269 && TREE_CODE (type) != COMPLEX_TYPE
2270 && TREE_CODE (type) != VECTOR_TYPE)
2271     /* Simple scalar types are always returned in registers.  */
2272 return false;
2273
2274 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2275 type,
2276 &ag_mode,
2277 &count,
2278 NULL))
2279 return false;
2280
2281   /* Types larger than 2 registers are returned in memory.  */
2282 size = int_size_in_bytes (type);
2283 return (size < 0 || size > 2 * UNITS_PER_WORD);
2284 }
2285
2286 static bool
2287 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2288 const_tree type, int *nregs)
2289 {
2290 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2291 return aarch64_vfp_is_call_or_return_candidate (mode,
2292 type,
2293 &pcum->aapcs_vfp_rmode,
2294 nregs,
2295 NULL);
2296 }
2297
2298 /* Given MODE and TYPE of a function argument, return the alignment in
2299 bits. The idea is to suppress any stronger alignment requested by
2300 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2301 This is a helper function for local use only. */
2302
2303 static unsigned int
2304 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2305 {
2306 if (!type)
2307 return GET_MODE_ALIGNMENT (mode);
2308
2309 if (integer_zerop (TYPE_SIZE (type)))
2310 return 0;
2311
2312 gcc_assert (TYPE_MODE (type) == mode);
2313
2314 if (!AGGREGATE_TYPE_P (type))
2315 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2316
2317 if (TREE_CODE (type) == ARRAY_TYPE)
2318 return TYPE_ALIGN (TREE_TYPE (type));
2319
2320 unsigned int alignment = 0;
2321 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2322 if (TREE_CODE (field) == FIELD_DECL)
2323 alignment = std::max (alignment, DECL_ALIGN (field));
2324
2325 return alignment;
2326 }
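/* For instance, for
	struct s { int i; double d; };
   the field walk above yields max (32, 64) == 64 bits, independently of
   any stronger alignment the user may have requested on the struct type
   itself (see the comment before this function).  */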
2327
2328 /* Lay out a function argument according to the AAPCS64 rules.  The rule
2329    numbers refer to the corresponding rules in the AAPCS64 document.  */
2330
2331 static void
2332 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2333 const_tree type,
2334 bool named ATTRIBUTE_UNUSED)
2335 {
2336 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2337 int ncrn, nvrn, nregs;
2338 bool allocate_ncrn, allocate_nvrn;
2339 HOST_WIDE_INT size;
2340
2341 /* We need to do this once per argument. */
2342 if (pcum->aapcs_arg_processed)
2343 return;
2344
2345 pcum->aapcs_arg_processed = true;
2346
2347   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
2348 size
2349 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2350 UNITS_PER_WORD);
2351
2352 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2353 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2354 mode,
2355 type,
2356 &nregs);
2357
2358   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
2359      reliable.  The code below therefore handles passing by SIMD/FP registers first.  */
2360
2361 nvrn = pcum->aapcs_nvrn;
2362
2363   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2364      and homogeneous short-vector aggregates (HVA).  */
2365 if (allocate_nvrn)
2366 {
2367 if (!TARGET_FLOAT)
2368 aarch64_err_no_fpadvsimd (mode, "argument");
2369
2370 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2371 {
2372 pcum->aapcs_nextnvrn = nvrn + nregs;
2373 if (!aarch64_composite_type_p (type, mode))
2374 {
2375 gcc_assert (nregs == 1);
2376 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2377 }
2378 else
2379 {
2380 rtx par;
2381 int i;
2382 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2383 for (i = 0; i < nregs; i++)
2384 {
2385 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2386 V0_REGNUM + nvrn + i);
2387 tmp = gen_rtx_EXPR_LIST
2388 (VOIDmode, tmp,
2389 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2390 XVECEXP (par, 0, i) = tmp;
2391 }
2392 pcum->aapcs_reg = par;
2393 }
2394 return;
2395 }
2396 else
2397 {
2398 /* C.3 NSRN is set to 8. */
2399 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2400 goto on_stack;
2401 }
2402 }
2403
2404 ncrn = pcum->aapcs_ncrn;
2405 nregs = size / UNITS_PER_WORD;
2406
2407   /* C6 - C9, though the sign and zero extension semantics are
2408      handled elsewhere.  This is the case where the argument fits
2409      entirely in general registers.  */
2410 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2411 {
2412
2413 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2414
2415       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2416 	  rounded up to the next even number.  */
2417 if (nregs == 2
2418 && ncrn % 2
2419 	  /* The == 16 * BITS_PER_UNIT comparison is used instead of
2420 	     >= 16 * BITS_PER_UNIT because for alignments greater than
2421 	     16 * BITS_PER_UNIT nregs would be > 2, so the argument would be
2422 	     passed by reference rather than by value.  */
2423 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2424 {
2425 ++ncrn;
2426 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2427 }
2428
2429 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2430 A reg is still generated for it, but the caller should be smart
2431 enough not to use it. */
2432 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2433 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2434 else
2435 {
2436 rtx par;
2437 int i;
2438
2439 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2440 for (i = 0; i < nregs; i++)
2441 {
2442 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2443 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2444 GEN_INT (i * UNITS_PER_WORD));
2445 XVECEXP (par, 0, i) = tmp;
2446 }
2447 pcum->aapcs_reg = par;
2448 }
2449
2450 pcum->aapcs_nextncrn = ncrn + nregs;
2451 return;
2452 }
2453
2454 /* C.11 */
2455 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2456
2457   /* The argument is passed on the stack; record the number of words needed
2458      for this argument and align the total size if necessary.  */
2459 on_stack:
2460 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2461
2462 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2463 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2464 16 / UNITS_PER_WORD);
2465 return;
2466 }
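/* As an example of rule C.8 above: a __int128 argument (16-byte
   alignment, nregs == 2) arriving when NCRN is odd causes NCRN to be
   rounded up first, so the value is passed in an even/odd register pair
   and the skipped register is left unused for this argument list.  */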
2467
2468 /* Implement TARGET_FUNCTION_ARG. */
2469
2470 static rtx
2471 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2472 const_tree type, bool named)
2473 {
2474 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2475 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2476
2477 if (mode == VOIDmode)
2478 return NULL_RTX;
2479
2480 aarch64_layout_arg (pcum_v, mode, type, named);
2481 return pcum->aapcs_reg;
2482 }
2483
2484 void
2485 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2486 const_tree fntype ATTRIBUTE_UNUSED,
2487 rtx libname ATTRIBUTE_UNUSED,
2488 const_tree fndecl ATTRIBUTE_UNUSED,
2489 unsigned n_named ATTRIBUTE_UNUSED)
2490 {
2491 pcum->aapcs_ncrn = 0;
2492 pcum->aapcs_nvrn = 0;
2493 pcum->aapcs_nextncrn = 0;
2494 pcum->aapcs_nextnvrn = 0;
2495 pcum->pcs_variant = ARM_PCS_AAPCS64;
2496 pcum->aapcs_reg = NULL_RTX;
2497 pcum->aapcs_arg_processed = false;
2498 pcum->aapcs_stack_words = 0;
2499 pcum->aapcs_stack_size = 0;
2500
2501 if (!TARGET_FLOAT
2502 && fndecl && TREE_PUBLIC (fndecl)
2503 && fntype && fntype != error_mark_node)
2504 {
2505 const_tree type = TREE_TYPE (fntype);
2506 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2507 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2508 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2509 &mode, &nregs, NULL))
2510 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2511 }
2512 return;
2513 }
2514
2515 static void
2516 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2517 machine_mode mode,
2518 const_tree type,
2519 bool named)
2520 {
2521 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2522 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2523 {
2524 aarch64_layout_arg (pcum_v, mode, type, named);
2525 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2526 != (pcum->aapcs_stack_words != 0));
2527 pcum->aapcs_arg_processed = false;
2528 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2529 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2530 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2531 pcum->aapcs_stack_words = 0;
2532 pcum->aapcs_reg = NULL_RTX;
2533 }
2534 }
2535
2536 bool
2537 aarch64_function_arg_regno_p (unsigned regno)
2538 {
2539 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2540 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2541 }
2542
2543 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2544 PARM_BOUNDARY bits of alignment, but will be given anything up
2545 to STACK_BOUNDARY bits if the type requires it. This makes sure
2546 that both before and after the layout of each argument, the Next
2547 Stacked Argument Address (NSAA) will have a minimum alignment of
2548 8 bytes. */
2549
2550 static unsigned int
2551 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2552 {
2553 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2554 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2555 }
2556
2557 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2558
2559 Return true if an argument passed on the stack should be padded upwards,
2560 i.e. if the least-significant byte of the stack slot has useful data.
2561
2562 Small aggregate types are placed in the lowest memory address.
2563
2564 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2565
2566 bool
2567 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2568 {
2569 /* On little-endian targets, the least significant byte of every stack
2570 argument is passed at the lowest byte address of the stack slot. */
2571 if (!BYTES_BIG_ENDIAN)
2572 return true;
2573
2574 /* Otherwise, integral, floating-point and pointer types are padded downward:
2575 the least significant byte of a stack argument is passed at the highest
2576 byte address of the stack slot. */
2577 if (type
2578 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2579 || POINTER_TYPE_P (type))
2580 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2581 return false;
2582
2583 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2584 return true;
2585 }
2586
2587 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2588
2589    It specifies padding for the last (possibly the only)
2590    element of a block move between registers and memory.  Assuming
2591    the block is in memory, padding upward means that
2592    the last element is padded after its most significant byte,
2593    while with downward padding the last element is padded on
2594    its least significant byte side.
2595
2596 Small aggregates and small complex types are always padded
2597 upwards.
2598
2599 We don't need to worry about homogeneous floating-point or
2600 short-vector aggregates; their move is not affected by the
2601 padding direction determined here. Regardless of endianness,
2602 each element of such an aggregate is put in the least
2603 significant bits of a fp/simd register.
2604
2605 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2606 register has useful data, and return the opposite if the most
2607 significant byte does. */
2608
2609 bool
2610 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2611 bool first ATTRIBUTE_UNUSED)
2612 {
2613
2614 /* Small composite types are always padded upward. */
2615 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2616 {
2617 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2618 : GET_MODE_SIZE (mode));
2619 if (size < 2 * UNITS_PER_WORD)
2620 return true;
2621 }
2622
2623 /* Otherwise, use the default padding. */
2624 return !BYTES_BIG_ENDIAN;
2625 }
2626
2627 static machine_mode
2628 aarch64_libgcc_cmp_return_mode (void)
2629 {
2630 return SImode;
2631 }
2632
2633 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2634
2635 /* We use the 12-bit shifted immediate arithmetic instructions so values
2636    must be a multiple of (1 << 12), i.e. 4096.  */
2637 #define ARITH_FACTOR 4096
2638
2639 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2640 #error Cannot use simple address calculation for stack probing
2641 #endif
2642
2643 /* The pair of scratch registers used for stack probing. */
2644 #define PROBE_STACK_FIRST_REG 9
2645 #define PROBE_STACK_SECOND_REG 10
2646
2647 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2648 inclusive. These are offsets from the current stack pointer. */
2649
2650 static void
2651 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2652 {
2653 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2654
2655 /* See the same assertion on PROBE_INTERVAL above. */
2656 gcc_assert ((first % ARITH_FACTOR) == 0);
2657
2658 /* See if we have a constant small number of probes to generate. If so,
2659 that's the easy case. */
2660 if (size <= PROBE_INTERVAL)
2661 {
2662 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2663
2664 emit_set_insn (reg1,
2665 plus_constant (Pmode,
2666 stack_pointer_rtx, -(first + base)));
2667 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2668 }
2669
2670 /* The run-time loop is made up of 8 insns in the generic case while the
2671      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
2672 else if (size <= 4 * PROBE_INTERVAL)
2673 {
2674 HOST_WIDE_INT i, rem;
2675
2676 emit_set_insn (reg1,
2677 plus_constant (Pmode,
2678 stack_pointer_rtx,
2679 -(first + PROBE_INTERVAL)));
2680 emit_stack_probe (reg1);
2681
2682 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2683 it exceeds SIZE. If only two probes are needed, this will not
2684 generate any code. Then probe at FIRST + SIZE. */
2685 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2686 {
2687 emit_set_insn (reg1,
2688 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2689 emit_stack_probe (reg1);
2690 }
2691
2692 rem = size - (i - PROBE_INTERVAL);
2693 if (rem > 256)
2694 {
2695 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2696
2697 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2698 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2699 }
2700 else
2701 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2702 }
2703
2704 /* Otherwise, do the same as above, but in a loop. Note that we must be
2705 extra careful with variables wrapping around because we might be at
2706 the very top (or the very bottom) of the address space and we have
2707 to be able to handle this case properly; in particular, we use an
2708 equality test for the loop condition. */
2709 else
2710 {
2711 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2712
2713 /* Step 1: round SIZE to the previous multiple of the interval. */
2714
2715 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2716
2717
2718 /* Step 2: compute initial and final value of the loop counter. */
2719
2720 /* TEST_ADDR = SP + FIRST. */
2721 emit_set_insn (reg1,
2722 plus_constant (Pmode, stack_pointer_rtx, -first));
2723
2724 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2725 emit_set_insn (reg2,
2726 plus_constant (Pmode, stack_pointer_rtx,
2727 -(first + rounded_size)));
2728
2729
2730 /* Step 3: the loop
2731
2732 do
2733 {
2734 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2735 probe at TEST_ADDR
2736 }
2737 while (TEST_ADDR != LAST_ADDR)
2738
2739 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2740 until it is equal to ROUNDED_SIZE. */
2741
2742 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2743
2744
2745 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2746 that SIZE is equal to ROUNDED_SIZE. */
2747
2748 if (size != rounded_size)
2749 {
2750 HOST_WIDE_INT rem = size - rounded_size;
2751
2752 if (rem > 256)
2753 {
2754 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2755
2756 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2757 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2758 }
2759 else
2760 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2761 }
2762 }
2763
2764 /* Make sure nothing is scheduled before we are done. */
2765 emit_insn (gen_blockage ());
2766 }
2767
2768 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2769 absolute addresses. */
2770
2771 const char *
2772 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2773 {
2774 static int labelno = 0;
2775 char loop_lab[32];
2776 rtx xops[2];
2777
2778 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2779
2780 /* Loop. */
2781 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2782
2783 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2784 xops[0] = reg1;
2785 xops[1] = GEN_INT (PROBE_INTERVAL);
2786 output_asm_insn ("sub\t%0, %0, %1", xops);
2787
2788 /* Probe at TEST_ADDR. */
2789 output_asm_insn ("str\txzr, [%0]", xops);
2790
2791 /* Test if TEST_ADDR == LAST_ADDR. */
2792 xops[1] = reg2;
2793 output_asm_insn ("cmp\t%0, %1", xops);
2794
2795 /* Branch. */
2796 fputs ("\tb.ne\t", asm_out_file);
2797 assemble_name_raw (asm_out_file, loop_lab);
2798 fputc ('\n', asm_out_file);
2799
2800 return "";
2801 }
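/* With the default 4 KiB probe interval and the scratch registers chosen
   above (x9 and x10), the emitted loop looks roughly like:
	.LPSRL0:
		sub	x9, x9, #4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
   i.e. one store of xzr per PROBE_INTERVAL until TEST_ADDR reaches
   LAST_ADDR.  */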
2802
2803 static bool
2804 aarch64_frame_pointer_required (void)
2805 {
2806 /* In aarch64_override_options_after_change
2807 flag_omit_leaf_frame_pointer turns off the frame pointer by
2808 default. Turn it back on now if we've not got a leaf
2809 function. */
2810 if (flag_omit_leaf_frame_pointer
2811 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2812 return true;
2813
2814 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2815 if (crtl->calls_eh_return)
2816 return true;
2817
2818 return false;
2819 }
2820
2821 /* Mark the registers that need to be saved by the callee and calculate
2822 the size of the callee-saved registers area and frame record (both FP
2823 and LR may be omitted). */
2824 static void
2825 aarch64_layout_frame (void)
2826 {
2827 HOST_WIDE_INT offset = 0;
2828 int regno, last_fp_reg = INVALID_REGNUM;
2829
2830 if (reload_completed && cfun->machine->frame.laid_out)
2831 return;
2832
2833 #define SLOT_NOT_REQUIRED (-2)
2834 #define SLOT_REQUIRED (-1)
2835
2836 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2837 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2838
2839 /* First mark all the registers that really need to be saved... */
2840 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2841 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2842
2843 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2844 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2845
2846 /* ... that includes the eh data registers (if needed)... */
2847 if (crtl->calls_eh_return)
2848 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2849 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2850 = SLOT_REQUIRED;
2851
2852 /* ... and any callee saved register that dataflow says is live. */
2853 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2854 if (df_regs_ever_live_p (regno)
2855 && (regno == R30_REGNUM
2856 || !call_used_regs[regno]))
2857 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2858
2859 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2860 if (df_regs_ever_live_p (regno)
2861 && !call_used_regs[regno])
2862 {
2863 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2864 last_fp_reg = regno;
2865 }
2866
2867 if (frame_pointer_needed)
2868 {
2869 /* FP and LR are placed in the linkage record. */
2870 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2871 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2872 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2873 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2874 offset += 2 * UNITS_PER_WORD;
2875 }
2876
2877 /* Now assign stack slots for them. */
2878 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2879 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2880 {
2881 cfun->machine->frame.reg_offset[regno] = offset;
2882 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2883 cfun->machine->frame.wb_candidate1 = regno;
2884 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2885 cfun->machine->frame.wb_candidate2 = regno;
2886 offset += UNITS_PER_WORD;
2887 }
2888
2889 HOST_WIDE_INT max_int_offset = offset;
2890 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2891 bool has_align_gap = offset != max_int_offset;
2892
2893 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2894 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2895 {
2896 /* If there is an alignment gap between integer and fp callee-saves,
2897 allocate the last fp register to it if possible. */
2898 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2899 {
2900 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2901 break;
2902 }
2903
2904 cfun->machine->frame.reg_offset[regno] = offset;
2905 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2906 cfun->machine->frame.wb_candidate1 = regno;
2907 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2908 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2909 cfun->machine->frame.wb_candidate2 = regno;
2910 offset += UNITS_PER_WORD;
2911 }
2912
2913 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2914
2915 cfun->machine->frame.saved_regs_size = offset;
2916
2917 HOST_WIDE_INT varargs_and_saved_regs_size
2918 = offset + cfun->machine->frame.saved_varargs_size;
2919
2920 cfun->machine->frame.hard_fp_offset
2921 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2922 STACK_BOUNDARY / BITS_PER_UNIT);
2923
2924 cfun->machine->frame.frame_size
2925 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2926 + crtl->outgoing_args_size,
2927 STACK_BOUNDARY / BITS_PER_UNIT);
2928
2929 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2930
2931 cfun->machine->frame.initial_adjust = 0;
2932 cfun->machine->frame.final_adjust = 0;
2933 cfun->machine->frame.callee_adjust = 0;
2934 cfun->machine->frame.callee_offset = 0;
2935
2936 HOST_WIDE_INT max_push_offset = 0;
2937 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2938 max_push_offset = 512;
2939 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2940 max_push_offset = 256;
2941
2942 if (cfun->machine->frame.frame_size < max_push_offset
2943 && crtl->outgoing_args_size == 0)
2944 {
2945 /* Simple, small frame with no outgoing arguments:
2946 stp reg1, reg2, [sp, -frame_size]!
2947 stp reg3, reg4, [sp, 16] */
2948 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2949 }
2950 else if ((crtl->outgoing_args_size
2951 + cfun->machine->frame.saved_regs_size < 512)
2952 && !(cfun->calls_alloca
2953 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2954 {
2955 /* Frame with small outgoing arguments:
2956 sub sp, sp, frame_size
2957 stp reg1, reg2, [sp, outgoing_args_size]
2958 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2959 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2960 cfun->machine->frame.callee_offset
2961 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2962 }
2963 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2964 {
2965 /* Frame with large outgoing arguments but a small local area:
2966 stp reg1, reg2, [sp, -hard_fp_offset]!
2967 stp reg3, reg4, [sp, 16]
2968 sub sp, sp, outgoing_args_size */
2969 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2970 cfun->machine->frame.final_adjust
2971 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2972 }
2973 else if (!frame_pointer_needed
2974 && varargs_and_saved_regs_size < max_push_offset)
2975 {
2976 /* Frame with large local area and outgoing arguments (this pushes the
2977 callee-saves first, followed by the locals and outgoing area):
2978 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2979 stp reg3, reg4, [sp, 16]
2980 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2981 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2982 cfun->machine->frame.final_adjust
2983 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2984 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2985 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2986 }
2987 else
2988 {
2989 /* Frame with large local area and outgoing arguments using frame pointer:
2990 sub sp, sp, hard_fp_offset
2991 stp x29, x30, [sp, 0]
2992 add x29, sp, 0
2993 stp reg3, reg4, [sp, 16]
2994 sub sp, sp, outgoing_args_size */
2995 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2996 cfun->machine->frame.final_adjust
2997 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2998 }
2999
3000 cfun->machine->frame.laid_out = true;
3001 }
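/* For example, a 48-byte frame with no outgoing arguments is typically
   handled by the first case above: callee_adjust is set to 48, so the
   whole frame is allocated by the write-back store
	stp	reg1, reg2, [sp, -48]!
   while initial_adjust and final_adjust stay 0.  */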
3002
3003 /* Return true if the register REGNO is saved on entry to
3004 the current function. */
3005
3006 static bool
3007 aarch64_register_saved_on_entry (int regno)
3008 {
3009 return cfun->machine->frame.reg_offset[regno] >= 0;
3010 }
3011
3012 /* Return the next register, from REGNO up to LIMIT, that the callee
3013    needs to save.  */
3014
3015 static unsigned
3016 aarch64_next_callee_save (unsigned regno, unsigned limit)
3017 {
3018 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3019 regno ++;
3020 return regno;
3021 }
3022
3023 /* Push register number REGNO of mode MODE to the stack, using a write-back
3024    store that decrements the stack pointer by ADJUSTMENT.  */
3025
3026 static void
3027 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3028 HOST_WIDE_INT adjustment)
3029 {
3030 rtx base_rtx = stack_pointer_rtx;
3031 rtx insn, reg, mem;
3032
3033 reg = gen_rtx_REG (mode, regno);
3034 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3035 plus_constant (Pmode, base_rtx, -adjustment));
3036 mem = gen_rtx_MEM (mode, mem);
3037
3038 insn = emit_move_insn (mem, reg);
3039 RTX_FRAME_RELATED_P (insn) = 1;
3040 }
3041
3042 /* Generate and return an instruction to store the pair of registers
3043 REG and REG2 of mode MODE to location BASE with write-back adjusting
3044 the stack location BASE by ADJUSTMENT. */
3045
3046 static rtx
3047 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3048 HOST_WIDE_INT adjustment)
3049 {
3050 switch (mode)
3051 {
3052 case DImode:
3053 return gen_storewb_pairdi_di (base, base, reg, reg2,
3054 GEN_INT (-adjustment),
3055 GEN_INT (UNITS_PER_WORD - adjustment));
3056 case DFmode:
3057 return gen_storewb_pairdf_di (base, base, reg, reg2,
3058 GEN_INT (-adjustment),
3059 GEN_INT (UNITS_PER_WORD - adjustment));
3060 default:
3061 gcc_unreachable ();
3062 }
3063 }
3064
3065 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3066 stack pointer by ADJUSTMENT. */
3067
3068 static void
3069 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3070 {
3071 rtx_insn *insn;
3072 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3073
3074 if (regno2 == INVALID_REGNUM)
3075 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3076
3077 rtx reg1 = gen_rtx_REG (mode, regno1);
3078 rtx reg2 = gen_rtx_REG (mode, regno2);
3079
3080 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3081 reg2, adjustment));
3082 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3083 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3084 RTX_FRAME_RELATED_P (insn) = 1;
3085 }
3086
3087 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3088    adjusting BASE by ADJUSTMENT afterwards.  */
3089
3090 static rtx
3091 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3092 HOST_WIDE_INT adjustment)
3093 {
3094 switch (mode)
3095 {
3096 case DImode:
3097 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3098 GEN_INT (UNITS_PER_WORD));
3099 case DFmode:
3100 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3101 GEN_INT (UNITS_PER_WORD));
3102 default:
3103 gcc_unreachable ();
3104 }
3105 }
3106
3107 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3108 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3109 into CFI_OPS. */
3110
3111 static void
3112 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3113 rtx *cfi_ops)
3114 {
3115 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3116 rtx reg1 = gen_rtx_REG (mode, regno1);
3117
3118 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3119
3120 if (regno2 == INVALID_REGNUM)
3121 {
3122 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3123 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3124 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3125 }
3126 else
3127 {
3128 rtx reg2 = gen_rtx_REG (mode, regno2);
3129 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3130 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3131 reg2, adjustment));
3132 }
3133 }
3134
3135 /* Generate and return a store pair instruction of mode MODE to store
3136 register REG1 to MEM1 and register REG2 to MEM2. */
3137
3138 static rtx
3139 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3140 rtx reg2)
3141 {
3142 switch (mode)
3143 {
3144 case DImode:
3145 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3146
3147 case DFmode:
3148 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3149
3150 default:
3151 gcc_unreachable ();
3152 }
3153 }
3154
3155 /* Generate and return a load pair instruction of mode MODE to load register
3156    REG1 from MEM1 and register REG2 from MEM2.  */
3157
3158 static rtx
3159 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3160 rtx mem2)
3161 {
3162 switch (mode)
3163 {
3164 case DImode:
3165 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3166
3167 case DFmode:
3168 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3169
3170 default:
3171 gcc_unreachable ();
3172 }
3173 }
3174
3175 /* Return TRUE if return address signing should be enabled for the current
3176 function, otherwise return FALSE. */
3177
3178 bool
3179 aarch64_return_address_signing_enabled (void)
3180 {
3181   /* This function should only be called after the frame is laid out.  */
3182 gcc_assert (cfun->machine->frame.laid_out);
3183
3184   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
3185      function if its LR is pushed onto the stack.  */
3186 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3187 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3188 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3189 }
3190
3191 /* Emit code to save the callee-saved registers from register number START
3192 to LIMIT to the stack at the location starting at offset START_OFFSET,
3193 skipping any write-back candidates if SKIP_WB is true. */
3194
3195 static void
3196 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3197 unsigned start, unsigned limit, bool skip_wb)
3198 {
3199 rtx_insn *insn;
3200 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3201 ? gen_frame_mem : gen_rtx_MEM);
3202 unsigned regno;
3203 unsigned regno2;
3204
3205 for (regno = aarch64_next_callee_save (start, limit);
3206 regno <= limit;
3207 regno = aarch64_next_callee_save (regno + 1, limit))
3208 {
3209 rtx reg, mem;
3210 HOST_WIDE_INT offset;
3211
3212 if (skip_wb
3213 && (regno == cfun->machine->frame.wb_candidate1
3214 || regno == cfun->machine->frame.wb_candidate2))
3215 continue;
3216
3217 if (cfun->machine->reg_is_wrapped_separately[regno])
3218 continue;
3219
3220 reg = gen_rtx_REG (mode, regno);
3221 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3222 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3223 offset));
3224
3225 regno2 = aarch64_next_callee_save (regno + 1, limit);
3226
3227 if (regno2 <= limit
3228 && !cfun->machine->reg_is_wrapped_separately[regno2]
3229 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3230 == cfun->machine->frame.reg_offset[regno2]))
3231
3232 {
3233 rtx reg2 = gen_rtx_REG (mode, regno2);
3234 rtx mem2;
3235
3236 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3237 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3238 offset));
3239 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3240 reg2));
3241
3242 /* The first part of a frame-related parallel insn is
3243 always assumed to be relevant to the frame
3244 	     calculations; subsequent parts are only
3245 frame-related if explicitly marked. */
3246 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3247 regno = regno2;
3248 }
3249 else
3250 insn = emit_move_insn (mem, reg);
3251
3252 RTX_FRAME_RELATED_P (insn) = 1;
3253 }
3254 }
3255
3256 /* Emit code to restore the callee registers of mode MODE from register
3257 number START up to and including LIMIT. Restore from the stack offset
3258 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3259 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3260
3261 static void
3262 aarch64_restore_callee_saves (machine_mode mode,
3263 HOST_WIDE_INT start_offset, unsigned start,
3264 unsigned limit, bool skip_wb, rtx *cfi_ops)
3265 {
3266 rtx base_rtx = stack_pointer_rtx;
3267 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3268 ? gen_frame_mem : gen_rtx_MEM);
3269 unsigned regno;
3270 unsigned regno2;
3271 HOST_WIDE_INT offset;
3272
3273 for (regno = aarch64_next_callee_save (start, limit);
3274 regno <= limit;
3275 regno = aarch64_next_callee_save (regno + 1, limit))
3276 {
3277 if (cfun->machine->reg_is_wrapped_separately[regno])
3278 continue;
3279
3280 rtx reg, mem;
3281
3282 if (skip_wb
3283 && (regno == cfun->machine->frame.wb_candidate1
3284 || regno == cfun->machine->frame.wb_candidate2))
3285 continue;
3286
3287 reg = gen_rtx_REG (mode, regno);
3288 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3289 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3290
3291 regno2 = aarch64_next_callee_save (regno + 1, limit);
3292
3293 if (regno2 <= limit
3294 && !cfun->machine->reg_is_wrapped_separately[regno2]
3295 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3296 == cfun->machine->frame.reg_offset[regno2]))
3297 {
3298 rtx reg2 = gen_rtx_REG (mode, regno2);
3299 rtx mem2;
3300
3301 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3302 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3303 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3304
3305 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3306 regno = regno2;
3307 }
3308 else
3309 emit_move_insn (reg, mem);
3310 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3311 }
3312 }
3313
3314 static inline bool
3315 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3316 HOST_WIDE_INT offset)
3317 {
3318 return offset >= -256 && offset < 256;
3319 }
3320
3321 static inline bool
3322 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3323 {
3324 return (offset >= 0
3325 && offset < 4096 * GET_MODE_SIZE (mode)
3326 && offset % GET_MODE_SIZE (mode) == 0);
3327 }
3328
3329 bool
3330 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3331 {
3332 return (offset >= -64 * GET_MODE_SIZE (mode)
3333 && offset < 64 * GET_MODE_SIZE (mode)
3334 && offset % GET_MODE_SIZE (mode) == 0);
3335 }
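/* Taking DImode (8-byte) accesses as an example, the predicates above
   accept:
     9-bit signed unscaled (LDUR/STUR):	-256 .. 255, any byte offset
     12-bit unsigned scaled (LDR/STR):	   0 .. 32760, multiples of 8
     7-bit signed scaled (LDP/STP):	-512 .. 504, multiples of 8  */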
3336
3337 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3338
3339 static sbitmap
3340 aarch64_get_separate_components (void)
3341 {
3342 aarch64_layout_frame ();
3343
3344 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3345 bitmap_clear (components);
3346
3347 /* The registers we need saved to the frame. */
3348 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3349 if (aarch64_register_saved_on_entry (regno))
3350 {
3351 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3352 if (!frame_pointer_needed)
3353 offset += cfun->machine->frame.frame_size
3354 - cfun->machine->frame.hard_fp_offset;
3355 /* Check that we can access the stack slot of the register with one
3356 direct load with no adjustments needed. */
3357 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3358 bitmap_set_bit (components, regno);
3359 }
3360
3361 /* Don't mess with the hard frame pointer. */
3362 if (frame_pointer_needed)
3363 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3364
3365 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3366 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3367 /* If aarch64_layout_frame has chosen registers to store/restore with
3368 writeback don't interfere with them to avoid having to output explicit
3369 stack adjustment instructions. */
3370 if (reg2 != INVALID_REGNUM)
3371 bitmap_clear_bit (components, reg2);
3372 if (reg1 != INVALID_REGNUM)
3373 bitmap_clear_bit (components, reg1);
3374
3375 bitmap_clear_bit (components, LR_REGNUM);
3376 bitmap_clear_bit (components, SP_REGNUM);
3377
3378 return components;
3379 }
3380
3381 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3382
3383 static sbitmap
3384 aarch64_components_for_bb (basic_block bb)
3385 {
3386 bitmap in = DF_LIVE_IN (bb);
3387 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3388 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3389
3390 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3391 bitmap_clear (components);
3392
3393 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3394 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3395 if ((!call_used_regs[regno])
3396 && (bitmap_bit_p (in, regno)
3397 || bitmap_bit_p (gen, regno)
3398 || bitmap_bit_p (kill, regno)))
3399 bitmap_set_bit (components, regno);
3400
3401 return components;
3402 }
3403
3404 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3405 Nothing to do for aarch64. */
3406
3407 static void
3408 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3409 {
3410 }
3411
3412 /* Return the next set bit in BMP from START onwards. Return the total number
3413 of bits in BMP if no set bit is found at or after START. */
3414
3415 static unsigned int
3416 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3417 {
3418 unsigned int nbits = SBITMAP_SIZE (bmp);
3419 if (start == nbits)
3420 return start;
3421
3422 gcc_assert (start < nbits);
3423 for (unsigned int i = start; i < nbits; i++)
3424 if (bitmap_bit_p (bmp, i))
3425 return i;
3426
3427 return nbits;
3428 }
3429
3430 /* Do the work for aarch64_emit_prologue_components and
3431 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3432 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3433 for these components or the epilogue sequence. That is, it determines
3434 whether we should emit stores or loads and what kind of CFA notes to attach
3435 to the insns. Otherwise the logic for the two sequences is very
3436 similar. */
3437
3438 static void
3439 aarch64_process_components (sbitmap components, bool prologue_p)
3440 {
3441 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3442 ? HARD_FRAME_POINTER_REGNUM
3443 : STACK_POINTER_REGNUM);
3444
3445 unsigned last_regno = SBITMAP_SIZE (components);
3446 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3447 rtx_insn *insn = NULL;
3448
3449 while (regno != last_regno)
3450 {
3451       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3452 	 so DFmode for the vector registers is enough.  */
3453 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3454 rtx reg = gen_rtx_REG (mode, regno);
3455 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3456 if (!frame_pointer_needed)
3457 offset += cfun->machine->frame.frame_size
3458 - cfun->machine->frame.hard_fp_offset;
3459 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3460 rtx mem = gen_frame_mem (mode, addr);
3461
3462 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3463 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3464 /* No more registers to handle after REGNO.
3465 Emit a single save/restore and exit. */
3466 if (regno2 == last_regno)
3467 {
3468 insn = emit_insn (set);
3469 RTX_FRAME_RELATED_P (insn) = 1;
3470 if (prologue_p)
3471 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3472 else
3473 add_reg_note (insn, REG_CFA_RESTORE, reg);
3474 break;
3475 }
3476
3477 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3478 /* The next register is not of the same class or its offset is not
3479 mergeable with the current one into a pair. */
3480 if (!satisfies_constraint_Ump (mem)
3481 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3482 || (offset2 - cfun->machine->frame.reg_offset[regno])
3483 != GET_MODE_SIZE (mode))
3484 {
3485 insn = emit_insn (set);
3486 RTX_FRAME_RELATED_P (insn) = 1;
3487 if (prologue_p)
3488 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3489 else
3490 add_reg_note (insn, REG_CFA_RESTORE, reg);
3491
3492 regno = regno2;
3493 continue;
3494 }
3495
3496 /* REGNO2 can be saved/restored in a pair with REGNO. */
3497 rtx reg2 = gen_rtx_REG (mode, regno2);
3498 if (!frame_pointer_needed)
3499 offset2 += cfun->machine->frame.frame_size
3500 - cfun->machine->frame.hard_fp_offset;
3501 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3502 rtx mem2 = gen_frame_mem (mode, addr2);
3503 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3504 : gen_rtx_SET (reg2, mem2);
3505
3506 if (prologue_p)
3507 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3508 else
3509 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3510
3511 RTX_FRAME_RELATED_P (insn) = 1;
3512 if (prologue_p)
3513 {
3514 add_reg_note (insn, REG_CFA_OFFSET, set);
3515 add_reg_note (insn, REG_CFA_OFFSET, set2);
3516 }
3517 else
3518 {
3519 add_reg_note (insn, REG_CFA_RESTORE, reg);
3520 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3521 }
3522
3523 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3524 }
3525 }
3526
3527 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3528
3529 static void
3530 aarch64_emit_prologue_components (sbitmap components)
3531 {
3532 aarch64_process_components (components, true);
3533 }
3534
3535 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3536
3537 static void
3538 aarch64_emit_epilogue_components (sbitmap components)
3539 {
3540 aarch64_process_components (components, false);
3541 }
3542
3543 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3544
3545 static void
3546 aarch64_set_handled_components (sbitmap components)
3547 {
3548 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3549 if (bitmap_bit_p (components, regno))
3550 cfun->machine->reg_is_wrapped_separately[regno] = true;
3551 }
3552
3553 /* AArch64 stack frames generated by this compiler look like:
3554
3555 +-------------------------------+
3556 | |
3557 | incoming stack arguments |
3558 | |
3559 +-------------------------------+
3560 | | <-- incoming stack pointer (aligned)
3561 | callee-allocated save area |
3562 | for register varargs |
3563 | |
3564 +-------------------------------+
3565 | local variables | <-- frame_pointer_rtx
3566 | |
3567 +-------------------------------+
3568 | padding0 | \
3569 +-------------------------------+ |
3570 | callee-saved registers | | frame.saved_regs_size
3571 +-------------------------------+ |
3572 | LR' | |
3573 +-------------------------------+ |
3574 | FP' | / <- hard_frame_pointer_rtx (aligned)
3575 +-------------------------------+
3576 | dynamic allocation |
3577 +-------------------------------+
3578 | padding |
3579 +-------------------------------+
3580 | outgoing stack arguments | <-- arg_pointer
3581 | |
3582 +-------------------------------+
3583 | | <-- stack_pointer_rtx (aligned)
3584
3585 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3586 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3587 unchanged. */
3588
3589 /* Generate the prologue instructions for entry into a function.
3590    Establish the stack frame by decreasing the stack pointer by a
3591    properly calculated size and, if necessary, create a frame record
3592    filled with the values of LR and the previous frame pointer.  The
3593 current FP is also set up if it is in use. */
3594
3595 void
3596 aarch64_expand_prologue (void)
3597 {
3598 aarch64_layout_frame ();
3599
3600 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3601 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3602 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3603 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3604 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3605 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3606 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3607 rtx_insn *insn;
3608
3609 /* Sign return address for functions. */
3610 if (aarch64_return_address_signing_enabled ())
3611 {
3612 insn = emit_insn (gen_pacisp ());
3613 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3614 RTX_FRAME_RELATED_P (insn) = 1;
3615 }
3616
3617 if (flag_stack_usage_info)
3618 current_function_static_stack_size = frame_size;
3619
3620 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3621 {
3622 if (crtl->is_leaf && !cfun->calls_alloca)
3623 {
3624 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3625 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3626 frame_size - STACK_CHECK_PROTECT);
3627 }
3628 else if (frame_size > 0)
3629 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3630 }
3631
3632 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3633
3634 if (callee_adjust != 0)
3635 aarch64_push_regs (reg1, reg2, callee_adjust);
3636
3637 if (frame_pointer_needed)
3638 {
3639 if (callee_adjust == 0)
3640 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3641 R30_REGNUM, false);
3642 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3643 stack_pointer_rtx,
3644 GEN_INT (callee_offset)));
3645 RTX_FRAME_RELATED_P (insn) = 1;
3646 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3647 }
3648
3649 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3650 callee_adjust != 0 || frame_pointer_needed);
3651 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3652 callee_adjust != 0 || frame_pointer_needed);
3653 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3654 }
3655
3656 /* Return TRUE if we can use a simple_return insn.
3657
3658    This function checks whether the callee-saved stack is empty, which
3659    means no restore actions are needed.  The pro_and_epilogue pass uses
3660    this to check whether the shrink-wrapping optimization is feasible.  */
3661
3662 bool
3663 aarch64_use_return_insn_p (void)
3664 {
3665 if (!reload_completed)
3666 return false;
3667
3668 if (crtl->profile)
3669 return false;
3670
3671 aarch64_layout_frame ();
3672
3673 return cfun->machine->frame.frame_size == 0;
3674 }
3675
3676 /* Generate the epilogue instructions for returning from a function.
3677 This is almost exactly the reverse of the prologue sequence, except
3678 that we need to insert barriers to avoid scheduling loads that read
3679 from a deallocated stack, and we optimize the unwind records by
3680 emitting them all together if possible. */
3681 void
3682 aarch64_expand_epilogue (bool for_sibcall)
3683 {
3684 aarch64_layout_frame ();
3685
3686 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3687 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3688 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3689 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3690 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3691 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3692 rtx cfi_ops = NULL;
3693 rtx_insn *insn;
3694
3695 /* We need a memory barrier to prevent reads from the deallocated stack. */
3696 bool need_barrier_p = (get_frame_size ()
3697 + cfun->machine->frame.saved_varargs_size) != 0;
3698
3699 /* Emit a barrier to prevent loads from a deallocated stack. */
3700 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3701 || crtl->calls_eh_return)
3702 {
3703 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3704 need_barrier_p = false;
3705 }
3706
3707 /* Restore the stack pointer from the frame pointer if it may not
3708 be the same as the stack pointer. */
3709 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3710 {
3711 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3712 hard_frame_pointer_rtx,
3713 GEN_INT (-callee_offset)));
3714 /* If writeback is used when restoring callee-saves, the CFA
3715 is restored on the instruction doing the writeback. */
3716 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3717 }
3718 else
3719 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3720
3721 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3722 callee_adjust != 0, &cfi_ops);
3723 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3724 callee_adjust != 0, &cfi_ops);
3725
3726 if (need_barrier_p)
3727 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3728
3729 if (callee_adjust != 0)
3730 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3731
3732 if (callee_adjust != 0 || initial_adjust > 65536)
3733 {
3734 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3735 insn = get_last_insn ();
3736 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3737 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3738 RTX_FRAME_RELATED_P (insn) = 1;
3739 cfi_ops = NULL;
3740 }
3741
3742 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3743
3744 if (cfi_ops)
3745 {
3746 /* Emit delayed restores and reset the CFA to be SP. */
3747 insn = get_last_insn ();
3748 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3749 REG_NOTES (insn) = cfi_ops;
3750 RTX_FRAME_RELATED_P (insn) = 1;
3751 }
3752
3753 /* We prefer to emit the combined return/authenticate instruction RETAA,
3754 however there are three cases in which we must instead emit an explicit
3755 authentication instruction.
3756
3757 1) Sibcalls don't return in a normal way, so if we're about to call one
3758 we must authenticate.
3759
3760 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3761 generating code for !TARGET_ARMV8_3 we can't use it and must
3762 explicitly authenticate.
3763
3764 3) On an eh_return path we make extra stack adjustments to update the
3765 canonical frame address to be the exception handler's CFA. We want
3766 to authenticate using the CFA of the function which calls eh_return.
3767 */
3768 if (aarch64_return_address_signing_enabled ()
3769 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3770 {
3771 insn = emit_insn (gen_autisp ());
3772 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3773 RTX_FRAME_RELATED_P (insn) = 1;
3774 }
3775
3776 /* Stack adjustment for exception handler. */
3777 if (crtl->calls_eh_return)
3778 {
3779 /* We need to unwind the stack by the offset computed by
3780 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3781 to be SP; letting the CFA move during this adjustment
3782 is just as correct as retaining the CFA from the body
3783 of the function. Therefore, do nothing special. */
3784 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3785 }
3786
3787 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3788 if (!for_sibcall)
3789 emit_jump_insn (ret_rtx);
3790 }
3791
3792 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3793 normally or return to a previous frame after unwinding.
3794
3795 An EH return uses a single shared return sequence. The epilogue is
3796 exactly like a normal epilogue except that it has an extra input
3797 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3798 that must be applied after the frame has been destroyed. An extra label
3799 is inserted before the epilogue which initializes this register to zero,
3800 and this is the entry point for a normal return.
3801
3802 An actual EH return updates the return address, initializes the stack
3803 adjustment and jumps directly into the epilogue (bypassing the zeroing
3804 of the adjustment). Since the return address is typically saved on the
3805 stack when a function makes a call, the saved LR must be updated outside
3806 the epilogue.
3807
3808 This poses problems as the store is generated well before the epilogue,
3809 so the offset of LR is not known yet. Also optimizations will remove the
3810 store as it appears dead, even after the epilogue is generated (as the
3811 base or offset for loading LR is different in many cases).
3812
3813 To avoid these problems this implementation forces the frame pointer
3814 in eh_return functions so that the location of LR is fixed and known early.
3815 It also marks the store volatile, so no optimization is permitted to
3816 remove the store. */
3817 rtx
3818 aarch64_eh_return_handler_rtx (void)
3819 {
3820 rtx tmp = gen_frame_mem (Pmode,
3821 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3822
3823 /* Mark the store volatile, so no optimization is permitted to remove it. */
3824 MEM_VOLATILE_P (tmp) = true;
3825 return tmp;
3826 }
3827
3828 /* Output code to add DELTA to the first argument, and then jump
3829 to FUNCTION. Used for C++ multiple inheritance. */
3830 static void
3831 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3832 HOST_WIDE_INT delta,
3833 HOST_WIDE_INT vcall_offset,
3834 tree function)
3835 {
3836 /* The this pointer is always in x0. Note that this differs from
3837 Arm where the this pointer may be bumped to r1 if r0 is required
3838 to return a pointer to an aggregate. On AArch64 a result value
3839 pointer will be in x8. */
3840 int this_regno = R0_REGNUM;
3841 rtx this_rtx, temp0, temp1, addr, funexp;
3842 rtx_insn *insn;
3843
3844 reload_completed = 1;
3845 emit_note (NOTE_INSN_PROLOGUE_END);
3846
3847 if (vcall_offset == 0)
3848 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3849 else
3850 {
3851 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3852
3853 this_rtx = gen_rtx_REG (Pmode, this_regno);
3854 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3855 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3856
3857 addr = this_rtx;
3858 if (delta != 0)
3859 {
3860 if (delta >= -256 && delta < 256)
3861 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3862 plus_constant (Pmode, this_rtx, delta));
3863 else
3864 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3865 }
3866
3867 if (Pmode == ptr_mode)
3868 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3869 else
3870 aarch64_emit_move (temp0,
3871 gen_rtx_ZERO_EXTEND (Pmode,
3872 gen_rtx_MEM (ptr_mode, addr)));
3873
3874 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3875 addr = plus_constant (Pmode, temp0, vcall_offset);
3876 else
3877 {
3878 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3879 Pmode);
3880 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3881 }
3882
3883 if (Pmode == ptr_mode)
3884 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3885 else
3886 aarch64_emit_move (temp1,
3887 gen_rtx_SIGN_EXTEND (Pmode,
3888 gen_rtx_MEM (ptr_mode, addr)));
3889
3890 emit_insn (gen_add2_insn (this_rtx, temp1));
3891 }
3892
3893 /* Generate a tail call to the target function. */
3894 if (!TREE_USED (function))
3895 {
3896 assemble_external (function);
3897 TREE_USED (function) = 1;
3898 }
3899 funexp = XEXP (DECL_RTL (function), 0);
3900 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3901 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3902 SIBLING_CALL_P (insn) = 1;
3903
3904 insn = get_insns ();
3905 shorten_branches (insn);
3906 final_start_function (insn, file, 1);
3907 final (insn, file, 1);
3908 final_end_function ();
3909
3910 /* Stop pretending to be a post-reload pass. */
3911 reload_completed = 0;
3912 }
3913
3914 static bool
3915 aarch64_tls_referenced_p (rtx x)
3916 {
3917 if (!TARGET_HAVE_TLS)
3918 return false;
3919 subrtx_iterator::array_type array;
3920 FOR_EACH_SUBRTX (iter, array, x, ALL)
3921 {
3922 const_rtx x = *iter;
3923 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3924 return true;
3925 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3926 TLS offsets, not real symbol references. */
3927 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3928 iter.skip_subrtxes ();
3929 }
3930 return false;
3931 }
3932
3933
3934 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3935 a left shift of 0 or 12 bits. */
3936 bool
3937 aarch64_uimm12_shift (HOST_WIDE_INT val)
3938 {
3939 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3940 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3941 );
3942 }
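
/* A minimal standalone sketch of the same check, kept out of the build;
   the names here are illustrative and not part of the compiler.  */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
uimm12_shift_example (uint64_t val)
{
  /* A 12-bit immediate, optionally shifted left by 12, as accepted by
     ADD/SUB (immediate).  */
  return (val & 0xfffull) == val || (val & (0xfffull << 12)) == val;
}

/* uimm12_shift_example (0xabc)    -> true   (12 bits, shift 0)
   uimm12_shift_example (0xabc000) -> true   (12 bits, shift 12)
   uimm12_shift_example (0xabc00)  -> false  (would need a shift of 8)  */
#endif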
3943
3944
3945 /* Return true if val is an immediate that can be loaded into a
3946 register by a MOVZ instruction. */
3947 static bool
3948 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3949 {
3950 if (GET_MODE_SIZE (mode) > 4)
3951 {
3952 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3953 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3954 return 1;
3955 }
3956 else
3957 {
3958 /* Ignore sign extension. */
3959 val &= (HOST_WIDE_INT) 0xffffffff;
3960 }
3961 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3962 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3963 }
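
/* For illustration, a standalone sketch (not built) of the property the
   64-bit path above relies on: a value is MOVZ-loadable when all of its
   set bits fall inside one aligned 16-bit field.  */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
movz_imm_example (uint64_t val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffull << shift)) == val)
      return true;
  return false;
}
#endif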
3964
3965 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3966
3967 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3968 {
3969 0x0000000100000001ull,
3970 0x0001000100010001ull,
3971 0x0101010101010101ull,
3972 0x1111111111111111ull,
3973 0x5555555555555555ull,
3974 };
3975
3976
3977 /* Return true if val is a valid bitmask immediate. */
3978
3979 bool
3980 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3981 {
3982 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3983 int bits;
3984
3985 /* Check for a single sequence of one bits and return quickly if so.
3986 The special cases of all ones and all zeroes return false. */
3987 val = (unsigned HOST_WIDE_INT) val_in;
3988 tmp = val + (val & -val);
3989
3990 if (tmp == (tmp & -tmp))
3991 return (val + 1) > 1;
3992
3993 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3994 if (mode == SImode)
3995 val = (val << 32) | (val & 0xffffffff);
3996
3997 /* Invert if the immediate doesn't start with a zero bit - this means we
3998 only need to search for sequences of one bits. */
3999 if (val & 1)
4000 val = ~val;
4001
4002 /* Find the first set bit and set tmp to val with the first sequence of one
4003 bits removed. Return success if there is a single sequence of ones. */
4004 first_one = val & -val;
4005 tmp = val & (val + first_one);
4006
4007 if (tmp == 0)
4008 return true;
4009
4010 /* Find the next set bit and compute the difference in bit position. */
4011 next_one = tmp & -tmp;
4012 bits = clz_hwi (first_one) - clz_hwi (next_one);
4013 mask = val ^ tmp;
4014
4015 /* Check the bit position difference is a power of 2, and that the first
4016 sequence of one bits fits within 'bits' bits. */
4017 if ((mask >> bits) != 0 || bits != (bits & -bits))
4018 return false;
4019
4020 /* Check the sequence of one bits is repeated 64/bits times. */
4021 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4022 }
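
/* For reference, a brute-force sketch (not built, independent of the code
   above) of the property being tested: a bitmask immediate is an element
   of 2, 4, 8, 16, 32 or 64 bits containing a single rotated run of ones,
   replicated across the 64-bit value; all-zeros and all-ones are excluded.  */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
bitmask_imm_bruteforce_example (uint64_t val)
{
  if (val == 0 || ~val == 0)
    return false;

  for (unsigned size = 2; size <= 64; size *= 2)
    {
      uint64_t emask = size == 64 ? ~0ull : (1ull << size) - 1;
      uint64_t elt = val & emask;

      /* The element must replicate across the whole register.  */
      uint64_t rep = 0;
      for (unsigned i = 0; i < 64; i += size)
        rep |= elt << i;
      if (rep != val)
        continue;

      /* Some rotation of the element must be a contiguous run of ones
         starting at bit 0, i.e. of the form 2^n - 1.  */
      for (unsigned rot = 0; rot < size; rot++)
        {
          uint64_t r = rot ? ((elt >> rot) | (elt << (size - rot))) & emask
                           : elt;
          if (((r + 1) & r) == 0)
            return true;
        }
    }
  return false;
}
#endif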
4023
4024 /* Create a mask of ones covering the range from the lowest to the highest
4025 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
4026
4027 unsigned HOST_WIDE_INT
4028 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4029 {
4030 int lowest_bit_set = ctz_hwi (val_in);
4031 int highest_bit_set = floor_log2 (val_in);
4032 gcc_assert (val_in != 0);
4033
4034 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4035 (HOST_WIDE_INT_1U << lowest_bit_set));
4036 }
4037
4038 /* Create a constant where all bits outside the range from the lowest set
4039 bit to the highest set bit of VAL_IN are set to 1. */
4040
4041 unsigned HOST_WIDE_INT
4042 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4043 {
4044 return val_in | ~aarch64_and_split_imm1 (val_in);
4045 }
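
/* A small worked example (not built; the constants are chosen here purely
   for illustration): an AND with a constant that is not itself a valid
   bitmask immediate can sometimes be performed as two ANDs, using the two
   masks produced above.  */
#if 0
#include <assert.h>
#include <stdint.h>

static void
and_split_example (void)
{
  uint64_t c    = 0x00ff00ff00000000ull; /* not a single bitmask immediate */
  uint64_t imm1 = 0x00ffffff00000000ull; /* ones from lowest to highest set bit */
  uint64_t imm2 = 0xffff00ffffffffffull; /* c | ~imm1 */

  assert ((c | ~imm1) == imm2);
  assert ((imm1 & imm2) == c); /* so (x & imm1) & imm2 == x & c */
}
#endif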
4046
4047 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4048
4049 bool
4050 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4051 {
4052 if (aarch64_bitmask_imm (val_in, mode))
4053 return false;
4054
4055 if (aarch64_move_imm (val_in, mode))
4056 return false;
4057
4058 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4059
4060 return aarch64_bitmask_imm (imm2, mode);
4061 }
4062
4063 /* Return true if val is an immediate that can be loaded into a
4064 register in a single instruction. */
4065 bool
4066 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4067 {
4068 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4069 return 1;
4070 return aarch64_bitmask_imm (val, mode);
4071 }
4072
4073 static bool
4074 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4075 {
4076 rtx base, offset;
4077
4078 if (GET_CODE (x) == HIGH)
4079 return true;
4080
4081 split_const (x, &base, &offset);
4082 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4083 {
4084 if (aarch64_classify_symbol (base, offset)
4085 != SYMBOL_FORCE_TO_MEM)
4086 return true;
4087 else
4088 /* Avoid generating a 64-bit relocation in ILP32; leave it
4089 to aarch64_expand_mov_immediate to handle properly. */
4090 return mode != ptr_mode;
4091 }
4092
4093 return aarch64_tls_referenced_p (x);
4094 }
4095
4096 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4097 The expansion for a table switch is quite expensive due to the number
4098 of instructions, the table lookup and the hard-to-predict indirect jump.
4099 When optimizing for speed with -O3 enabled, use the per-core tuning if
4100 set, otherwise use tables for > 16 cases as a tradeoff between size and
4101 performance. When optimizing for size, use the default setting. */
4102
4103 static unsigned int
4104 aarch64_case_values_threshold (void)
4105 {
4106 /* Use the specified limit for the number of cases before using jump
4107 tables at higher optimization levels. */
4108 if (optimize > 2
4109 && selected_cpu->tune->max_case_values != 0)
4110 return selected_cpu->tune->max_case_values;
4111 else
4112 return optimize_size ? default_case_values_threshold () : 17;
4113 }
4114
4115 /* Return true if register REGNO is a valid index register.
4116 STRICT_P is true if REG_OK_STRICT is in effect. */
4117
4118 bool
4119 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4120 {
4121 if (!HARD_REGISTER_NUM_P (regno))
4122 {
4123 if (!strict_p)
4124 return true;
4125
4126 if (!reg_renumber)
4127 return false;
4128
4129 regno = reg_renumber[regno];
4130 }
4131 return GP_REGNUM_P (regno);
4132 }
4133
4134 /* Return true if register REGNO is a valid base register for mode MODE.
4135 STRICT_P is true if REG_OK_STRICT is in effect. */
4136
4137 bool
4138 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4139 {
4140 if (!HARD_REGISTER_NUM_P (regno))
4141 {
4142 if (!strict_p)
4143 return true;
4144
4145 if (!reg_renumber)
4146 return false;
4147
4148 regno = reg_renumber[regno];
4149 }
4150
4151 /* The fake registers will be eliminated to either the stack or
4152 hard frame pointer, both of which are usually valid base registers.
4153 Reload deals with the cases where the eliminated form isn't valid. */
4154 return (GP_REGNUM_P (regno)
4155 || regno == SP_REGNUM
4156 || regno == FRAME_POINTER_REGNUM
4157 || regno == ARG_POINTER_REGNUM);
4158 }
4159
4160 /* Return true if X is a valid base register for mode MODE.
4161 STRICT_P is true if REG_OK_STRICT is in effect. */
4162
4163 static bool
4164 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4165 {
4166 if (!strict_p && GET_CODE (x) == SUBREG)
4167 x = SUBREG_REG (x);
4168
4169 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4170 }
4171
4172 /* Return true if the address offset X is a valid index. If it is, fill in INFO
4173 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4174
4175 static bool
4176 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4177 machine_mode mode, bool strict_p)
4178 {
4179 enum aarch64_address_type type;
4180 rtx index;
4181 int shift;
4182
4183 /* (reg:P) */
4184 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4185 && GET_MODE (x) == Pmode)
4186 {
4187 type = ADDRESS_REG_REG;
4188 index = x;
4189 shift = 0;
4190 }
4191 /* (sign_extend:DI (reg:SI)) */
4192 else if ((GET_CODE (x) == SIGN_EXTEND
4193 || GET_CODE (x) == ZERO_EXTEND)
4194 && GET_MODE (x) == DImode
4195 && GET_MODE (XEXP (x, 0)) == SImode)
4196 {
4197 type = (GET_CODE (x) == SIGN_EXTEND)
4198 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4199 index = XEXP (x, 0);
4200 shift = 0;
4201 }
4202 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4203 else if (GET_CODE (x) == MULT
4204 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4205 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4206 && GET_MODE (XEXP (x, 0)) == DImode
4207 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4208 && CONST_INT_P (XEXP (x, 1)))
4209 {
4210 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4211 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4212 index = XEXP (XEXP (x, 0), 0);
4213 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4214 }
4215 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4216 else if (GET_CODE (x) == ASHIFT
4217 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4218 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4219 && GET_MODE (XEXP (x, 0)) == DImode
4220 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4221 && CONST_INT_P (XEXP (x, 1)))
4222 {
4223 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4224 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4225 index = XEXP (XEXP (x, 0), 0);
4226 shift = INTVAL (XEXP (x, 1));
4227 }
4228 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4229 else if ((GET_CODE (x) == SIGN_EXTRACT
4230 || GET_CODE (x) == ZERO_EXTRACT)
4231 && GET_MODE (x) == DImode
4232 && GET_CODE (XEXP (x, 0)) == MULT
4233 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4234 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4235 {
4236 type = (GET_CODE (x) == SIGN_EXTRACT)
4237 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4238 index = XEXP (XEXP (x, 0), 0);
4239 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4240 if (INTVAL (XEXP (x, 1)) != 32 + shift
4241 || INTVAL (XEXP (x, 2)) != 0)
4242 shift = -1;
4243 }
4244 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4245 (const_int 0xffffffff<<shift)) */
4246 else if (GET_CODE (x) == AND
4247 && GET_MODE (x) == DImode
4248 && GET_CODE (XEXP (x, 0)) == MULT
4249 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4250 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4251 && CONST_INT_P (XEXP (x, 1)))
4252 {
4253 type = ADDRESS_REG_UXTW;
4254 index = XEXP (XEXP (x, 0), 0);
4255 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4256 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4257 shift = -1;
4258 }
4259 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4260 else if ((GET_CODE (x) == SIGN_EXTRACT
4261 || GET_CODE (x) == ZERO_EXTRACT)
4262 && GET_MODE (x) == DImode
4263 && GET_CODE (XEXP (x, 0)) == ASHIFT
4264 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4265 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4266 {
4267 type = (GET_CODE (x) == SIGN_EXTRACT)
4268 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4269 index = XEXP (XEXP (x, 0), 0);
4270 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4271 if (INTVAL (XEXP (x, 1)) != 32 + shift
4272 || INTVAL (XEXP (x, 2)) != 0)
4273 shift = -1;
4274 }
4275 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4276 (const_int 0xffffffff<<shift)) */
4277 else if (GET_CODE (x) == AND
4278 && GET_MODE (x) == DImode
4279 && GET_CODE (XEXP (x, 0)) == ASHIFT
4280 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4281 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4282 && CONST_INT_P (XEXP (x, 1)))
4283 {
4284 type = ADDRESS_REG_UXTW;
4285 index = XEXP (XEXP (x, 0), 0);
4286 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4287 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4288 shift = -1;
4289 }
4290 /* (mult:P (reg:P) (const_int scale)) */
4291 else if (GET_CODE (x) == MULT
4292 && GET_MODE (x) == Pmode
4293 && GET_MODE (XEXP (x, 0)) == Pmode
4294 && CONST_INT_P (XEXP (x, 1)))
4295 {
4296 type = ADDRESS_REG_REG;
4297 index = XEXP (x, 0);
4298 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4299 }
4300 /* (ashift:P (reg:P) (const_int shift)) */
4301 else if (GET_CODE (x) == ASHIFT
4302 && GET_MODE (x) == Pmode
4303 && GET_MODE (XEXP (x, 0)) == Pmode
4304 && CONST_INT_P (XEXP (x, 1)))
4305 {
4306 type = ADDRESS_REG_REG;
4307 index = XEXP (x, 0);
4308 shift = INTVAL (XEXP (x, 1));
4309 }
4310 else
4311 return false;
4312
4313 if (GET_CODE (index) == SUBREG)
4314 index = SUBREG_REG (index);
4315
4316 if ((shift == 0
4317 || (shift > 0 && shift <= 3
4318 && (1 << shift) == GET_MODE_SIZE (mode)))
4319 && REG_P (index)
4320 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4321 {
4322 info->type = type;
4323 info->offset = index;
4324 info->shift = shift;
4325 return true;
4326 }
4327
4328 return false;
4329 }
4330
4331 /* Return true if MODE is one of the modes for which we
4332 support LDP/STP operations. */
4333
4334 static bool
4335 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4336 {
4337 return mode == SImode || mode == DImode
4338 || mode == SFmode || mode == DFmode
4339 || (aarch64_vector_mode_supported_p (mode)
4340 && GET_MODE_SIZE (mode) == 8);
4341 }
4342
4343 /* Return true if REGNO is a virtual pointer register, or an eliminable
4344 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4345 include stack_pointer or hard_frame_pointer. */
4346 static bool
4347 virt_or_elim_regno_p (unsigned regno)
4348 {
4349 return ((regno >= FIRST_VIRTUAL_REGISTER
4350 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4351 || regno == FRAME_POINTER_REGNUM
4352 || regno == ARG_POINTER_REGNUM);
4353 }
4354
4355 /* Return true if X is a valid address for machine mode MODE. If it is,
4356 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4357 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4358
4359 static bool
4360 aarch64_classify_address (struct aarch64_address_info *info,
4361 rtx x, machine_mode mode,
4362 RTX_CODE outer_code, bool strict_p)
4363 {
4364 enum rtx_code code = GET_CODE (x);
4365 rtx op0, op1;
4366
4367 /* On BE, we use load/store pair for all large int mode load/stores.
4368 TI/TFmode may also use a load/store pair. */
4369 bool load_store_pair_p = (outer_code == PARALLEL
4370 || mode == TImode
4371 || mode == TFmode
4372 || (BYTES_BIG_ENDIAN
4373 && aarch64_vect_struct_mode_p (mode)));
4374
4375 bool allow_reg_index_p =
4376 !load_store_pair_p
4377 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4378 && !aarch64_vect_struct_mode_p (mode);
4379
4380 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4381 REG addressing. */
4382 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4383 && (code != POST_INC && code != REG))
4384 return false;
4385
4386 switch (code)
4387 {
4388 case REG:
4389 case SUBREG:
4390 info->type = ADDRESS_REG_IMM;
4391 info->base = x;
4392 info->offset = const0_rtx;
4393 return aarch64_base_register_rtx_p (x, strict_p);
4394
4395 case PLUS:
4396 op0 = XEXP (x, 0);
4397 op1 = XEXP (x, 1);
4398
4399 if (! strict_p
4400 && REG_P (op0)
4401 && virt_or_elim_regno_p (REGNO (op0))
4402 && CONST_INT_P (op1))
4403 {
4404 info->type = ADDRESS_REG_IMM;
4405 info->base = op0;
4406 info->offset = op1;
4407
4408 return true;
4409 }
4410
4411 if (GET_MODE_SIZE (mode) != 0
4412 && CONST_INT_P (op1)
4413 && aarch64_base_register_rtx_p (op0, strict_p))
4414 {
4415 HOST_WIDE_INT offset = INTVAL (op1);
4416
4417 info->type = ADDRESS_REG_IMM;
4418 info->base = op0;
4419 info->offset = op1;
4420
4421 /* TImode and TFmode values are allowed in both pairs of X
4422 registers and individual Q registers. The available
4423 address modes are:
4424 X,X: 7-bit signed scaled offset
4425 Q: 9-bit signed offset
4426 We conservatively require an offset representable in either mode.
4427 When performing the check for pairs of X registers i.e. LDP/STP
4428 pass down DImode since that is the natural size of the LDP/STP
4429 instruction memory accesses. */
4430 if (mode == TImode || mode == TFmode)
4431 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4432 && (offset_9bit_signed_unscaled_p (mode, offset)
4433 || offset_12bit_unsigned_scaled_p (mode, offset)));
4434
4435 /* A 7-bit offset check because OImode will emit an ldp/stp
4436 instruction (only big endian will get here).
4437 For ldp/stp instructions, the offset is scaled for the size of a
4438 single element of the pair. */
4439 if (mode == OImode)
4440 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4441
4442 /* Three 9/12-bit offset checks because CImode will emit three
4443 ldr/str instructions (only big endian will get here). */
4444 if (mode == CImode)
4445 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4446 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4447 || offset_12bit_unsigned_scaled_p (V16QImode,
4448 offset + 32)));
4449
4450 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4451 instructions (only big endian will get here). */
4452 if (mode == XImode)
4453 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4454 && aarch64_offset_7bit_signed_scaled_p (TImode,
4455 offset + 32));
4456
4457 if (load_store_pair_p)
4458 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4459 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4460 else
4461 return (offset_9bit_signed_unscaled_p (mode, offset)
4462 || offset_12bit_unsigned_scaled_p (mode, offset));
4463 }
4464
4465 if (allow_reg_index_p)
4466 {
4467 /* Look for base + (scaled/extended) index register. */
4468 if (aarch64_base_register_rtx_p (op0, strict_p)
4469 && aarch64_classify_index (info, op1, mode, strict_p))
4470 {
4471 info->base = op0;
4472 return true;
4473 }
4474 if (aarch64_base_register_rtx_p (op1, strict_p)
4475 && aarch64_classify_index (info, op0, mode, strict_p))
4476 {
4477 info->base = op1;
4478 return true;
4479 }
4480 }
4481
4482 return false;
4483
4484 case POST_INC:
4485 case POST_DEC:
4486 case PRE_INC:
4487 case PRE_DEC:
4488 info->type = ADDRESS_REG_WB;
4489 info->base = XEXP (x, 0);
4490 info->offset = NULL_RTX;
4491 return aarch64_base_register_rtx_p (info->base, strict_p);
4492
4493 case POST_MODIFY:
4494 case PRE_MODIFY:
4495 info->type = ADDRESS_REG_WB;
4496 info->base = XEXP (x, 0);
4497 if (GET_CODE (XEXP (x, 1)) == PLUS
4498 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4499 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4500 && aarch64_base_register_rtx_p (info->base, strict_p))
4501 {
4502 HOST_WIDE_INT offset;
4503 info->offset = XEXP (XEXP (x, 1), 1);
4504 offset = INTVAL (info->offset);
4505
4506 /* TImode and TFmode values are allowed in both pairs of X
4507 registers and individual Q registers. The available
4508 address modes are:
4509 X,X: 7-bit signed scaled offset
4510 Q: 9-bit signed offset
4511 We conservatively require an offset representable in either mode.
4512 */
4513 if (mode == TImode || mode == TFmode)
4514 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4515 && offset_9bit_signed_unscaled_p (mode, offset));
4516
4517 if (load_store_pair_p)
4518 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4519 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4520 else
4521 return offset_9bit_signed_unscaled_p (mode, offset);
4522 }
4523 return false;
4524
4525 case CONST:
4526 case SYMBOL_REF:
4527 case LABEL_REF:
4528 /* load literal: pc-relative constant pool entry. Only supported
4529 for SI mode or larger. */
4530 info->type = ADDRESS_SYMBOLIC;
4531
4532 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4533 {
4534 rtx sym, addend;
4535
4536 split_const (x, &sym, &addend);
4537 return ((GET_CODE (sym) == LABEL_REF
4538 || (GET_CODE (sym) == SYMBOL_REF
4539 && CONSTANT_POOL_ADDRESS_P (sym)
4540 && aarch64_pcrelative_literal_loads)));
4541 }
4542 return false;
4543
4544 case LO_SUM:
4545 info->type = ADDRESS_LO_SUM;
4546 info->base = XEXP (x, 0);
4547 info->offset = XEXP (x, 1);
4548 if (allow_reg_index_p
4549 && aarch64_base_register_rtx_p (info->base, strict_p))
4550 {
4551 rtx sym, offs;
4552 split_const (info->offset, &sym, &offs);
4553 if (GET_CODE (sym) == SYMBOL_REF
4554 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4555 {
4556 /* The symbol and offset must be aligned to the access size. */
4557 unsigned int align;
4558 unsigned int ref_size;
4559
4560 if (CONSTANT_POOL_ADDRESS_P (sym))
4561 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4562 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4563 {
4564 tree exp = SYMBOL_REF_DECL (sym);
4565 align = TYPE_ALIGN (TREE_TYPE (exp));
4566 align = CONSTANT_ALIGNMENT (exp, align);
4567 }
4568 else if (SYMBOL_REF_DECL (sym))
4569 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4570 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4571 && SYMBOL_REF_BLOCK (sym) != NULL)
4572 align = SYMBOL_REF_BLOCK (sym)->alignment;
4573 else
4574 align = BITS_PER_UNIT;
4575
4576 ref_size = GET_MODE_SIZE (mode);
4577 if (ref_size == 0)
4578 ref_size = GET_MODE_SIZE (DImode);
4579
4580 return ((INTVAL (offs) & (ref_size - 1)) == 0
4581 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4582 }
4583 }
4584 return false;
4585
4586 default:
4587 return false;
4588 }
4589 }
4590
4591 /* Return true if the address X is valid for a PRFM instruction.
4592 STRICT_P is true if we should do strict checking with
4593 aarch64_classify_address. */
4594
4595 bool
4596 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4597 {
4598 struct aarch64_address_info addr;
4599
4600 /* PRFM accepts the same addresses as DImode... */
4601 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4602 if (!res)
4603 return false;
4604
4605 /* ... except writeback forms. */
4606 return addr.type != ADDRESS_REG_WB;
4607 }
4608
4609 bool
4610 aarch64_symbolic_address_p (rtx x)
4611 {
4612 rtx offset;
4613
4614 split_const (x, &x, &offset);
4615 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4616 }
4617
4618 /* Classify the base of symbolic expression X. */
4619
4620 enum aarch64_symbol_type
4621 aarch64_classify_symbolic_expression (rtx x)
4622 {
4623 rtx offset;
4624
4625 split_const (x, &x, &offset);
4626 return aarch64_classify_symbol (x, offset);
4627 }
4628
4629
4630 /* Return TRUE if X is a legitimate address for accessing memory in
4631 mode MODE. */
4632 static bool
4633 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4634 {
4635 struct aarch64_address_info addr;
4636
4637 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4638 }
4639
4640 /* Return TRUE if X is a legitimate address for accessing memory in
4641 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4642 pair operation. */
4643 bool
4644 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4645 RTX_CODE outer_code, bool strict_p)
4646 {
4647 struct aarch64_address_info addr;
4648
4649 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4650 }
4651
4652 /* Split an out-of-range address displacement into a base and offset.
4653 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
4654 to increase opportunities for sharing the base address across access sizes.
4655 For unaligned accesses and TI/TF mode, use the signed 9-bit range. */
4656 static bool
4657 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4658 {
4659 HOST_WIDE_INT offset = INTVAL (*disp);
4660 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4661
4662 if (mode == TImode || mode == TFmode
4663 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4664 base = (offset + 0x100) & ~0x1ff;
4665
4666 *off = GEN_INT (base);
4667 *disp = GEN_INT (offset - base);
4668 return true;
4669 }
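
/* A standalone sketch of the split performed above, kept out of the build;
   the function name and parameters here are illustrative only.  */
#if 0
#include <stdint.h>

static int64_t
split_displacement_example (int64_t offset, unsigned access_size,
                            int unaligned_or_ti_tf, int64_t *remainder)
{
  int64_t anchor;

  if (unaligned_or_ti_tf)
    /* Signed 9-bit window around the offset.  */
    anchor = (offset + 0x100) & ~(int64_t) 0x1ff;
  else
    /* 4KB range for 1- and 2-byte accesses, 16KB otherwise.  */
    anchor = offset & ~(int64_t) (access_size < 4 ? 0xfff : 0x3ffc);

  *remainder = offset - anchor;
  return anchor;
}
#endif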
4670
4671 /* Return TRUE if rtx X is immediate constant 0.0 */
4672 bool
4673 aarch64_float_const_zero_rtx_p (rtx x)
4674 {
4675 if (GET_MODE (x) == VOIDmode)
4676 return false;
4677
4678 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4679 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4680 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4681 }
4682
4683 /* Return the fixed registers used for condition codes. */
4684
4685 static bool
4686 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4687 {
4688 *p1 = CC_REGNUM;
4689 *p2 = INVALID_REGNUM;
4690 return true;
4691 }
4692
4693 /* This function is used by the call expanders of the machine description.
4694 RESULT is the register in which the result is returned. It's NULL for
4695 "call" and "sibcall".
4696 MEM is the location of the function call.
4697 SIBCALL indicates whether this function call is a normal call or a sibling
4698 call; a different pattern is generated accordingly. */
4699
4700 void
4701 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4702 {
4703 rtx call, callee, tmp;
4704 rtvec vec;
4705 machine_mode mode;
4706
4707 gcc_assert (MEM_P (mem));
4708 callee = XEXP (mem, 0);
4709 mode = GET_MODE (callee);
4710 gcc_assert (mode == Pmode);
4711
4712 /* Decide if we should generate indirect calls by loading the
4713 address of the callee into a register before performing
4714 the branch-and-link. */
4715 if (SYMBOL_REF_P (callee)
4716 ? (aarch64_is_long_call_p (callee)
4717 || aarch64_is_noplt_call_p (callee))
4718 : !REG_P (callee))
4719 XEXP (mem, 0) = force_reg (mode, callee);
4720
4721 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4722
4723 if (result != NULL_RTX)
4724 call = gen_rtx_SET (result, call);
4725
4726 if (sibcall)
4727 tmp = ret_rtx;
4728 else
4729 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4730
4731 vec = gen_rtvec (2, call, tmp);
4732 call = gen_rtx_PARALLEL (VOIDmode, vec);
4733
4734 aarch64_emit_call_insn (call);
4735 }
4736
4737 /* Emit call insn with PAT and do aarch64-specific handling. */
4738
4739 void
4740 aarch64_emit_call_insn (rtx pat)
4741 {
4742 rtx insn = emit_call_insn (pat);
4743
4744 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4745 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4746 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4747 }
4748
4749 machine_mode
4750 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4751 {
4752 /* All floating point compares return CCFP if it is an equality
4753 comparison, and CCFPE otherwise. */
4754 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4755 {
4756 switch (code)
4757 {
4758 case EQ:
4759 case NE:
4760 case UNORDERED:
4761 case ORDERED:
4762 case UNLT:
4763 case UNLE:
4764 case UNGT:
4765 case UNGE:
4766 case UNEQ:
4767 case LTGT:
4768 return CCFPmode;
4769
4770 case LT:
4771 case LE:
4772 case GT:
4773 case GE:
4774 return CCFPEmode;
4775
4776 default:
4777 gcc_unreachable ();
4778 }
4779 }
4780
4781 /* Equality comparisons of short modes against zero can be performed
4782 using the TST instruction with the appropriate bitmask. */
4783 if (y == const0_rtx && REG_P (x)
4784 && (code == EQ || code == NE)
4785 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4786 return CC_NZmode;
4787
4788 /* Similarly, comparisons of zero_extends from shorter modes can
4789 be performed using an ANDS with an immediate mask. */
4790 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4791 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4792 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4793 && (code == EQ || code == NE))
4794 return CC_NZmode;
4795
4796 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4797 && y == const0_rtx
4798 && (code == EQ || code == NE || code == LT || code == GE)
4799 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4800 || GET_CODE (x) == NEG
4801 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4802 && CONST_INT_P (XEXP (x, 2)))))
4803 return CC_NZmode;
4804
4805 /* A compare with a shifted operand. Because of canonicalization,
4806 the comparison will have to be swapped when we emit the assembly
4807 code. */
4808 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4809 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4810 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4811 || GET_CODE (x) == LSHIFTRT
4812 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4813 return CC_SWPmode;
4814
4815 /* Similarly for a negated operand, but we can only do this for
4816 equalities. */
4817 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4818 && (REG_P (y) || GET_CODE (y) == SUBREG)
4819 && (code == EQ || code == NE)
4820 && GET_CODE (x) == NEG)
4821 return CC_Zmode;
4822
4823 /* A test for unsigned overflow. */
4824 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4825 && code == NE
4826 && GET_CODE (x) == PLUS
4827 && GET_CODE (y) == ZERO_EXTEND)
4828 return CC_Cmode;
4829
4830 /* For everything else, return CCmode. */
4831 return CCmode;
4832 }
4833
4834 static int
4835 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4836
4837 int
4838 aarch64_get_condition_code (rtx x)
4839 {
4840 machine_mode mode = GET_MODE (XEXP (x, 0));
4841 enum rtx_code comp_code = GET_CODE (x);
4842
4843 if (GET_MODE_CLASS (mode) != MODE_CC)
4844 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4845 return aarch64_get_condition_code_1 (mode, comp_code);
4846 }
4847
4848 static int
4849 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4850 {
4851 switch (mode)
4852 {
4853 case CCFPmode:
4854 case CCFPEmode:
4855 switch (comp_code)
4856 {
4857 case GE: return AARCH64_GE;
4858 case GT: return AARCH64_GT;
4859 case LE: return AARCH64_LS;
4860 case LT: return AARCH64_MI;
4861 case NE: return AARCH64_NE;
4862 case EQ: return AARCH64_EQ;
4863 case ORDERED: return AARCH64_VC;
4864 case UNORDERED: return AARCH64_VS;
4865 case UNLT: return AARCH64_LT;
4866 case UNLE: return AARCH64_LE;
4867 case UNGT: return AARCH64_HI;
4868 case UNGE: return AARCH64_PL;
4869 default: return -1;
4870 }
4871 break;
4872
4873 case CCmode:
4874 switch (comp_code)
4875 {
4876 case NE: return AARCH64_NE;
4877 case EQ: return AARCH64_EQ;
4878 case GE: return AARCH64_GE;
4879 case GT: return AARCH64_GT;
4880 case LE: return AARCH64_LE;
4881 case LT: return AARCH64_LT;
4882 case GEU: return AARCH64_CS;
4883 case GTU: return AARCH64_HI;
4884 case LEU: return AARCH64_LS;
4885 case LTU: return AARCH64_CC;
4886 default: return -1;
4887 }
4888 break;
4889
4890 case CC_SWPmode:
4891 switch (comp_code)
4892 {
4893 case NE: return AARCH64_NE;
4894 case EQ: return AARCH64_EQ;
4895 case GE: return AARCH64_LE;
4896 case GT: return AARCH64_LT;
4897 case LE: return AARCH64_GE;
4898 case LT: return AARCH64_GT;
4899 case GEU: return AARCH64_LS;
4900 case GTU: return AARCH64_CC;
4901 case LEU: return AARCH64_CS;
4902 case LTU: return AARCH64_HI;
4903 default: return -1;
4904 }
4905 break;
4906
4907 case CC_NZmode:
4908 switch (comp_code)
4909 {
4910 case NE: return AARCH64_NE;
4911 case EQ: return AARCH64_EQ;
4912 case GE: return AARCH64_PL;
4913 case LT: return AARCH64_MI;
4914 default: return -1;
4915 }
4916 break;
4917
4918 case CC_Zmode:
4919 switch (comp_code)
4920 {
4921 case NE: return AARCH64_NE;
4922 case EQ: return AARCH64_EQ;
4923 default: return -1;
4924 }
4925 break;
4926
4927 case CC_Cmode:
4928 switch (comp_code)
4929 {
4930 case NE: return AARCH64_CS;
4931 case EQ: return AARCH64_CC;
4932 default: return -1;
4933 }
4934 break;
4935
4936 default:
4937 return -1;
4938 }
4939
4940 return -1;
4941 }
4942
4943 bool
4944 aarch64_const_vec_all_same_in_range_p (rtx x,
4945 HOST_WIDE_INT minval,
4946 HOST_WIDE_INT maxval)
4947 {
4948 HOST_WIDE_INT firstval;
4949 int count, i;
4950
4951 if (GET_CODE (x) != CONST_VECTOR
4952 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4953 return false;
4954
4955 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4956 if (firstval < minval || firstval > maxval)
4957 return false;
4958
4959 count = CONST_VECTOR_NUNITS (x);
4960 for (i = 1; i < count; i++)
4961 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4962 return false;
4963
4964 return true;
4965 }
4966
4967 bool
4968 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4969 {
4970 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4971 }
4972
4973
4974 /* N Z C V. */
4975 #define AARCH64_CC_V 1
4976 #define AARCH64_CC_C (1 << 1)
4977 #define AARCH64_CC_Z (1 << 2)
4978 #define AARCH64_CC_N (1 << 3)
4979
4980 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4981 static const int aarch64_nzcv_codes[] =
4982 {
4983 0, /* EQ, Z == 1. */
4984 AARCH64_CC_Z, /* NE, Z == 0. */
4985 0, /* CS, C == 1. */
4986 AARCH64_CC_C, /* CC, C == 0. */
4987 0, /* MI, N == 1. */
4988 AARCH64_CC_N, /* PL, N == 0. */
4989 0, /* VS, V == 1. */
4990 AARCH64_CC_V, /* VC, V == 0. */
4991 0, /* HI, C == 1 && Z == 0. */
4992 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4993 AARCH64_CC_V, /* GE, N == V. */
4994 0, /* LT, N != V. */
4995 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4996 0, /* LE, !(Z == 0 && N == V). */
4997 0, /* AL, Any. */
4998 0 /* NV, Any. */
4999 };
5000
5001 static void
5002 aarch64_print_operand (FILE *f, rtx x, int code)
5003 {
5004 switch (code)
5005 {
5006 /* An integer or symbol address without a preceding # sign. */
5007 case 'c':
5008 switch (GET_CODE (x))
5009 {
5010 case CONST_INT:
5011 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5012 break;
5013
5014 case SYMBOL_REF:
5015 output_addr_const (f, x);
5016 break;
5017
5018 case CONST:
5019 if (GET_CODE (XEXP (x, 0)) == PLUS
5020 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5021 {
5022 output_addr_const (f, x);
5023 break;
5024 }
5025 /* Fall through. */
5026
5027 default:
5028 output_operand_lossage ("Unsupported operand for code '%c'", code);
5029 }
5030 break;
5031
5032 case 'e':
5033 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
5034 {
5035 int n;
5036
5037 if (!CONST_INT_P (x)
5038 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5039 {
5040 output_operand_lossage ("invalid operand for '%%%c'", code);
5041 return;
5042 }
5043
5044 switch (n)
5045 {
5046 case 3:
5047 fputc ('b', f);
5048 break;
5049 case 4:
5050 fputc ('h', f);
5051 break;
5052 case 5:
5053 fputc ('w', f);
5054 break;
5055 default:
5056 output_operand_lossage ("invalid operand for '%%%c'", code);
5057 return;
5058 }
5059 }
5060 break;
5061
5062 case 'p':
5063 {
5064 int n;
5065
5066 /* Print N such that 2^N == X. */
5067 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5068 {
5069 output_operand_lossage ("invalid operand for '%%%c'", code);
5070 return;
5071 }
5072
5073 asm_fprintf (f, "%d", n);
5074 }
5075 break;
5076
5077 case 'P':
5078 /* Print the number of non-zero bits in X (a const_int). */
5079 if (!CONST_INT_P (x))
5080 {
5081 output_operand_lossage ("invalid operand for '%%%c'", code);
5082 return;
5083 }
5084
5085 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5086 break;
5087
5088 case 'H':
5089 /* Print the higher numbered register of a pair (TImode) of regs. */
5090 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5091 {
5092 output_operand_lossage ("invalid operand for '%%%c'", code);
5093 return;
5094 }
5095
5096 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5097 break;
5098
5099 case 'M':
5100 case 'm':
5101 {
5102 int cond_code;
5103 /* Print a condition (eq, ne, etc) or its inverse. */
5104
5105 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5106 if (x == const_true_rtx)
5107 {
5108 if (code == 'M')
5109 fputs ("nv", f);
5110 return;
5111 }
5112
5113 if (!COMPARISON_P (x))
5114 {
5115 output_operand_lossage ("invalid operand for '%%%c'", code);
5116 return;
5117 }
5118
5119 cond_code = aarch64_get_condition_code (x);
5120 gcc_assert (cond_code >= 0);
5121 if (code == 'M')
5122 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5123 fputs (aarch64_condition_codes[cond_code], f);
5124 }
5125 break;
5126
5127 case 'b':
5128 case 'h':
5129 case 's':
5130 case 'd':
5131 case 'q':
5132 /* Print a scalar FP/SIMD register name. */
5133 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5134 {
5135 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5136 return;
5137 }
5138 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5139 break;
5140
5141 case 'S':
5142 case 'T':
5143 case 'U':
5144 case 'V':
5145 /* Print the first FP/SIMD register name in a list. */
5146 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5147 {
5148 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5149 return;
5150 }
5151 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5152 break;
5153
5154 case 'R':
5155 /* Print a scalar FP/SIMD register name + 1. */
5156 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5157 {
5158 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5159 return;
5160 }
5161 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5162 break;
5163
5164 case 'X':
5165 /* Print bottom 16 bits of integer constant in hex. */
5166 if (!CONST_INT_P (x))
5167 {
5168 output_operand_lossage ("invalid operand for '%%%c'", code);
5169 return;
5170 }
5171 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5172 break;
5173
5174 case 'w':
5175 case 'x':
5176 /* Print a general register name or the zero register (32-bit or
5177 64-bit). */
5178 if (x == const0_rtx
5179 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5180 {
5181 asm_fprintf (f, "%czr", code);
5182 break;
5183 }
5184
5185 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5186 {
5187 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5188 break;
5189 }
5190
5191 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5192 {
5193 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5194 break;
5195 }
5196
5197 /* Fall through */
5198
5199 case 0:
5200 /* Print a normal operand. If it's a general register, then we
5201 assume DImode. */
5202 if (x == NULL)
5203 {
5204 output_operand_lossage ("missing operand");
5205 return;
5206 }
5207
5208 switch (GET_CODE (x))
5209 {
5210 case REG:
5211 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5212 break;
5213
5214 case MEM:
5215 output_address (GET_MODE (x), XEXP (x, 0));
5216 break;
5217
5218 case CONST:
5219 case LABEL_REF:
5220 case SYMBOL_REF:
5221 output_addr_const (asm_out_file, x);
5222 break;
5223
5224 case CONST_INT:
5225 asm_fprintf (f, "%wd", INTVAL (x));
5226 break;
5227
5228 case CONST_VECTOR:
5229 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5230 {
5231 gcc_assert (
5232 aarch64_const_vec_all_same_in_range_p (x,
5233 HOST_WIDE_INT_MIN,
5234 HOST_WIDE_INT_MAX));
5235 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5236 }
5237 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5238 {
5239 fputc ('0', f);
5240 }
5241 else
5242 gcc_unreachable ();
5243 break;
5244
5245 case CONST_DOUBLE:
5246 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5247 be getting CONST_DOUBLEs holding integers. */
5248 gcc_assert (GET_MODE (x) != VOIDmode);
5249 if (aarch64_float_const_zero_rtx_p (x))
5250 {
5251 fputc ('0', f);
5252 break;
5253 }
5254 else if (aarch64_float_const_representable_p (x))
5255 {
5256 #define buf_size 20
5257 char float_buf[buf_size] = {'\0'};
5258 real_to_decimal_for_mode (float_buf,
5259 CONST_DOUBLE_REAL_VALUE (x),
5260 buf_size, buf_size,
5261 1, GET_MODE (x));
5262 asm_fprintf (asm_out_file, "%s", float_buf);
5263 break;
5264 #undef buf_size
5265 }
5266 output_operand_lossage ("invalid constant");
5267 return;
5268 default:
5269 output_operand_lossage ("invalid operand");
5270 return;
5271 }
5272 break;
5273
5274 case 'A':
5275 if (GET_CODE (x) == HIGH)
5276 x = XEXP (x, 0);
5277
5278 switch (aarch64_classify_symbolic_expression (x))
5279 {
5280 case SYMBOL_SMALL_GOT_4G:
5281 asm_fprintf (asm_out_file, ":got:");
5282 break;
5283
5284 case SYMBOL_SMALL_TLSGD:
5285 asm_fprintf (asm_out_file, ":tlsgd:");
5286 break;
5287
5288 case SYMBOL_SMALL_TLSDESC:
5289 asm_fprintf (asm_out_file, ":tlsdesc:");
5290 break;
5291
5292 case SYMBOL_SMALL_TLSIE:
5293 asm_fprintf (asm_out_file, ":gottprel:");
5294 break;
5295
5296 case SYMBOL_TLSLE24:
5297 asm_fprintf (asm_out_file, ":tprel:");
5298 break;
5299
5300 case SYMBOL_TINY_GOT:
5301 gcc_unreachable ();
5302 break;
5303
5304 default:
5305 break;
5306 }
5307 output_addr_const (asm_out_file, x);
5308 break;
5309
5310 case 'L':
5311 switch (aarch64_classify_symbolic_expression (x))
5312 {
5313 case SYMBOL_SMALL_GOT_4G:
5314 asm_fprintf (asm_out_file, ":lo12:");
5315 break;
5316
5317 case SYMBOL_SMALL_TLSGD:
5318 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5319 break;
5320
5321 case SYMBOL_SMALL_TLSDESC:
5322 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5323 break;
5324
5325 case SYMBOL_SMALL_TLSIE:
5326 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5327 break;
5328
5329 case SYMBOL_TLSLE12:
5330 asm_fprintf (asm_out_file, ":tprel_lo12:");
5331 break;
5332
5333 case SYMBOL_TLSLE24:
5334 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5335 break;
5336
5337 case SYMBOL_TINY_GOT:
5338 asm_fprintf (asm_out_file, ":got:");
5339 break;
5340
5341 case SYMBOL_TINY_TLSIE:
5342 asm_fprintf (asm_out_file, ":gottprel:");
5343 break;
5344
5345 default:
5346 break;
5347 }
5348 output_addr_const (asm_out_file, x);
5349 break;
5350
5351 case 'G':
5352
5353 switch (aarch64_classify_symbolic_expression (x))
5354 {
5355 case SYMBOL_TLSLE24:
5356 asm_fprintf (asm_out_file, ":tprel_hi12:");
5357 break;
5358 default:
5359 break;
5360 }
5361 output_addr_const (asm_out_file, x);
5362 break;
5363
5364 case 'k':
5365 {
5366 HOST_WIDE_INT cond_code;
5367 /* Print nzcv. */
5368
5369 if (!CONST_INT_P (x))
5370 {
5371 output_operand_lossage ("invalid operand for '%%%c'", code);
5372 return;
5373 }
5374
5375 cond_code = INTVAL (x);
5376 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5377 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5378 }
5379 break;
5380
5381 default:
5382 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5383 return;
5384 }
5385 }
5386
5387 static void
5388 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5389 {
5390 struct aarch64_address_info addr;
5391
5392 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5393 switch (addr.type)
5394 {
5395 case ADDRESS_REG_IMM:
5396 if (addr.offset == const0_rtx)
5397 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5398 else
5399 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5400 INTVAL (addr.offset));
5401 return;
5402
5403 case ADDRESS_REG_REG:
5404 if (addr.shift == 0)
5405 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5406 reg_names [REGNO (addr.offset)]);
5407 else
5408 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5409 reg_names [REGNO (addr.offset)], addr.shift);
5410 return;
5411
5412 case ADDRESS_REG_UXTW:
5413 if (addr.shift == 0)
5414 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5415 REGNO (addr.offset) - R0_REGNUM);
5416 else
5417 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5418 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5419 return;
5420
5421 case ADDRESS_REG_SXTW:
5422 if (addr.shift == 0)
5423 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5424 REGNO (addr.offset) - R0_REGNUM);
5425 else
5426 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5427 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5428 return;
5429
5430 case ADDRESS_REG_WB:
5431 switch (GET_CODE (x))
5432 {
5433 case PRE_INC:
5434 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5435 GET_MODE_SIZE (mode));
5436 return;
5437 case POST_INC:
5438 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5439 GET_MODE_SIZE (mode));
5440 return;
5441 case PRE_DEC:
5442 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5443 GET_MODE_SIZE (mode));
5444 return;
5445 case POST_DEC:
5446 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5447 GET_MODE_SIZE (mode));
5448 return;
5449 case PRE_MODIFY:
5450 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5451 INTVAL (addr.offset));
5452 return;
5453 case POST_MODIFY:
5454 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5455 INTVAL (addr.offset));
5456 return;
5457 default:
5458 break;
5459 }
5460 break;
5461
5462 case ADDRESS_LO_SUM:
5463 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5464 output_addr_const (f, addr.offset);
5465 asm_fprintf (f, "]");
5466 return;
5467
5468 case ADDRESS_SYMBOLIC:
5469 break;
5470 }
5471
5472 output_addr_const (f, x);
5473 }
5474
5475 bool
5476 aarch64_label_mentioned_p (rtx x)
5477 {
5478 const char *fmt;
5479 int i;
5480
5481 if (GET_CODE (x) == LABEL_REF)
5482 return true;
5483
5484 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5485 referencing instruction, but they are constant offsets, not
5486 symbols. */
5487 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5488 return false;
5489
5490 fmt = GET_RTX_FORMAT (GET_CODE (x));
5491 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5492 {
5493 if (fmt[i] == 'E')
5494 {
5495 int j;
5496
5497 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5498 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5499 return 1;
5500 }
5501 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5502 return 1;
5503 }
5504
5505 return 0;
5506 }
5507
5508 /* Implement REGNO_REG_CLASS. */
5509
5510 enum reg_class
5511 aarch64_regno_regclass (unsigned regno)
5512 {
5513 if (GP_REGNUM_P (regno))
5514 return GENERAL_REGS;
5515
5516 if (regno == SP_REGNUM)
5517 return STACK_REG;
5518
5519 if (regno == FRAME_POINTER_REGNUM
5520 || regno == ARG_POINTER_REGNUM)
5521 return POINTER_REGS;
5522
5523 if (FP_REGNUM_P (regno))
5524 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5525
5526 return NO_REGS;
5527 }
5528
5529 static rtx
5530 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5531 {
5532 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5533 where mask is selected by alignment and size of the offset.
5534 We try to pick as large a range for the offset as possible to
5535 maximize the chance of a CSE. However, for aligned addresses
5536 we limit the range to 4k so that structures with different sized
5537 elements are likely to use the same base. We need to be careful
5538 not to split a CONST for some forms of address expression, otherwise
5539 it will generate sub-optimal code. */
5540
5541 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5542 {
5543 rtx base = XEXP (x, 0);
5544 rtx offset_rtx = XEXP (x, 1);
5545 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5546
5547 if (GET_CODE (base) == PLUS)
5548 {
5549 rtx op0 = XEXP (base, 0);
5550 rtx op1 = XEXP (base, 1);
5551
5552 /* Force any scaling into a temp for CSE. */
5553 op0 = force_reg (Pmode, op0);
5554 op1 = force_reg (Pmode, op1);
5555
5556 /* Let the pointer register be in op0. */
5557 if (REG_POINTER (op1))
5558 std::swap (op0, op1);
5559
5560 /* If the pointer is virtual or frame related, then we know that
5561 virtual register instantiation or register elimination is going
5562 to apply a second constant. We want the two constants folded
5563 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5564 if (virt_or_elim_regno_p (REGNO (op0)))
5565 {
5566 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5567 NULL_RTX, true, OPTAB_DIRECT);
5568 return gen_rtx_PLUS (Pmode, base, op1);
5569 }
5570
5571 /* Otherwise, in order to encourage CSE (and thence loop strength
5572 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5573 base = expand_binop (Pmode, add_optab, op0, op1,
5574 NULL_RTX, true, OPTAB_DIRECT);
5575 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5576 }
5577
5578 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5579 HOST_WIDE_INT base_offset;
5580 if (GET_MODE_SIZE (mode) > 16)
5581 base_offset = (offset + 0x400) & ~0x7f0;
5582 /* For offsets that aren't a multiple of the access size, the limit is
5583 -256...255. */
5584 else if (offset & (GET_MODE_SIZE (mode) - 1))
5585 {
5586 base_offset = (offset + 0x100) & ~0x1ff;
5587
5588 /* BLKmode typically uses LDP of X-registers. */
5589 if (mode == BLKmode)
5590 base_offset = (offset + 512) & ~0x3ff;
5591 }
5592 /* Small negative offsets are supported. */
5593 else if (IN_RANGE (offset, -256, 0))
5594 base_offset = 0;
5595 else if (mode == TImode || mode == TFmode)
5596 base_offset = (offset + 0x100) & ~0x1ff;
5597 /* Otherwise use an unsigned 12-bit offset scaled by the access size. */
5598 else
5599 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5600
5601 if (base_offset != 0)
5602 {
5603 base = plus_constant (Pmode, base, base_offset);
5604 base = force_operand (base, NULL_RTX);
5605 return plus_constant (Pmode, base, offset - base_offset);
5606 }
5607 }
5608
5609 return x;
5610 }
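/* Illustrative example of the splitting above (editorial, not from the
   original source): with MODE == SImode and X == (plus (reg X1)
   (const_int 0x13004)), none of the earlier special cases apply, so
   base_offset = 0x13004 & ~0x3fff = 0x10000 and we return
   (plus (X1 + 0x10000) (const_int 0x3004)). The residual 0x3004 is a
   multiple of 4 and fits the scaled unsigned 12-bit LDR/STR offset range
   (0..16380 for 4-byte accesses), while the anchor X1 + 0x10000 can be
   CSEd across neighbouring accesses. */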
5611
5612 /* Return the reload icode required for a constant pool entry of mode MODE. */
5613 static enum insn_code
5614 aarch64_constant_pool_reload_icode (machine_mode mode)
5615 {
5616 switch (mode)
5617 {
5618 case SFmode:
5619 return CODE_FOR_aarch64_reload_movcpsfdi;
5620
5621 case DFmode:
5622 return CODE_FOR_aarch64_reload_movcpdfdi;
5623
5624 case TFmode:
5625 return CODE_FOR_aarch64_reload_movcptfdi;
5626
5627 case V8QImode:
5628 return CODE_FOR_aarch64_reload_movcpv8qidi;
5629
5630 case V16QImode:
5631 return CODE_FOR_aarch64_reload_movcpv16qidi;
5632
5633 case V4HImode:
5634 return CODE_FOR_aarch64_reload_movcpv4hidi;
5635
5636 case V8HImode:
5637 return CODE_FOR_aarch64_reload_movcpv8hidi;
5638
5639 case V2SImode:
5640 return CODE_FOR_aarch64_reload_movcpv2sidi;
5641
5642 case V4SImode:
5643 return CODE_FOR_aarch64_reload_movcpv4sidi;
5644
5645 case V2DImode:
5646 return CODE_FOR_aarch64_reload_movcpv2didi;
5647
5648 case V2DFmode:
5649 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5650
5651 default:
5652 gcc_unreachable ();
5653 }
5654
5655 gcc_unreachable ();
5656 }
5657 static reg_class_t
5658 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5659 reg_class_t rclass,
5660 machine_mode mode,
5661 secondary_reload_info *sri)
5662 {
5663
5664 /* If we have to disable direct literal pool loads and stores because the
5665 function is too big, then we need a scratch register. */
5666 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5667 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5668 || targetm.vector_mode_supported_p (GET_MODE (x)))
5669 && !aarch64_pcrelative_literal_loads)
5670 {
5671 sri->icode = aarch64_constant_pool_reload_icode (mode);
5672 return NO_REGS;
5673 }
5674
5675 /* Without the TARGET_SIMD instructions we cannot move a Q register
5676 to a Q register directly. We need a scratch. */
5677 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5678 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5679 && reg_class_subset_p (rclass, FP_REGS))
5680 {
5681 if (mode == TFmode)
5682 sri->icode = CODE_FOR_aarch64_reload_movtf;
5683 else if (mode == TImode)
5684 sri->icode = CODE_FOR_aarch64_reload_movti;
5685 return NO_REGS;
5686 }
5687
5688 /* A TFmode or TImode memory access should be handled via an FP_REG
5689 because AArch64 has richer addressing modes for LDR/STR instructions
5690 than for LDP/STP instructions. */
5691 if (TARGET_FLOAT && rclass == GENERAL_REGS
5692 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5693 return FP_REGS;
5694
5695 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5696 return GENERAL_REGS;
5697
5698 return NO_REGS;
5699 }
5700
5701 static bool
5702 aarch64_can_eliminate (const int from, const int to)
5703 {
5704 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5705 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5706
5707 if (frame_pointer_needed)
5708 {
5709 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5710 return true;
5711 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5712 return false;
5713 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5714 && !cfun->calls_alloca)
5715 return true;
5716 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5717 return true;
5718
5719 return false;
5720 }
5721 else
5722 {
5723 /* If we decided that we didn't need a leaf frame pointer but then used
5724 LR in the function, then we'll want a frame pointer after all, so
5725 prevent this elimination to ensure a frame pointer is used. */
5726 if (to == STACK_POINTER_REGNUM
5727 && flag_omit_leaf_frame_pointer
5728 && df_regs_ever_live_p (LR_REGNUM))
5729 return false;
5730 }
5731
5732 return true;
5733 }
5734
5735 HOST_WIDE_INT
5736 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5737 {
5738 aarch64_layout_frame ();
5739
5740 if (to == HARD_FRAME_POINTER_REGNUM)
5741 {
5742 if (from == ARG_POINTER_REGNUM)
5743 return cfun->machine->frame.hard_fp_offset;
5744
5745 if (from == FRAME_POINTER_REGNUM)
5746 return cfun->machine->frame.hard_fp_offset
5747 - cfun->machine->frame.locals_offset;
5748 }
5749
5750 if (to == STACK_POINTER_REGNUM)
5751 {
5752 if (from == FRAME_POINTER_REGNUM)
5753 return cfun->machine->frame.frame_size
5754 - cfun->machine->frame.locals_offset;
5755 }
5756
5757 return cfun->machine->frame.frame_size;
5758 }
5759
5760 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5761 previous frame. */
5762
5763 rtx
5764 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5765 {
5766 if (count != 0)
5767 return const0_rtx;
5768 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5769 }
5770
5771
5772 static void
5773 aarch64_asm_trampoline_template (FILE *f)
5774 {
5775 if (TARGET_ILP32)
5776 {
5777 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5778 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5779 }
5780 else
5781 {
5782 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5783 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5784 }
5785 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5786 assemble_aligned_integer (4, const0_rtx);
5787 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5788 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5789 }
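/* For illustration (editorial; assumes the usual LP64 register assignment
   where IP1 is x17 and the static chain register is x18): the template
   above expands to roughly

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// pad the code to 16 bytes
	.xword	0		// patched with the function address
	.xword	0		// patched with the static chain

   aarch64_trampoline_init below fills in the two trailing pointer-sized
   slots at offsets 16 and 24. */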
5790
5791 static void
5792 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5793 {
5794 rtx fnaddr, mem, a_tramp;
5795 const int tramp_code_sz = 16;
5796
5797 /* We don't need to copy the trailing D-words; we fill those in below. */
5798 emit_block_move (m_tramp, assemble_trampoline_template (),
5799 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5800 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5801 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5802 if (GET_MODE (fnaddr) != ptr_mode)
5803 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5804 emit_move_insn (mem, fnaddr);
5805
5806 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5807 emit_move_insn (mem, chain_value);
5808
5809 /* XXX We should really define a "clear_cache" pattern and use
5810 gen_clear_cache(). */
5811 a_tramp = XEXP (m_tramp, 0);
5812 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5813 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5814 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5815 ptr_mode);
5816 }
5817
5818 static unsigned char
5819 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5820 {
5821 switch (regclass)
5822 {
5823 case CALLER_SAVE_REGS:
5824 case POINTER_REGS:
5825 case GENERAL_REGS:
5826 case ALL_REGS:
5827 case FP_REGS:
5828 case FP_LO_REGS:
5829 return
5830 aarch64_vector_mode_p (mode)
5831 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5832 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5833 case STACK_REG:
5834 return 1;
5835
5836 case NO_REGS:
5837 return 0;
5838
5839 default:
5840 break;
5841 }
5842 gcc_unreachable ();
5843 }
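/* Worked example (illustrative, assuming LP64 with UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16): a TImode value in GENERAL_REGS needs
   (16 + 8 - 1) / 8 == 2 X-registers, whereas a V4SImode value in FP_REGS
   is a vector mode and needs (16 + 16 - 1) / 16 == 1 Q-register. */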
5844
5845 static reg_class_t
5846 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5847 {
5848 if (regclass == POINTER_REGS)
5849 return GENERAL_REGS;
5850
5851 if (regclass == STACK_REG)
5852 {
5853 if (REG_P(x)
5854 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5855 return regclass;
5856
5857 return NO_REGS;
5858 }
5859
5860 /* If it's an integer immediate that MOVI can't handle, then
5861 FP_REGS is not an option, so we return NO_REGS instead. */
5862 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5863 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5864 return NO_REGS;
5865
5866 /* Register elimination can result in a request for
5867 SP+constant->FP_REGS. We cannot support such operations, which
5868 use SP as source and an FP_REG as destination, so reject them
5869 outright now. */
5870 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5871 {
5872 rtx lhs = XEXP (x, 0);
5873
5874 /* Look through a possible SUBREG introduced by ILP32. */
5875 if (GET_CODE (lhs) == SUBREG)
5876 lhs = SUBREG_REG (lhs);
5877
5878 gcc_assert (REG_P (lhs));
5879 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5880 POINTER_REGS));
5881 return NO_REGS;
5882 }
5883
5884 return regclass;
5885 }
5886
5887 void
5888 aarch64_asm_output_labelref (FILE* f, const char *name)
5889 {
5890 asm_fprintf (f, "%U%s", name);
5891 }
5892
5893 static void
5894 aarch64_elf_asm_constructor (rtx symbol, int priority)
5895 {
5896 if (priority == DEFAULT_INIT_PRIORITY)
5897 default_ctor_section_asm_out_constructor (symbol, priority);
5898 else
5899 {
5900 section *s;
5901 /* Although PRIORITY is known to be in the range [0, 65535], and so
5902 18 bytes would be enough, the compiler might not know that. To avoid
5903 a -Wformat-truncation false positive, use a larger size. */
5904 char buf[23];
5905 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5906 s = get_section (buf, SECTION_WRITE, NULL);
5907 switch_to_section (s);
5908 assemble_align (POINTER_SIZE);
5909 assemble_aligned_integer (POINTER_BYTES, symbol);
5910 }
5911 }
5912
5913 static void
5914 aarch64_elf_asm_destructor (rtx symbol, int priority)
5915 {
5916 if (priority == DEFAULT_INIT_PRIORITY)
5917 default_dtor_section_asm_out_destructor (symbol, priority);
5918 else
5919 {
5920 section *s;
5921 /* Although PRIORITY is known to be in the range [0, 65535], and so
5922 18 bytes would be enough, the compiler might not know that. To avoid
5923 a -Wformat-truncation false positive, use a larger size. */
5924 char buf[23];
5925 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5926 s = get_section (buf, SECTION_WRITE, NULL);
5927 switch_to_section (s);
5928 assemble_align (POINTER_SIZE);
5929 assemble_aligned_integer (POINTER_BYTES, symbol);
5930 }
5931 }
5932
5933 const char*
5934 aarch64_output_casesi (rtx *operands)
5935 {
5936 char buf[100];
5937 char label[100];
5938 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5939 int index;
5940 static const char *const patterns[4][2] =
5941 {
5942 {
5943 "ldrb\t%w3, [%0,%w1,uxtw]",
5944 "add\t%3, %4, %w3, sxtb #2"
5945 },
5946 {
5947 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5948 "add\t%3, %4, %w3, sxth #2"
5949 },
5950 {
5951 "ldr\t%w3, [%0,%w1,uxtw #2]",
5952 "add\t%3, %4, %w3, sxtw #2"
5953 },
5954 /* We assume that DImode is only generated when not optimizing and
5955 that we don't really need 64-bit address offsets. That would
5956 imply an object file with 8GB of code in a single function! */
5957 {
5958 "ldr\t%w3, [%0,%w1,uxtw #2]",
5959 "add\t%3, %4, %w3, sxtw #2"
5960 }
5961 };
5962
5963 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5964
5965 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5966
5967 gcc_assert (index >= 0 && index <= 3);
5968
5969 /* Need to implement table size reduction, by changing the code below. */
5970 output_asm_insn (patterns[index][0], operands);
5971 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5972 snprintf (buf, sizeof (buf),
5973 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5974 output_asm_insn (buf, operands);
5975 output_asm_insn (patterns[index][1], operands);
5976 output_asm_insn ("br\t%3", operands);
5977 assemble_label (asm_out_file, label);
5978 return "";
5979 }
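/* For illustration (hypothetical operand assignment, not from the source):
   with a HImode dispatch table, index 1 above selects the second pattern
   pair and the emitted sequence looks roughly like

	ldrh	w3, [x0, w1, uxtw #1]	// load the scaled table entry
	adr	x4, .LrtxN		// base of the table
	add	x3, x4, w3, sxth #2	// entries are label differences / 4
	br	x3
   .LrtxN:

   where x0 holds the table address, w1 the index, and x3/x4 are the
   scratch operands 3 and 4. */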
5980
5981
5982 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5983 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5984 operator. */
5985
5986 int
5987 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5988 {
5989 if (shift >= 0 && shift <= 3)
5990 {
5991 int size;
5992 for (size = 8; size <= 32; size *= 2)
5993 {
5994 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5995 if (mask == bits << shift)
5996 return size;
5997 }
5998 }
5999 return 0;
6000 }
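/* Illustrative values: aarch64_uxt_size (1, 0x1fe) == 8, since 0x1fe is
   0xff shifted left by one (a UXTB operand); aarch64_uxt_size (0, 0xffff)
   == 16 (UXTH); aarch64_uxt_size (2, 0xff) == 0, because 0xff is not
   0xff, 0xffff or 0xffffffff shifted left by two. */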
6001
6002 /* Constant pools are per-function only when PC-relative
6003 literal loads are enabled or we are using the large memory
6004 model. */
6005
6006 static inline bool
6007 aarch64_can_use_per_function_literal_pools_p (void)
6008 {
6009 return (aarch64_pcrelative_literal_loads
6010 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6011 }
6012
6013 static bool
6014 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6015 {
6016 /* FIXME: In an ideal world this would work similarly
6017 to the logic in aarch64_select_rtx_section, but this
6018 breaks bootstrap in gccgo. For now we work around
6019 this by returning false here. */
6020 return false;
6021 }
6022
6023 /* Select appropriate section for constants depending
6024 on where we place literal pools. */
6025
6026 static section *
6027 aarch64_select_rtx_section (machine_mode mode,
6028 rtx x,
6029 unsigned HOST_WIDE_INT align)
6030 {
6031 if (aarch64_can_use_per_function_literal_pools_p ())
6032 return function_section (current_function_decl);
6033
6034 return default_elf_select_rtx_section (mode, x, align);
6035 }
6036
6037 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6038 void
6039 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6040 HOST_WIDE_INT offset)
6041 {
6042 /* When using per-function literal pools, we must ensure that any code
6043 section is aligned to the minimal instruction length, lest we get
6044 errors from the assembler re "unaligned instructions". */
6045 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6046 ASM_OUTPUT_ALIGN (f, 2);
6047 }
6048
6049 /* Costs. */
6050
6051 /* Helper function for rtx cost calculation. Strip a shift expression
6052 from X. Returns the inner operand if successful, or the original
6053 expression on failure. */
6054 static rtx
6055 aarch64_strip_shift (rtx x)
6056 {
6057 rtx op = x;
6058
6059 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6060 we can convert both to ROR during final output. */
6061 if ((GET_CODE (op) == ASHIFT
6062 || GET_CODE (op) == ASHIFTRT
6063 || GET_CODE (op) == LSHIFTRT
6064 || GET_CODE (op) == ROTATERT
6065 || GET_CODE (op) == ROTATE)
6066 && CONST_INT_P (XEXP (op, 1)))
6067 return XEXP (op, 0);
6068
6069 if (GET_CODE (op) == MULT
6070 && CONST_INT_P (XEXP (op, 1))
6071 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6072 return XEXP (op, 0);
6073
6074 return x;
6075 }
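/* Examples (illustrative): (ashift (reg X1) (const_int 3)) and
   (mult (reg X1) (const_int 8)) both strip to (reg X1), since a multiply
   by a power of two is costed as a shift; (ashift (reg X1) (reg X2)) is
   returned unchanged because the shift amount is not constant. */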
6076
6077 /* Helper function for rtx cost calculation. Strip an extend
6078 expression from X. Returns the inner operand if successful, or the
6079 original expression on failure. We deal with a number of possible
6080 canonicalization variations here. */
6081 static rtx
6082 aarch64_strip_extend (rtx x)
6083 {
6084 rtx op = x;
6085
6086 /* Zero and sign extraction of a widened value. */
6087 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6088 && XEXP (op, 2) == const0_rtx
6089 && GET_CODE (XEXP (op, 0)) == MULT
6090 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6091 XEXP (op, 1)))
6092 return XEXP (XEXP (op, 0), 0);
6093
6094 /* It can also be represented (for zero-extend) as an AND with an
6095 immediate. */
6096 if (GET_CODE (op) == AND
6097 && GET_CODE (XEXP (op, 0)) == MULT
6098 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6099 && CONST_INT_P (XEXP (op, 1))
6100 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6101 INTVAL (XEXP (op, 1))) != 0)
6102 return XEXP (XEXP (op, 0), 0);
6103
6104 /* Now handle extended register, as this may also have an optional
6105 left shift by 1..4. */
6106 if (GET_CODE (op) == ASHIFT
6107 && CONST_INT_P (XEXP (op, 1))
6108 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6109 op = XEXP (op, 0);
6110
6111 if (GET_CODE (op) == ZERO_EXTEND
6112 || GET_CODE (op) == SIGN_EXTEND)
6113 op = XEXP (op, 0);
6114
6115 if (op != x)
6116 return op;
6117
6118 return x;
6119 }
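/* Examples (illustrative): (zero_extend:DI (reg:SI W1)) strips to the
   inner register, as does (ashift:DI (sign_extend:DI (reg:SI W1))
   (const_int 2)), which is an extended register with a left shift of
   1..4. A shift amount of 5 or more falls outside the extended-register
   form, so such an expression is returned unchanged. */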
6120
6121 /* Return true iff CODE is a shift supported in combination
6122 with arithmetic instructions. */
6123
6124 static bool
6125 aarch64_shift_p (enum rtx_code code)
6126 {
6127 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6128 }
6129
6130 /* Helper function for rtx cost calculation. Calculate the cost of
6131 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6132 Return the calculated cost of the expression, recursing manually into
6133 operands where needed. */
6134
6135 static int
6136 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6137 {
6138 rtx op0, op1;
6139 const struct cpu_cost_table *extra_cost
6140 = aarch64_tune_params.insn_extra_cost;
6141 int cost = 0;
6142 bool compound_p = (outer == PLUS || outer == MINUS);
6143 machine_mode mode = GET_MODE (x);
6144
6145 gcc_checking_assert (code == MULT);
6146
6147 op0 = XEXP (x, 0);
6148 op1 = XEXP (x, 1);
6149
6150 if (VECTOR_MODE_P (mode))
6151 mode = GET_MODE_INNER (mode);
6152
6153 /* Integer multiply/fma. */
6154 if (GET_MODE_CLASS (mode) == MODE_INT)
6155 {
6156 /* The multiply will be canonicalized as a shift, so cost it as such. */
6157 if (aarch64_shift_p (GET_CODE (x))
6158 || (CONST_INT_P (op1)
6159 && exact_log2 (INTVAL (op1)) > 0))
6160 {
6161 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6162 || GET_CODE (op0) == SIGN_EXTEND;
6163 if (speed)
6164 {
6165 if (compound_p)
6166 {
6167 if (REG_P (op1))
6168 /* ARITH + shift-by-register. */
6169 cost += extra_cost->alu.arith_shift_reg;
6170 else if (is_extend)
6171 /* ARITH + extended register. We don't have a cost field
6172 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6173 cost += extra_cost->alu.extend_arith;
6174 else
6175 /* ARITH + shift-by-immediate. */
6176 cost += extra_cost->alu.arith_shift;
6177 }
6178 else
6179 /* LSL (immediate). */
6180 cost += extra_cost->alu.shift;
6181
6182 }
6183 /* Strip extends as we will have costed them in the case above. */
6184 if (is_extend)
6185 op0 = aarch64_strip_extend (op0);
6186
6187 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6188
6189 return cost;
6190 }
6191
6192 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6193 compound and let the below cases handle it. After all, MNEG is a
6194 special-case alias of MSUB. */
6195 if (GET_CODE (op0) == NEG)
6196 {
6197 op0 = XEXP (op0, 0);
6198 compound_p = true;
6199 }
6200
6201 /* Integer multiplies or FMAs have zero/sign extending variants. */
6202 if ((GET_CODE (op0) == ZERO_EXTEND
6203 && GET_CODE (op1) == ZERO_EXTEND)
6204 || (GET_CODE (op0) == SIGN_EXTEND
6205 && GET_CODE (op1) == SIGN_EXTEND))
6206 {
6207 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6208 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6209
6210 if (speed)
6211 {
6212 if (compound_p)
6213 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6214 cost += extra_cost->mult[0].extend_add;
6215 else
6216 /* MUL/SMULL/UMULL. */
6217 cost += extra_cost->mult[0].extend;
6218 }
6219
6220 return cost;
6221 }
6222
6223 /* This is either an integer multiply or a MADD. In both cases
6224 we want to recurse and cost the operands. */
6225 cost += rtx_cost (op0, mode, MULT, 0, speed);
6226 cost += rtx_cost (op1, mode, MULT, 1, speed);
6227
6228 if (speed)
6229 {
6230 if (compound_p)
6231 /* MADD/MSUB. */
6232 cost += extra_cost->mult[mode == DImode].add;
6233 else
6234 /* MUL. */
6235 cost += extra_cost->mult[mode == DImode].simple;
6236 }
6237
6238 return cost;
6239 }
6240 else
6241 {
6242 if (speed)
6243 {
6244 /* Floating-point FMA/FMUL can also support negations of the
6245 operands, unless the rounding mode is upward or downward, in
6246 which case FNMUL is different from FMUL with operand negation. */
6247 bool neg0 = GET_CODE (op0) == NEG;
6248 bool neg1 = GET_CODE (op1) == NEG;
6249 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6250 {
6251 if (neg0)
6252 op0 = XEXP (op0, 0);
6253 if (neg1)
6254 op1 = XEXP (op1, 0);
6255 }
6256
6257 if (compound_p)
6258 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6259 cost += extra_cost->fp[mode == DFmode].fma;
6260 else
6261 /* FMUL/FNMUL. */
6262 cost += extra_cost->fp[mode == DFmode].mult;
6263 }
6264
6265 cost += rtx_cost (op0, mode, MULT, 0, speed);
6266 cost += rtx_cost (op1, mode, MULT, 1, speed);
6267 return cost;
6268 }
6269 }
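/* A descriptive note on the costing above (editorial, for illustration):
   a bare (mult (reg) (reg)) in an integer mode is costed as MUL, while
   the same multiply appearing under an outer PLUS or MINUS is costed as
   the MADD/MSUB accumulate form; (mult (reg) (const_int 4)) is costed as
   a shift (LSL #2), or as an ARITH + shift or extended-register operation
   when it is part of a compound PLUS/MINUS. */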
6270
6271 static int
6272 aarch64_address_cost (rtx x,
6273 machine_mode mode,
6274 addr_space_t as ATTRIBUTE_UNUSED,
6275 bool speed)
6276 {
6277 enum rtx_code c = GET_CODE (x);
6278 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6279 struct aarch64_address_info info;
6280 int cost = 0;
6281 info.shift = 0;
6282
6283 if (!aarch64_classify_address (&info, x, mode, c, false))
6284 {
6285 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6286 {
6287 /* This is a CONST or SYMBOL ref which will be split
6288 in a different way depending on the code model in use.
6289 Cost it through the generic infrastructure. */
6290 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6291 /* Divide through by the cost of one instruction to
6292 bring it to the same units as the address costs. */
6293 cost_symbol_ref /= COSTS_N_INSNS (1);
6294 /* The cost is then the cost of preparing the address,
6295 followed by an immediate (possibly 0) offset. */
6296 return cost_symbol_ref + addr_cost->imm_offset;
6297 }
6298 else
6299 {
6300 /* This is most likely a jump table from a case
6301 statement. */
6302 return addr_cost->register_offset;
6303 }
6304 }
6305
6306 switch (info.type)
6307 {
6308 case ADDRESS_LO_SUM:
6309 case ADDRESS_SYMBOLIC:
6310 case ADDRESS_REG_IMM:
6311 cost += addr_cost->imm_offset;
6312 break;
6313
6314 case ADDRESS_REG_WB:
6315 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6316 cost += addr_cost->pre_modify;
6317 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6318 cost += addr_cost->post_modify;
6319 else
6320 gcc_unreachable ();
6321
6322 break;
6323
6324 case ADDRESS_REG_REG:
6325 cost += addr_cost->register_offset;
6326 break;
6327
6328 case ADDRESS_REG_SXTW:
6329 cost += addr_cost->register_sextend;
6330 break;
6331
6332 case ADDRESS_REG_UXTW:
6333 cost += addr_cost->register_zextend;
6334 break;
6335
6336 default:
6337 gcc_unreachable ();
6338 }
6339
6340
6341 if (info.shift > 0)
6342 {
6343 /* For the sake of calculating the cost of the shifted register
6344 component, we can treat same sized modes in the same way. */
6345 switch (GET_MODE_BITSIZE (mode))
6346 {
6347 case 16:
6348 cost += addr_cost->addr_scale_costs.hi;
6349 break;
6350
6351 case 32:
6352 cost += addr_cost->addr_scale_costs.si;
6353 break;
6354
6355 case 64:
6356 cost += addr_cost->addr_scale_costs.di;
6357 break;
6358
6359 /* We can't tell, or this is a 128-bit vector. */
6360 default:
6361 cost += addr_cost->addr_scale_costs.ti;
6362 break;
6363 }
6364 }
6365
6366 return cost;
6367 }
6368
6369 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6370 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6371 to be taken. */
6372
6373 int
6374 aarch64_branch_cost (bool speed_p, bool predictable_p)
6375 {
6376 /* When optimizing for speed, use the cost of unpredictable branches. */
6377 const struct cpu_branch_cost *branch_costs =
6378 aarch64_tune_params.branch_costs;
6379
6380 if (!speed_p || predictable_p)
6381 return branch_costs->predictable;
6382 else
6383 return branch_costs->unpredictable;
6384 }
6385
6386 /* Return true if the RTX X in mode MODE is a zero or sign extract
6387 usable in an ADD or SUB (extended register) instruction. */
6388 static bool
6389 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6390 {
6391 /* Catch add with a sign extract.
6392 This is add_<optab><mode>_multp2. */
6393 if (GET_CODE (x) == SIGN_EXTRACT
6394 || GET_CODE (x) == ZERO_EXTRACT)
6395 {
6396 rtx op0 = XEXP (x, 0);
6397 rtx op1 = XEXP (x, 1);
6398 rtx op2 = XEXP (x, 2);
6399
6400 if (GET_CODE (op0) == MULT
6401 && CONST_INT_P (op1)
6402 && op2 == const0_rtx
6403 && CONST_INT_P (XEXP (op0, 1))
6404 && aarch64_is_extend_from_extract (mode,
6405 XEXP (op0, 1),
6406 op1))
6407 {
6408 return true;
6409 }
6410 }
6411 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6412 No shift. */
6413 else if (GET_CODE (x) == SIGN_EXTEND
6414 || GET_CODE (x) == ZERO_EXTEND)
6415 return REG_P (XEXP (x, 0));
6416
6417 return false;
6418 }
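/* Examples (illustrative): (sign_extend:DI (reg:SI W1)) returns true and
   corresponds to the extended-register form ADD Xd, Xn, Wm, SXTW. The
   extract form also matches e.g. (sign_extract:DI (mult:DI (reg:DI)
   (const_int 4)) (const_int 34) (const_int 0)), where the extract width
   encodes the extend size plus the shift amount (32 + 2 here, i.e.
   SXTW with LSL #2). */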
6419
6420 static bool
6421 aarch64_frint_unspec_p (unsigned int u)
6422 {
6423 switch (u)
6424 {
6425 case UNSPEC_FRINTZ:
6426 case UNSPEC_FRINTP:
6427 case UNSPEC_FRINTM:
6428 case UNSPEC_FRINTA:
6429 case UNSPEC_FRINTN:
6430 case UNSPEC_FRINTX:
6431 case UNSPEC_FRINTI:
6432 return true;
6433
6434 default:
6435 return false;
6436 }
6437 }
6438
6439 /* Return true iff X is an rtx that will match an extr instruction
6440 i.e. as described in the *extr<mode>5_insn family of patterns.
6441 OP0 and OP1 will be set to the operands of the shifts involved
6442 on success and will be NULL_RTX otherwise. */
6443
6444 static bool
6445 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6446 {
6447 rtx op0, op1;
6448 machine_mode mode = GET_MODE (x);
6449
6450 *res_op0 = NULL_RTX;
6451 *res_op1 = NULL_RTX;
6452
6453 if (GET_CODE (x) != IOR)
6454 return false;
6455
6456 op0 = XEXP (x, 0);
6457 op1 = XEXP (x, 1);
6458
6459 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6460 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6461 {
6462 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6463 if (GET_CODE (op1) == ASHIFT)
6464 std::swap (op0, op1);
6465
6466 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6467 return false;
6468
6469 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6470 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6471
6472 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6473 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6474 {
6475 *res_op0 = XEXP (op0, 0);
6476 *res_op1 = XEXP (op1, 0);
6477 return true;
6478 }
6479 }
6480
6481 return false;
6482 }
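/* Illustrative match: in DImode, (ior (ashift (reg X1) (const_int 48))
   (lshiftrt (reg X2) (const_int 16))) satisfies 48 + 16 == 64, so
   *res_op0 is set to X1 and *res_op1 to X2; the whole expression can be
   emitted as a single EXTR Xd, X1, X2, #16. */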
6483
6484 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6485 storing it in *COST. Result is true if the total cost of the operation
6486 has now been calculated. */
6487 static bool
6488 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6489 {
6490 rtx inner;
6491 rtx comparator;
6492 enum rtx_code cmpcode;
6493
6494 if (COMPARISON_P (op0))
6495 {
6496 inner = XEXP (op0, 0);
6497 comparator = XEXP (op0, 1);
6498 cmpcode = GET_CODE (op0);
6499 }
6500 else
6501 {
6502 inner = op0;
6503 comparator = const0_rtx;
6504 cmpcode = NE;
6505 }
6506
6507 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6508 {
6509 /* Conditional branch. */
6510 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6511 return true;
6512 else
6513 {
6514 if (cmpcode == NE || cmpcode == EQ)
6515 {
6516 if (comparator == const0_rtx)
6517 {
6518 /* TBZ/TBNZ/CBZ/CBNZ. */
6519 if (GET_CODE (inner) == ZERO_EXTRACT)
6520 /* TBZ/TBNZ. */
6521 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6522 ZERO_EXTRACT, 0, speed);
6523 else
6524 /* CBZ/CBNZ. */
6525 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6526
6527 return true;
6528 }
6529 }
6530 else if (cmpcode == LT || cmpcode == GE)
6531 {
6532 /* TBZ/TBNZ. */
6533 if (comparator == const0_rtx)
6534 return true;
6535 }
6536 }
6537 }
6538 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6539 {
6540 /* CCMP. */
6541 if (GET_CODE (op1) == COMPARE)
6542 {
6543 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6544 if (XEXP (op1, 1) == const0_rtx)
6545 *cost += 1;
6546 if (speed)
6547 {
6548 machine_mode mode = GET_MODE (XEXP (op1, 0));
6549 const struct cpu_cost_table *extra_cost
6550 = aarch64_tune_params.insn_extra_cost;
6551
6552 if (GET_MODE_CLASS (mode) == MODE_INT)
6553 *cost += extra_cost->alu.arith;
6554 else
6555 *cost += extra_cost->fp[mode == DFmode].compare;
6556 }
6557 return true;
6558 }
6559
6560 /* It's a conditional operation based on the status flags,
6561 so it must be some flavor of CSEL. */
6562
6563 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6564 if (GET_CODE (op1) == NEG
6565 || GET_CODE (op1) == NOT
6566 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6567 op1 = XEXP (op1, 0);
6568 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6569 {
6570 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6571 op1 = XEXP (op1, 0);
6572 op2 = XEXP (op2, 0);
6573 }
6574
6575 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6576 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6577 return true;
6578 }
6579
6580 /* We don't know what this is, cost all operands. */
6581 return false;
6582 }
6583
6584 /* Check whether X is a bitfield operation of the form shift + extend that
6585 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6586 operand to which the bitfield operation is applied. Otherwise return
6587 NULL_RTX. */
6588
6589 static rtx
6590 aarch64_extend_bitfield_pattern_p (rtx x)
6591 {
6592 rtx_code outer_code = GET_CODE (x);
6593 machine_mode outer_mode = GET_MODE (x);
6594
6595 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6596 && outer_mode != SImode && outer_mode != DImode)
6597 return NULL_RTX;
6598
6599 rtx inner = XEXP (x, 0);
6600 rtx_code inner_code = GET_CODE (inner);
6601 machine_mode inner_mode = GET_MODE (inner);
6602 rtx op = NULL_RTX;
6603
6604 switch (inner_code)
6605 {
6606 case ASHIFT:
6607 if (CONST_INT_P (XEXP (inner, 1))
6608 && (inner_mode == QImode || inner_mode == HImode))
6609 op = XEXP (inner, 0);
6610 break;
6611 case LSHIFTRT:
6612 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6613 && (inner_mode == QImode || inner_mode == HImode))
6614 op = XEXP (inner, 0);
6615 break;
6616 case ASHIFTRT:
6617 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6618 && (inner_mode == QImode || inner_mode == HImode))
6619 op = XEXP (inner, 0);
6620 break;
6621 default:
6622 break;
6623 }
6624
6625 return op;
6626 }
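/* Examples (illustrative): (zero_extend:SI (lshiftrt:HI (reg:HI)
   (const_int 3))) returns the inner register and corresponds to a UBFX,
   while (sign_extend:DI (ashift:QI (reg:QI) (const_int 2))) corresponds
   to an SBFIZ-style bitfield insert-in-zero. */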
6627
6628 /* Return true if the mask and a shift amount from an RTX of the form
6629 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6630 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6631
6632 bool
6633 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6634 {
6635 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6636 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6637 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6638 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6639 }
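/* Worked example (illustrative): for SImode with MASK == 0xff0 and
   SHFT_AMNT == 4, (0xff0 >> 4) + 1 == 0x100 is a power of two and no mask
   bits lie below bit 4, so the combination is accepted and corresponds to
   UBFIZ Wd, Wn, #4, #8. A mask of 0xff1 would be rejected by the final
   low-bits check. */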
6640
6641 /* Calculate the cost of calculating X, storing it in *COST. Result
6642 is true if the total cost of the operation has now been calculated. */
6643 static bool
6644 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6645 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6646 {
6647 rtx op0, op1, op2;
6648 const struct cpu_cost_table *extra_cost
6649 = aarch64_tune_params.insn_extra_cost;
6650 int code = GET_CODE (x);
6651
6652 /* By default, assume that everything has equivalent cost to the
6653 cheapest instruction. Any additional costs are applied as a delta
6654 above this default. */
6655 *cost = COSTS_N_INSNS (1);
6656
6657 switch (code)
6658 {
6659 case SET:
6660 /* The cost depends entirely on the operands to SET. */
6661 *cost = 0;
6662 op0 = SET_DEST (x);
6663 op1 = SET_SRC (x);
6664
6665 switch (GET_CODE (op0))
6666 {
6667 case MEM:
6668 if (speed)
6669 {
6670 rtx address = XEXP (op0, 0);
6671 if (VECTOR_MODE_P (mode))
6672 *cost += extra_cost->ldst.storev;
6673 else if (GET_MODE_CLASS (mode) == MODE_INT)
6674 *cost += extra_cost->ldst.store;
6675 else if (mode == SFmode)
6676 *cost += extra_cost->ldst.storef;
6677 else if (mode == DFmode)
6678 *cost += extra_cost->ldst.stored;
6679
6680 *cost +=
6681 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6682 0, speed));
6683 }
6684
6685 *cost += rtx_cost (op1, mode, SET, 1, speed);
6686 return true;
6687
6688 case SUBREG:
6689 if (! REG_P (SUBREG_REG (op0)))
6690 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6691
6692 /* Fall through. */
6693 case REG:
6694 /* The cost is one per vector-register copied. */
6695 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6696 {
6697 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6698 / GET_MODE_SIZE (V4SImode);
6699 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6700 }
6701 /* const0_rtx is in general free, but we will use an
6702 instruction to set a register to 0. */
6703 else if (REG_P (op1) || op1 == const0_rtx)
6704 {
6705 /* The cost is 1 per register copied. */
6706 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6707 / UNITS_PER_WORD;
6708 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6709 }
6710 else
6711 /* Cost is just the cost of the RHS of the set. */
6712 *cost += rtx_cost (op1, mode, SET, 1, speed);
6713 return true;
6714
6715 case ZERO_EXTRACT:
6716 case SIGN_EXTRACT:
6717 /* Bit-field insertion. Strip any redundant widening of
6718 the RHS to meet the width of the target. */
6719 if (GET_CODE (op1) == SUBREG)
6720 op1 = SUBREG_REG (op1);
6721 if ((GET_CODE (op1) == ZERO_EXTEND
6722 || GET_CODE (op1) == SIGN_EXTEND)
6723 && CONST_INT_P (XEXP (op0, 1))
6724 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6725 >= INTVAL (XEXP (op0, 1))))
6726 op1 = XEXP (op1, 0);
6727
6728 if (CONST_INT_P (op1))
6729 {
6730 /* MOV immediate is assumed to always be cheap. */
6731 *cost = COSTS_N_INSNS (1);
6732 }
6733 else
6734 {
6735 /* BFM. */
6736 if (speed)
6737 *cost += extra_cost->alu.bfi;
6738 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6739 }
6740
6741 return true;
6742
6743 default:
6744 /* We can't make sense of this, assume default cost. */
6745 *cost = COSTS_N_INSNS (1);
6746 return false;
6747 }
6748 return false;
6749
6750 case CONST_INT:
6751 /* If an instruction can incorporate a constant within the
6752 instruction, the instruction's expression avoids calling
6753 rtx_cost() on the constant. If rtx_cost() is called on a
6754 constant, then it is usually because the constant must be
6755 moved into a register by one or more instructions.
6756
6757 The exception is constant 0, which can be expressed
6758 as XZR/WZR and is therefore free. The exception to this is
6759 if we have (set (reg) (const0_rtx)) in which case we must cost
6760 the move. However, we can catch that when we cost the SET, so
6761 we don't need to consider that here. */
6762 if (x == const0_rtx)
6763 *cost = 0;
6764 else
6765 {
6766 /* To an approximation, the cost of building any other constant is
6767 proportional to the number of instructions required to build
6768 that constant. This is true whether we are compiling for SPEED
6769 or otherwise. */
6770 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6771 (NULL_RTX, x, false, mode));
6772 }
6773 return true;
6774
6775 case CONST_DOUBLE:
6776 if (speed)
6777 {
6778 /* mov[df,sf]_aarch64. */
6779 if (aarch64_float_const_representable_p (x))
6780 /* FMOV (scalar immediate). */
6781 *cost += extra_cost->fp[mode == DFmode].fpconst;
6782 else if (!aarch64_float_const_zero_rtx_p (x))
6783 {
6784 /* This will be a load from memory. */
6785 if (mode == DFmode)
6786 *cost += extra_cost->ldst.loadd;
6787 else
6788 *cost += extra_cost->ldst.loadf;
6789 }
6790 else
6791 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6792 or MOV v0.s[0], wzr - neither of which are modeled by the
6793 cost tables. Just use the default cost. */
6794 {
6795 }
6796 }
6797
6798 return true;
6799
6800 case MEM:
6801 if (speed)
6802 {
6803 /* For loads we want the base cost of a load, plus an
6804 approximation for the additional cost of the addressing
6805 mode. */
6806 rtx address = XEXP (x, 0);
6807 if (VECTOR_MODE_P (mode))
6808 *cost += extra_cost->ldst.loadv;
6809 else if (GET_MODE_CLASS (mode) == MODE_INT)
6810 *cost += extra_cost->ldst.load;
6811 else if (mode == SFmode)
6812 *cost += extra_cost->ldst.loadf;
6813 else if (mode == DFmode)
6814 *cost += extra_cost->ldst.loadd;
6815
6816 *cost +=
6817 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6818 0, speed));
6819 }
6820
6821 return true;
6822
6823 case NEG:
6824 op0 = XEXP (x, 0);
6825
6826 if (VECTOR_MODE_P (mode))
6827 {
6828 if (speed)
6829 {
6830 /* FNEG. */
6831 *cost += extra_cost->vect.alu;
6832 }
6833 return false;
6834 }
6835
6836 if (GET_MODE_CLASS (mode) == MODE_INT)
6837 {
6838 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6839 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6840 {
6841 /* CSETM. */
6842 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6843 return true;
6844 }
6845
6846 /* Cost this as SUB wzr, X. */
6847 op0 = CONST0_RTX (mode);
6848 op1 = XEXP (x, 0);
6849 goto cost_minus;
6850 }
6851
6852 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6853 {
6854 /* Support (neg(fma...)) as a single instruction only if
6855 sign of zeros is unimportant. This matches the decision
6856 making in aarch64.md. */
6857 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6858 {
6859 /* FNMADD. */
6860 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6861 return true;
6862 }
6863 if (GET_CODE (op0) == MULT)
6864 {
6865 /* FNMUL. */
6866 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6867 return true;
6868 }
6869 if (speed)
6870 /* FNEG. */
6871 *cost += extra_cost->fp[mode == DFmode].neg;
6872 return false;
6873 }
6874
6875 return false;
6876
6877 case CLRSB:
6878 case CLZ:
6879 if (speed)
6880 {
6881 if (VECTOR_MODE_P (mode))
6882 *cost += extra_cost->vect.alu;
6883 else
6884 *cost += extra_cost->alu.clz;
6885 }
6886
6887 return false;
6888
6889 case COMPARE:
6890 op0 = XEXP (x, 0);
6891 op1 = XEXP (x, 1);
6892
6893 if (op1 == const0_rtx
6894 && GET_CODE (op0) == AND)
6895 {
6896 x = op0;
6897 mode = GET_MODE (op0);
6898 goto cost_logic;
6899 }
6900
6901 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6902 {
6903 /* TODO: A write to the CC flags possibly costs extra; this
6904 needs encoding in the cost tables. */
6905
6906 mode = GET_MODE (op0);
6907 /* ANDS. */
6908 if (GET_CODE (op0) == AND)
6909 {
6910 x = op0;
6911 goto cost_logic;
6912 }
6913
6914 if (GET_CODE (op0) == PLUS)
6915 {
6916 /* ADDS (and CMN alias). */
6917 x = op0;
6918 goto cost_plus;
6919 }
6920
6921 if (GET_CODE (op0) == MINUS)
6922 {
6923 /* SUBS. */
6924 x = op0;
6925 goto cost_minus;
6926 }
6927
6928 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6929 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6930 && CONST_INT_P (XEXP (op0, 2)))
6931 {
6932 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6933 Handle it here directly rather than going to cost_logic
6934 since we know the immediate generated for the TST is valid
6935 so we can avoid creating an intermediate rtx for it only
6936 for costing purposes. */
6937 if (speed)
6938 *cost += extra_cost->alu.logical;
6939
6940 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6941 ZERO_EXTRACT, 0, speed);
6942 return true;
6943 }
6944
6945 if (GET_CODE (op1) == NEG)
6946 {
6947 /* CMN. */
6948 if (speed)
6949 *cost += extra_cost->alu.arith;
6950
6951 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6952 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6953 return true;
6954 }
6955
6956 /* CMP.
6957
6958 Compare can freely swap the order of operands, and
6959 canonicalization puts the more complex operation first.
6960 But the integer MINUS logic expects the shift/extend
6961 operation in op1. */
6962 if (! (REG_P (op0)
6963 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6964 {
6965 op0 = XEXP (x, 1);
6966 op1 = XEXP (x, 0);
6967 }
6968 goto cost_minus;
6969 }
6970
6971 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6972 {
6973 /* FCMP. */
6974 if (speed)
6975 *cost += extra_cost->fp[mode == DFmode].compare;
6976
6977 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6978 {
6979 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6980 /* FCMP supports constant 0.0 for no extra cost. */
6981 return true;
6982 }
6983 return false;
6984 }
6985
6986 if (VECTOR_MODE_P (mode))
6987 {
6988 /* Vector compare. */
6989 if (speed)
6990 *cost += extra_cost->vect.alu;
6991
6992 if (aarch64_float_const_zero_rtx_p (op1))
6993 {
6994 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6995 cost. */
6996 return true;
6997 }
6998 return false;
6999 }
7000 return false;
7001
7002 case MINUS:
7003 {
7004 op0 = XEXP (x, 0);
7005 op1 = XEXP (x, 1);
7006
7007 cost_minus:
7008 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7009
7010 /* Detect valid immediates. */
7011 if ((GET_MODE_CLASS (mode) == MODE_INT
7012 || (GET_MODE_CLASS (mode) == MODE_CC
7013 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7014 && CONST_INT_P (op1)
7015 && aarch64_uimm12_shift (INTVAL (op1)))
7016 {
7017 if (speed)
7018 /* SUB(S) (immediate). */
7019 *cost += extra_cost->alu.arith;
7020 return true;
7021 }
7022
7023 /* Look for SUB (extended register). */
7024 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7025 {
7026 if (speed)
7027 *cost += extra_cost->alu.extend_arith;
7028
7029 op1 = aarch64_strip_extend (op1);
7030 *cost += rtx_cost (op1, VOIDmode,
7031 (enum rtx_code) GET_CODE (op1), 0, speed);
7032 return true;
7033 }
7034
7035 rtx new_op1 = aarch64_strip_extend (op1);
7036
7037 /* Cost this as an FMA-alike operation. */
7038 if ((GET_CODE (new_op1) == MULT
7039 || aarch64_shift_p (GET_CODE (new_op1)))
7040 && code != COMPARE)
7041 {
7042 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7043 (enum rtx_code) code,
7044 speed);
7045 return true;
7046 }
7047
7048 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7049
7050 if (speed)
7051 {
7052 if (VECTOR_MODE_P (mode))
7053 {
7054 /* Vector SUB. */
7055 *cost += extra_cost->vect.alu;
7056 }
7057 else if (GET_MODE_CLASS (mode) == MODE_INT)
7058 {
7059 /* SUB(S). */
7060 *cost += extra_cost->alu.arith;
7061 }
7062 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7063 {
7064 /* FSUB. */
7065 *cost += extra_cost->fp[mode == DFmode].addsub;
7066 }
7067 }
7068 return true;
7069 }
7070
7071 case PLUS:
7072 {
7073 rtx new_op0;
7074
7075 op0 = XEXP (x, 0);
7076 op1 = XEXP (x, 1);
7077
7078 cost_plus:
7079 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7080 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7081 {
7082 /* CSINC. */
7083 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7084 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7085 return true;
7086 }
7087
7088 if (GET_MODE_CLASS (mode) == MODE_INT
7089 && CONST_INT_P (op1)
7090 && aarch64_uimm12_shift (INTVAL (op1)))
7091 {
7092 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7093
7094 if (speed)
7095 /* ADD (immediate). */
7096 *cost += extra_cost->alu.arith;
7097 return true;
7098 }
7099
7100 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7101
7102 /* Look for ADD (extended register). */
7103 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7104 {
7105 if (speed)
7106 *cost += extra_cost->alu.extend_arith;
7107
7108 op0 = aarch64_strip_extend (op0);
7109 *cost += rtx_cost (op0, VOIDmode,
7110 (enum rtx_code) GET_CODE (op0), 0, speed);
7111 return true;
7112 }
7113
7114 /* Strip any extend, leave shifts behind as we will
7115 cost them through mult_cost. */
7116 new_op0 = aarch64_strip_extend (op0);
7117
7118 if (GET_CODE (new_op0) == MULT
7119 || aarch64_shift_p (GET_CODE (new_op0)))
7120 {
7121 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7122 speed);
7123 return true;
7124 }
7125
7126 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7127
7128 if (speed)
7129 {
7130 if (VECTOR_MODE_P (mode))
7131 {
7132 /* Vector ADD. */
7133 *cost += extra_cost->vect.alu;
7134 }
7135 else if (GET_MODE_CLASS (mode) == MODE_INT)
7136 {
7137 /* ADD. */
7138 *cost += extra_cost->alu.arith;
7139 }
7140 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7141 {
7142 /* FADD. */
7143 *cost += extra_cost->fp[mode == DFmode].addsub;
7144 }
7145 }
7146 return true;
7147 }
7148
7149 case BSWAP:
7150 *cost = COSTS_N_INSNS (1);
7151
7152 if (speed)
7153 {
7154 if (VECTOR_MODE_P (mode))
7155 *cost += extra_cost->vect.alu;
7156 else
7157 *cost += extra_cost->alu.rev;
7158 }
7159 return false;
7160
7161 case IOR:
7162 if (aarch_rev16_p (x))
7163 {
7164 *cost = COSTS_N_INSNS (1);
7165
7166 if (speed)
7167 {
7168 if (VECTOR_MODE_P (mode))
7169 *cost += extra_cost->vect.alu;
7170 else
7171 *cost += extra_cost->alu.rev;
7172 }
7173 return true;
7174 }
7175
7176 if (aarch64_extr_rtx_p (x, &op0, &op1))
7177 {
7178 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7179 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7180 if (speed)
7181 *cost += extra_cost->alu.shift;
7182
7183 return true;
7184 }
7185 /* Fall through. */
7186 case XOR:
7187 case AND:
7188 cost_logic:
7189 op0 = XEXP (x, 0);
7190 op1 = XEXP (x, 1);
7191
7192 if (VECTOR_MODE_P (mode))
7193 {
7194 if (speed)
7195 *cost += extra_cost->vect.alu;
7196 return true;
7197 }
7198
7199 if (code == AND
7200 && GET_CODE (op0) == MULT
7201 && CONST_INT_P (XEXP (op0, 1))
7202 && CONST_INT_P (op1)
7203 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7204 INTVAL (op1)) != 0)
7205 {
7206 /* This is a UBFM/SBFM. */
7207 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7208 if (speed)
7209 *cost += extra_cost->alu.bfx;
7210 return true;
7211 }
7212
7213 if (GET_MODE_CLASS (mode) == MODE_INT)
7214 {
7215 if (CONST_INT_P (op1))
7216 {
7217 /* We have a mask + shift version of a UBFIZ
7218 i.e. the *andim_ashift<mode>_bfiz pattern. */
7219 if (GET_CODE (op0) == ASHIFT
7220 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7221 XEXP (op0, 1)))
7222 {
7223 *cost += rtx_cost (XEXP (op0, 0), mode,
7224 (enum rtx_code) code, 0, speed);
7225 if (speed)
7226 *cost += extra_cost->alu.bfx;
7227
7228 return true;
7229 }
7230 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7231 {
7232 /* We possibly get the immediate for free; this is not
7233 modelled. */
7234 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7235 if (speed)
7236 *cost += extra_cost->alu.logical;
7237
7238 return true;
7239 }
7240 }
7241 else
7242 {
7243 rtx new_op0 = op0;
7244
7245 /* Handle ORN, EON, or BIC. */
7246 if (GET_CODE (op0) == NOT)
7247 op0 = XEXP (op0, 0);
7248
7249 new_op0 = aarch64_strip_shift (op0);
7250
7251 /* If we had a shift on op0 then this is a logical-shift-
7252 by-register/immediate operation. Otherwise, this is just
7253 a logical operation. */
7254 if (speed)
7255 {
7256 if (new_op0 != op0)
7257 {
7258 /* Shift by immediate. */
7259 if (CONST_INT_P (XEXP (op0, 1)))
7260 *cost += extra_cost->alu.log_shift;
7261 else
7262 *cost += extra_cost->alu.log_shift_reg;
7263 }
7264 else
7265 *cost += extra_cost->alu.logical;
7266 }
7267
7268 /* In both cases we want to cost both operands. */
7269 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7270 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7271
7272 return true;
7273 }
7274 }
7275 return false;
7276
7277 case NOT:
7278 x = XEXP (x, 0);
7279 op0 = aarch64_strip_shift (x);
7280
7281 if (VECTOR_MODE_P (mode))
7282 {
7283 /* Vector NOT. */
7284 *cost += extra_cost->vect.alu;
7285 return false;
7286 }
7287
7288 /* MVN-shifted-reg. */
7289 if (op0 != x)
7290 {
7291 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7292
7293 if (speed)
7294 *cost += extra_cost->alu.log_shift;
7295
7296 return true;
7297 }
7298 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7299 Handle the second form here taking care that 'a' in the above can
7300 be a shift. */
7301 else if (GET_CODE (op0) == XOR)
7302 {
7303 rtx newop0 = XEXP (op0, 0);
7304 rtx newop1 = XEXP (op0, 1);
7305 rtx op0_stripped = aarch64_strip_shift (newop0);
7306
7307 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7308 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7309
7310 if (speed)
7311 {
7312 if (op0_stripped != newop0)
7313 *cost += extra_cost->alu.log_shift;
7314 else
7315 *cost += extra_cost->alu.logical;
7316 }
7317
7318 return true;
7319 }
7320 /* MVN. */
7321 if (speed)
7322 *cost += extra_cost->alu.logical;
7323
7324 return false;
7325
7326 case ZERO_EXTEND:
7327
7328 op0 = XEXP (x, 0);
7329 /* If a value is written in SI mode, then zero extended to DI
7330 mode, the operation will in general be free as a write to
7331 a 'w' register implicitly zeroes the upper bits of an 'x'
7332 register. However, if this is
7333
7334 (set (reg) (zero_extend (reg)))
7335
7336 we must cost the explicit register move. */
7337 if (mode == DImode
7338 && GET_MODE (op0) == SImode
7339 && outer == SET)
7340 {
7341 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7342
7343 /* If OP_COST is non-zero, then the cost of the zero extend
7344 is effectively the cost of the inner operation. Otherwise
7345 we have a MOV instruction and we take the cost from the MOV
7346 itself. This is true independently of whether we are
7347 optimizing for space or time. */
7348 if (op_cost)
7349 *cost = op_cost;
7350
7351 return true;
7352 }
7353 else if (MEM_P (op0))
7354 {
7355 /* All loads can zero extend to any size for free. */
7356 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7357 return true;
7358 }
7359
7360 op0 = aarch64_extend_bitfield_pattern_p (x);
7361 if (op0)
7362 {
7363 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7364 if (speed)
7365 *cost += extra_cost->alu.bfx;
7366 return true;
7367 }
7368
7369 if (speed)
7370 {
7371 if (VECTOR_MODE_P (mode))
7372 {
7373 /* UMOV. */
7374 *cost += extra_cost->vect.alu;
7375 }
7376 else
7377 {
7378 /* We generate an AND instead of UXTB/UXTH. */
7379 *cost += extra_cost->alu.logical;
7380 }
7381 }
7382 return false;
7383
7384 case SIGN_EXTEND:
7385 if (MEM_P (XEXP (x, 0)))
7386 {
7387 /* LDRSH. */
7388 if (speed)
7389 {
7390 rtx address = XEXP (XEXP (x, 0), 0);
7391 *cost += extra_cost->ldst.load_sign_extend;
7392
7393 *cost +=
7394 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7395 0, speed));
7396 }
7397 return true;
7398 }
7399
7400 op0 = aarch64_extend_bitfield_pattern_p (x);
7401 if (op0)
7402 {
7403 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7404 if (speed)
7405 *cost += extra_cost->alu.bfx;
7406 return true;
7407 }
7408
7409 if (speed)
7410 {
7411 if (VECTOR_MODE_P (mode))
7412 *cost += extra_cost->vect.alu;
7413 else
7414 *cost += extra_cost->alu.extend;
7415 }
7416 return false;
7417
7418 case ASHIFT:
7419 op0 = XEXP (x, 0);
7420 op1 = XEXP (x, 1);
7421
7422 if (CONST_INT_P (op1))
7423 {
7424 if (speed)
7425 {
7426 if (VECTOR_MODE_P (mode))
7427 {
7428 /* Vector shift (immediate). */
7429 *cost += extra_cost->vect.alu;
7430 }
7431 else
7432 {
7433 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7434 aliases. */
7435 *cost += extra_cost->alu.shift;
7436 }
7437 }
7438
7439 /* We can incorporate zero/sign extend for free. */
7440 if (GET_CODE (op0) == ZERO_EXTEND
7441 || GET_CODE (op0) == SIGN_EXTEND)
7442 op0 = XEXP (op0, 0);
7443
7444 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7445 return true;
7446 }
7447 else
7448 {
7449 if (speed)
7450 {
7451 if (VECTOR_MODE_P (mode))
7452 {
7453 /* Vector shift (register). */
7454 *cost += extra_cost->vect.alu;
7455 }
7456 else
7457 {
7458 /* LSLV. */
7459 *cost += extra_cost->alu.shift_reg;
7460 }
7461 }
7462 return false; /* All arguments need to be in registers. */
7463 }
7464
7465 case ROTATE:
7466 case ROTATERT:
7467 case LSHIFTRT:
7468 case ASHIFTRT:
7469 op0 = XEXP (x, 0);
7470 op1 = XEXP (x, 1);
7471
7472 if (CONST_INT_P (op1))
7473 {
7474 /* ASR (immediate) and friends. */
7475 if (speed)
7476 {
7477 if (VECTOR_MODE_P (mode))
7478 *cost += extra_cost->vect.alu;
7479 else
7480 *cost += extra_cost->alu.shift;
7481 }
7482
7483 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7484 return true;
7485 }
7486 else
7487 {
7488
7489 /* ASR (register) and friends. */
7490 if (speed)
7491 {
7492 if (VECTOR_MODE_P (mode))
7493 *cost += extra_cost->vect.alu;
7494 else
7495 *cost += extra_cost->alu.shift_reg;
7496 }
7497 return false; /* All arguments need to be in registers. */
7498 }
7499
7500 case SYMBOL_REF:
7501
7502 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7503 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7504 {
7505 /* LDR. */
7506 if (speed)
7507 *cost += extra_cost->ldst.load;
7508 }
7509 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7510 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7511 {
7512 /* ADRP, followed by ADD. */
7513 *cost += COSTS_N_INSNS (1);
7514 if (speed)
7515 *cost += 2 * extra_cost->alu.arith;
7516 }
7517 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7518 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7519 {
7520 /* ADR. */
7521 if (speed)
7522 *cost += extra_cost->alu.arith;
7523 }
7524
7525 if (flag_pic)
7526 {
7527 /* One extra load instruction, after accessing the GOT. */
7528 *cost += COSTS_N_INSNS (1);
7529 if (speed)
7530 *cost += extra_cost->ldst.load;
7531 }
7532 return true;
7533
7534 case HIGH:
7535 case LO_SUM:
7536 /* ADRP/ADD (immediate). */
7537 if (speed)
7538 *cost += extra_cost->alu.arith;
7539 return true;
7540
7541 case ZERO_EXTRACT:
7542 case SIGN_EXTRACT:
7543 /* UBFX/SBFX. */
7544 if (speed)
7545 {
7546 if (VECTOR_MODE_P (mode))
7547 *cost += extra_cost->vect.alu;
7548 else
7549 *cost += extra_cost->alu.bfx;
7550 }
7551
7552 /* We can trust that the immediates used will be correct (there
7553 are no by-register forms), so we need only cost op0. */
7554 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7555 return true;
7556
7557 case MULT:
7558 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7559 /* aarch64_rtx_mult_cost always handles recursion to its
7560 operands. */
7561 return true;
7562
7563 case MOD:
7564 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7565 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7566 an unconditional negate. This case should only ever be reached through
7567 the set_smod_pow2_cheap check in expmed.c. */
7568 if (CONST_INT_P (XEXP (x, 1))
7569 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7570 && (mode == SImode || mode == DImode))
7571 {
7572 /* We expand to 4 instructions. Reset the baseline. */
7573 *cost = COSTS_N_INSNS (4);
7574
7575 if (speed)
7576 *cost += 2 * extra_cost->alu.logical
7577 + 2 * extra_cost->alu.arith;
7578
7579 return true;
7580 }
7581
7582 /* Fall-through. */
7583 case UMOD:
7584 if (speed)
7585 {
7586 /* Slightly prefer UMOD over SMOD. */
7587 if (VECTOR_MODE_P (mode))
7588 *cost += extra_cost->vect.alu;
7589 else if (GET_MODE_CLASS (mode) == MODE_INT)
7590 *cost += (extra_cost->mult[mode == DImode].add
7591 + extra_cost->mult[mode == DImode].idiv
7592 + (code == MOD ? 1 : 0));
7593 }
7594 return false; /* All arguments need to be in registers. */
7595
7596 case DIV:
7597 case UDIV:
7598 case SQRT:
7599 if (speed)
7600 {
7601 if (VECTOR_MODE_P (mode))
7602 *cost += extra_cost->vect.alu;
7603 else if (GET_MODE_CLASS (mode) == MODE_INT)
7604 /* There is no integer SQRT, so only DIV and UDIV can get
7605 here. */
7606 *cost += (extra_cost->mult[mode == DImode].idiv
7607 /* Slightly prefer UDIV over SDIV. */
7608 + (code == DIV ? 1 : 0));
7609 else
7610 *cost += extra_cost->fp[mode == DFmode].div;
7611 }
7612 return false; /* All arguments need to be in registers. */
7613
7614 case IF_THEN_ELSE:
7615 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7616 XEXP (x, 2), cost, speed);
7617
7618 case EQ:
7619 case NE:
7620 case GT:
7621 case GTU:
7622 case LT:
7623 case LTU:
7624 case GE:
7625 case GEU:
7626 case LE:
7627 case LEU:
7628
7629 return false; /* All arguments must be in registers. */
7630
7631 case FMA:
7632 op0 = XEXP (x, 0);
7633 op1 = XEXP (x, 1);
7634 op2 = XEXP (x, 2);
7635
7636 if (speed)
7637 {
7638 if (VECTOR_MODE_P (mode))
7639 *cost += extra_cost->vect.alu;
7640 else
7641 *cost += extra_cost->fp[mode == DFmode].fma;
7642 }
7643
7644 /* FMSUB, FNMADD, and FNMSUB are free. */
7645 if (GET_CODE (op0) == NEG)
7646 op0 = XEXP (op0, 0);
7647
7648 if (GET_CODE (op2) == NEG)
7649 op2 = XEXP (op2, 0);
7650
7651 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7652 and the by-element operand as operand 0. */
7653 if (GET_CODE (op1) == NEG)
7654 op1 = XEXP (op1, 0);
7655
7656 /* Catch vector-by-element operations. The by-element operand can
7657 either be (vec_duplicate (vec_select (x))) or just
7658 (vec_select (x)), depending on whether we are multiplying by
7659 a vector or a scalar.
7660
7661 Canonicalization is not very good in these cases: FMA4 will put the
7662 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7663 if (GET_CODE (op0) == VEC_DUPLICATE)
7664 op0 = XEXP (op0, 0);
7665 else if (GET_CODE (op1) == VEC_DUPLICATE)
7666 op1 = XEXP (op1, 0);
7667
7668 if (GET_CODE (op0) == VEC_SELECT)
7669 op0 = XEXP (op0, 0);
7670 else if (GET_CODE (op1) == VEC_SELECT)
7671 op1 = XEXP (op1, 0);
7672
7673 /* If the remaining parameters are not registers,
7674 get the cost to put them into registers. */
7675 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7676 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7677 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7678 return true;
7679
7680 case FLOAT:
7681 case UNSIGNED_FLOAT:
7682 if (speed)
7683 *cost += extra_cost->fp[mode == DFmode].fromint;
7684 return false;
7685
7686 case FLOAT_EXTEND:
7687 if (speed)
7688 {
7689 if (VECTOR_MODE_P (mode))
7690 {
7691 /* Vector widening conversion. */
7692 *cost += extra_cost->vect.alu;
7693 }
7694 else
7695 *cost += extra_cost->fp[mode == DFmode].widen;
7696 }
7697 return false;
7698
7699 case FLOAT_TRUNCATE:
7700 if (speed)
7701 {
7702 if (VECTOR_MODE_P (mode))
7703 {
7704 /* Vector conversion. */
7705 *cost += extra_cost->vect.alu;
7706 }
7707 else
7708 *cost += extra_cost->fp[mode == DFmode].narrow;
7709 }
7710 return false;
7711
7712 case FIX:
7713 case UNSIGNED_FIX:
7714 x = XEXP (x, 0);
7715 /* Strip the rounding part. They will all be implemented
7716 by the fcvt* family of instructions anyway. */
7717 if (GET_CODE (x) == UNSPEC)
7718 {
7719 unsigned int uns_code = XINT (x, 1);
7720
7721 if (uns_code == UNSPEC_FRINTA
7722 || uns_code == UNSPEC_FRINTM
7723 || uns_code == UNSPEC_FRINTN
7724 || uns_code == UNSPEC_FRINTP
7725 || uns_code == UNSPEC_FRINTZ)
7726 x = XVECEXP (x, 0, 0);
7727 }
7728
7729 if (speed)
7730 {
7731 if (VECTOR_MODE_P (mode))
7732 *cost += extra_cost->vect.alu;
7733 else
7734 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7735 }
7736
7737 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7738 fixed-point fcvt. */
7739 if (GET_CODE (x) == MULT
7740 && ((VECTOR_MODE_P (mode)
7741 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7742 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7743 {
7744 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7745 0, speed);
7746 return true;
7747 }
7748
7749 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7750 return true;
7751
7752 case ABS:
7753 if (VECTOR_MODE_P (mode))
7754 {
7755 /* ABS (vector). */
7756 if (speed)
7757 *cost += extra_cost->vect.alu;
7758 }
7759 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7760 {
7761 op0 = XEXP (x, 0);
7762
7763 /* FABD, which is analogous to FADD. */
7764 if (GET_CODE (op0) == MINUS)
7765 {
7766 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7767 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7768 if (speed)
7769 *cost += extra_cost->fp[mode == DFmode].addsub;
7770
7771 return true;
7772 }
7773 /* Simple FABS is analogous to FNEG. */
7774 if (speed)
7775 *cost += extra_cost->fp[mode == DFmode].neg;
7776 }
7777 else
7778 {
7779 /* Integer ABS will either be split into
7780 two arithmetic instructions, or will be an ABS
7781 (scalar), which we don't model. */
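/* An illustrative (not authoritative) two-instruction form would be a
   compare against zero followed by a conditional negate, e.g.
       cmp     w0, 0
       csneg   w0, w0, w0, ge  */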
7782 *cost = COSTS_N_INSNS (2);
7783 if (speed)
7784 *cost += 2 * extra_cost->alu.arith;
7785 }
7786 return false;
7787
7788 case SMAX:
7789 case SMIN:
7790 if (speed)
7791 {
7792 if (VECTOR_MODE_P (mode))
7793 *cost += extra_cost->vect.alu;
7794 else
7795 {
7796 /* FMAXNM/FMINNM/FMAX/FMIN.
7797 TODO: This may not be accurate for all implementations, but
7798 we do not model this in the cost tables. */
7799 *cost += extra_cost->fp[mode == DFmode].addsub;
7800 }
7801 }
7802 return false;
7803
7804 case UNSPEC:
7805 /* The floating point round to integer frint* instructions. */
7806 if (aarch64_frint_unspec_p (XINT (x, 1)))
7807 {
7808 if (speed)
7809 *cost += extra_cost->fp[mode == DFmode].roundint;
7810
7811 return false;
7812 }
7813
7814 if (XINT (x, 1) == UNSPEC_RBIT)
7815 {
7816 if (speed)
7817 *cost += extra_cost->alu.rev;
7818
7819 return false;
7820 }
7821 break;
7822
7823 case TRUNCATE:
7824
7825 /* Decompose <su>muldi3_highpart. */
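/* The condition below matches, piece by piece, an RTL expression of the
   overall shape
     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))
   i.e. the high half of a widening 64x64->128-bit multiply.  */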
7826 if (/* (truncate:DI */
7827 mode == DImode
7828 /* (lshiftrt:TI */
7829 && GET_MODE (XEXP (x, 0)) == TImode
7830 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7831 /* (mult:TI */
7832 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7833 /* (ANY_EXTEND:TI (reg:DI))
7834 (ANY_EXTEND:TI (reg:DI))) */
7835 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7836 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7837 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7838 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7839 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7840 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7841 /* (const_int 64) */
7842 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7843 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7844 {
7845 /* UMULH/SMULH. */
7846 if (speed)
7847 *cost += extra_cost->mult[mode == DImode].extend;
7848 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7849 mode, MULT, 0, speed);
7850 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7851 mode, MULT, 1, speed);
7852 return true;
7853 }
7854
7855 /* Fall through. */
7856 default:
7857 break;
7858 }
7859
7860 if (dump_file
7861 && flag_aarch64_verbose_cost)
7862 fprintf (dump_file,
7863 "\nFailed to cost RTX. Assuming default cost.\n");
7864
7865 return true;
7866 }
7867
7868 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7869 calculated for X. This cost is stored in *COST. Returns true
7870 if the total cost of X was calculated. */
7871 static bool
7872 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7873 int param, int *cost, bool speed)
7874 {
7875 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7876
7877 if (dump_file
7878 && flag_aarch64_verbose_cost)
7879 {
7880 print_rtl_single (dump_file, x);
7881 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7882 speed ? "Hot" : "Cold",
7883 *cost, result ? "final" : "partial");
7884 }
7885
7886 return result;
7887 }
7888
7889 static int
7890 aarch64_register_move_cost (machine_mode mode,
7891 reg_class_t from_i, reg_class_t to_i)
7892 {
7893 enum reg_class from = (enum reg_class) from_i;
7894 enum reg_class to = (enum reg_class) to_i;
7895 const struct cpu_regmove_cost *regmove_cost
7896 = aarch64_tune_params.regmove_cost;
7897
7898 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7899 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7900 to = GENERAL_REGS;
7901
7902 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7903 from = GENERAL_REGS;
7904
7905 /* Moving between GPR and stack cost is the same as GP2GP. */
7906 if ((from == GENERAL_REGS && to == STACK_REG)
7907 || (to == GENERAL_REGS && from == STACK_REG))
7908 return regmove_cost->GP2GP;
7909
7910 /* To/From the stack register, we move via the gprs. */
7911 if (to == STACK_REG || from == STACK_REG)
7912 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7913 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7914
7915 if (GET_MODE_SIZE (mode) == 16)
7916 {
7917 /* 128-bit operations on general registers require 2 instructions. */
7918 if (from == GENERAL_REGS && to == GENERAL_REGS)
7919 return regmove_cost->GP2GP * 2;
7920 else if (from == GENERAL_REGS)
7921 return regmove_cost->GP2FP * 2;
7922 else if (to == GENERAL_REGS)
7923 return regmove_cost->FP2GP * 2;
7924
7925 /* When AdvSIMD instructions are disabled it is not possible to move
7926 a 128-bit value directly between Q registers. This is handled in
7927 secondary reload. A general register is used as a scratch to move
7928 the upper DI value and the lower DI value is moved directly,
7929 hence the cost is the sum of three moves. */
7930 if (! TARGET_SIMD)
7931 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7932
7933 return regmove_cost->FP2FP;
7934 }
7935
7936 if (from == GENERAL_REGS && to == GENERAL_REGS)
7937 return regmove_cost->GP2GP;
7938 else if (from == GENERAL_REGS)
7939 return regmove_cost->GP2FP;
7940 else if (to == GENERAL_REGS)
7941 return regmove_cost->FP2GP;
7942
7943 return regmove_cost->FP2FP;
7944 }
7945
7946 static int
7947 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7948 reg_class_t rclass ATTRIBUTE_UNUSED,
7949 bool in ATTRIBUTE_UNUSED)
7950 {
7951 return aarch64_tune_params.memmov_cost;
7952 }
7953
7954 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7955 to optimize 1.0/sqrt. */
7956
7957 static bool
7958 use_rsqrt_p (machine_mode mode)
7959 {
7960 return (!flag_trapping_math
7961 && flag_unsafe_math_optimizations
7962 && ((aarch64_tune_params.approx_modes->recip_sqrt
7963 & AARCH64_APPROX_MODE (mode))
7964 || flag_mrecip_low_precision_sqrt));
7965 }
7966
7967 /* Function to decide when to use the approximate reciprocal square root
7968 builtin. */
7969
7970 static tree
7971 aarch64_builtin_reciprocal (tree fndecl)
7972 {
7973 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7974
7975 if (!use_rsqrt_p (mode))
7976 return NULL_TREE;
7977 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7978 }
7979
7980 typedef rtx (*rsqrte_type) (rtx, rtx);
7981
7982 /* Select reciprocal square root initial estimate insn depending on machine
7983 mode. */
7984
7985 static rsqrte_type
7986 get_rsqrte_type (machine_mode mode)
7987 {
7988 switch (mode)
7989 {
7990 case DFmode: return gen_aarch64_rsqrtedf;
7991 case SFmode: return gen_aarch64_rsqrtesf;
7992 case V2DFmode: return gen_aarch64_rsqrtev2df;
7993 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7994 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7995 default: gcc_unreachable ();
7996 }
7997 }
7998
7999 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8000
8001 /* Select reciprocal square root series step insn depending on machine mode. */
8002
8003 static rsqrts_type
8004 get_rsqrts_type (machine_mode mode)
8005 {
8006 switch (mode)
8007 {
8008 case DFmode: return gen_aarch64_rsqrtsdf;
8009 case SFmode: return gen_aarch64_rsqrtssf;
8010 case V2DFmode: return gen_aarch64_rsqrtsv2df;
8011 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
8012 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
8013 default: gcc_unreachable ();
8014 }
8015 }
8016
8017 /* Emit instruction sequence to compute either the approximate square root
8018 or its approximate reciprocal, depending on the flag RECP, and return
8019 whether the sequence was emitted or not. */
8020
8021 bool
8022 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8023 {
8024 machine_mode mode = GET_MODE (dst);
8025
8026 if (GET_MODE_INNER (mode) == HFmode)
8027 {
8028 gcc_assert (!recp);
8029 return false;
8030 }
8031
8032 machine_mode mmsk
8033 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8034 GET_MODE_NUNITS (mode));
8035 if (!recp)
8036 {
8037 if (!(flag_mlow_precision_sqrt
8038 || (aarch64_tune_params.approx_modes->sqrt
8039 & AARCH64_APPROX_MODE (mode))))
8040 return false;
8041
8042 if (flag_finite_math_only
8043 || flag_trapping_math
8044 || !flag_unsafe_math_optimizations
8045 || optimize_function_for_size_p (cfun))
8046 return false;
8047 }
8048 else
8049 /* Caller assumes we cannot fail. */
8050 gcc_assert (use_rsqrt_p (mode));
8051
8052
8053 rtx xmsk = gen_reg_rtx (mmsk);
8054 if (!recp)
8055 /* When calculating the approximate square root, compare the
8056 argument with 0.0 and create a mask. */
8057 emit_insn (gen_rtx_SET (xmsk,
8058 gen_rtx_NEG (mmsk,
8059 gen_rtx_EQ (mmsk, src,
8060 CONST0_RTX (mode)))));
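/* The resulting mask has all bits set in lanes where SRC is 0.0 and is zero
   elsewhere; it is used below to force the result to 0.0 for a 0.0 input,
   since the reciprocal square root estimate of 0.0 is infinite.  */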
8061
8062 /* Estimate the approximate reciprocal square root. */
8063 rtx xdst = gen_reg_rtx (mode);
8064 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8065
8066 /* Iterate over the series twice for SF and thrice for DF. */
8067 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8068
8069 /* Optionally run one fewer iteration of the series for faster performance,
8070 at the cost of some accuracy. */
8071 if ((recp && flag_mrecip_low_precision_sqrt)
8072 || (!recp && flag_mlow_precision_sqrt))
8073 iterations--;
8074
8075 /* Iterate over the series to calculate the approximate reciprocal square
8076 root. */
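/* This loop is Newton's method for 1/sqrt(src): each FRSQRTS step computes
   (3 - a*b) / 2, giving the update
       x1   = (3 - src * xdst*xdst) / 2
       xdst = xdst * x1
   where the final multiply by x1 is deferred to the finalization below
   (after multiplying by src when computing sqrt rather than rsqrt).  */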
8077 rtx x1 = gen_reg_rtx (mode);
8078 while (iterations--)
8079 {
8080 rtx x2 = gen_reg_rtx (mode);
8081 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8082
8083 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8084
8085 if (iterations > 0)
8086 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8087 }
8088
8089 if (!recp)
8090 {
8091 /* Qualify the approximate reciprocal square root when the argument is
8092 0.0 by squashing the intermediate result to 0.0. */
8093 rtx xtmp = gen_reg_rtx (mmsk);
8094 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8095 gen_rtx_SUBREG (mmsk, xdst, 0)));
8096 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8097
8098 /* Calculate the approximate square root. */
8099 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8100 }
8101
8102 /* Finalize the approximation. */
8103 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8104
8105 return true;
8106 }
8107
8108 typedef rtx (*recpe_type) (rtx, rtx);
8109
8110 /* Select reciprocal initial estimate insn depending on machine mode. */
8111
8112 static recpe_type
8113 get_recpe_type (machine_mode mode)
8114 {
8115 switch (mode)
8116 {
8117 case SFmode: return (gen_aarch64_frecpesf);
8118 case V2SFmode: return (gen_aarch64_frecpev2sf);
8119 case V4SFmode: return (gen_aarch64_frecpev4sf);
8120 case DFmode: return (gen_aarch64_frecpedf);
8121 case V2DFmode: return (gen_aarch64_frecpev2df);
8122 default: gcc_unreachable ();
8123 }
8124 }
8125
8126 typedef rtx (*recps_type) (rtx, rtx, rtx);
8127
8128 /* Select reciprocal series step insn depending on machine mode. */
8129
8130 static recps_type
8131 get_recps_type (machine_mode mode)
8132 {
8133 switch (mode)
8134 {
8135 case SFmode: return (gen_aarch64_frecpssf);
8136 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8137 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8138 case DFmode: return (gen_aarch64_frecpsdf);
8139 case V2DFmode: return (gen_aarch64_frecpsv2df);
8140 default: gcc_unreachable ();
8141 }
8142 }
8143
8144 /* Emit the instruction sequence to compute the approximation for the division
8145 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8146
8147 bool
8148 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8149 {
8150 machine_mode mode = GET_MODE (quo);
8151
8152 if (GET_MODE_INNER (mode) == HFmode)
8153 return false;
8154
8155 bool use_approx_division_p = (flag_mlow_precision_div
8156 || (aarch64_tune_params.approx_modes->division
8157 & AARCH64_APPROX_MODE (mode)));
8158
8159 if (!flag_finite_math_only
8160 || flag_trapping_math
8161 || !flag_unsafe_math_optimizations
8162 || optimize_function_for_size_p (cfun)
8163 || !use_approx_division_p)
8164 return false;
8165
8166 /* Estimate the approximate reciprocal. */
8167 rtx xrcp = gen_reg_rtx (mode);
8168 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8169
8170 /* Iterate over the series twice for SF and thrice for DF. */
8171 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8172
8173 /* Optionally run one fewer iteration of the series for faster performance,
8174 at the cost of some accuracy. */
8175 if (flag_mlow_precision_div)
8176 iterations--;
8177
8178 /* Iterate over the series to calculate the approximate reciprocal. */
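/* This loop is Newton's method for 1/den: each FRECPS step computes
   (2 - a*b), giving the update
       xtmp = 2 - den * xrcp
       xrcp = xrcp * xtmp
   where the final multiply by xtmp is deferred to the finalization below
   (after optionally scaling by NUM).  */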
8179 rtx xtmp = gen_reg_rtx (mode);
8180 while (iterations--)
8181 {
8182 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8183
8184 if (iterations > 0)
8185 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8186 }
8187
8188 if (num != CONST1_RTX (mode))
8189 {
8190 /* As the approximate reciprocal of DEN is already calculated, only
8191 calculate the approximate division when NUM is not 1.0. */
8192 rtx xnum = force_reg (mode, num);
8193 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8194 }
8195
8196 /* Finalize the approximation. */
8197 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8198 return true;
8199 }
8200
8201 /* Return the number of instructions that can be issued per cycle. */
8202 static int
8203 aarch64_sched_issue_rate (void)
8204 {
8205 return aarch64_tune_params.issue_rate;
8206 }
8207
8208 static int
8209 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8210 {
8211 int issue_rate = aarch64_sched_issue_rate ();
8212
8213 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8214 }
8215
8216
8217 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8218 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8219 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8220
8221 static int
8222 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8223 int ready_index)
8224 {
8225 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8226 }
8227
8228
8229 /* Vectorizer cost model target hooks. */
8230
8231 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8232 static int
8233 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8234 tree vectype,
8235 int misalign ATTRIBUTE_UNUSED)
8236 {
8237 unsigned elements;
8238 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8239 bool fp = false;
8240
8241 if (vectype != NULL)
8242 fp = FLOAT_TYPE_P (vectype);
8243
8244 switch (type_of_cost)
8245 {
8246 case scalar_stmt:
8247 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8248
8249 case scalar_load:
8250 return costs->scalar_load_cost;
8251
8252 case scalar_store:
8253 return costs->scalar_store_cost;
8254
8255 case vector_stmt:
8256 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8257
8258 case vector_load:
8259 return costs->vec_align_load_cost;
8260
8261 case vector_store:
8262 return costs->vec_store_cost;
8263
8264 case vec_to_scalar:
8265 return costs->vec_to_scalar_cost;
8266
8267 case scalar_to_vec:
8268 return costs->scalar_to_vec_cost;
8269
8270 case unaligned_load:
8271 return costs->vec_unalign_load_cost;
8272
8273 case unaligned_store:
8274 return costs->vec_unalign_store_cost;
8275
8276 case cond_branch_taken:
8277 return costs->cond_taken_branch_cost;
8278
8279 case cond_branch_not_taken:
8280 return costs->cond_not_taken_branch_cost;
8281
8282 case vec_perm:
8283 return costs->vec_permute_cost;
8284
8285 case vec_promote_demote:
8286 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8287
8288 case vec_construct:
8289 elements = TYPE_VECTOR_SUBPARTS (vectype);
8290 return elements / 2 + 1;
8291
8292 default:
8293 gcc_unreachable ();
8294 }
8295 }
8296
8297 /* Implement targetm.vectorize.add_stmt_cost. */
8298 static unsigned
8299 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8300 struct _stmt_vec_info *stmt_info, int misalign,
8301 enum vect_cost_model_location where)
8302 {
8303 unsigned *cost = (unsigned *) data;
8304 unsigned retval = 0;
8305
8306 if (flag_vect_cost_model)
8307 {
8308 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8309 int stmt_cost =
8310 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8311
8312 /* Statements in an inner loop relative to the loop being
8313 vectorized are weighted more heavily. The value here is
8314 arbitrary and could potentially be improved with analysis. */
8315 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8316 count *= 50; /* FIXME */
8317
8318 retval = (unsigned) (count * stmt_cost);
8319 cost[where] += retval;
8320 }
8321
8322 return retval;
8323 }
8324
8325 static void initialize_aarch64_code_model (struct gcc_options *);
8326
8327 /* Parse the TO_PARSE string and put the architecture struct that it
8328 selects into RES and the architectural features into ISA_FLAGS.
8329 Return an aarch64_parse_opt_result describing the parse result.
8330 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
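/* For example (purely illustrative), "armv8-a+crc" matches the "armv8-a"
   entry in all_architectures and then hands the "+crc" suffix to
   aarch64_parse_extension to adjust the ISA flags.  */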
8331
8332 static enum aarch64_parse_opt_result
8333 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8334 unsigned long *isa_flags)
8335 {
8336 char *ext;
8337 const struct processor *arch;
8338 char *str = (char *) alloca (strlen (to_parse) + 1);
8339 size_t len;
8340
8341 strcpy (str, to_parse);
8342
8343 ext = strchr (str, '+');
8344
8345 if (ext != NULL)
8346 len = ext - str;
8347 else
8348 len = strlen (str);
8349
8350 if (len == 0)
8351 return AARCH64_PARSE_MISSING_ARG;
8352
8353
8354 /* Loop through the list of supported ARCHes to find a match. */
8355 for (arch = all_architectures; arch->name != NULL; arch++)
8356 {
8357 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8358 {
8359 unsigned long isa_temp = arch->flags;
8360
8361 if (ext != NULL)
8362 {
8363 /* TO_PARSE string contains at least one extension. */
8364 enum aarch64_parse_opt_result ext_res
8365 = aarch64_parse_extension (ext, &isa_temp);
8366
8367 if (ext_res != AARCH64_PARSE_OK)
8368 return ext_res;
8369 }
8370 /* Extension parsing was successful. Confirm the result
8371 arch and ISA flags. */
8372 *res = arch;
8373 *isa_flags = isa_temp;
8374 return AARCH64_PARSE_OK;
8375 }
8376 }
8377
8378 /* ARCH name not found in list. */
8379 return AARCH64_PARSE_INVALID_ARG;
8380 }
8381
8382 /* Parse the TO_PARSE string and put the CPU that it selects into RES and the
8383 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8384 describing the parse result. If there is an error parsing, RES and
8385 ISA_FLAGS are left unchanged. */
8386
8387 static enum aarch64_parse_opt_result
8388 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8389 unsigned long *isa_flags)
8390 {
8391 char *ext;
8392 const struct processor *cpu;
8393 char *str = (char *) alloca (strlen (to_parse) + 1);
8394 size_t len;
8395
8396 strcpy (str, to_parse);
8397
8398 ext = strchr (str, '+');
8399
8400 if (ext != NULL)
8401 len = ext - str;
8402 else
8403 len = strlen (str);
8404
8405 if (len == 0)
8406 return AARCH64_PARSE_MISSING_ARG;
8407
8408
8409 /* Loop through the list of supported CPUs to find a match. */
8410 for (cpu = all_cores; cpu->name != NULL; cpu++)
8411 {
8412 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8413 {
8414 unsigned long isa_temp = cpu->flags;
8415
8416
8417 if (ext != NULL)
8418 {
8419 /* TO_PARSE string contains at least one extension. */
8420 enum aarch64_parse_opt_result ext_res
8421 = aarch64_parse_extension (ext, &isa_temp);
8422
8423 if (ext_res != AARCH64_PARSE_OK)
8424 return ext_res;
8425 }
8426 /* Extension parsing was successful. Confirm the result
8427 cpu and ISA flags. */
8428 *res = cpu;
8429 *isa_flags = isa_temp;
8430 return AARCH64_PARSE_OK;
8431 }
8432 }
8433
8434 /* CPU name not found in list. */
8435 return AARCH64_PARSE_INVALID_ARG;
8436 }
8437
8438 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8439 Return an aarch64_parse_opt_result describing the parse result.
8440 If the parsing fails, RES does not change. */
8441
8442 static enum aarch64_parse_opt_result
8443 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8444 {
8445 const struct processor *cpu;
8446 char *str = (char *) alloca (strlen (to_parse) + 1);
8447
8448 strcpy (str, to_parse);
8449
8450 /* Loop through the list of supported CPUs to find a match. */
8451 for (cpu = all_cores; cpu->name != NULL; cpu++)
8452 {
8453 if (strcmp (cpu->name, str) == 0)
8454 {
8455 *res = cpu;
8456 return AARCH64_PARSE_OK;
8457 }
8458 }
8459
8460 /* CPU name not found in list. */
8461 return AARCH64_PARSE_INVALID_ARG;
8462 }
8463
8464 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8465 described in FLAG. If it is, return the index bit for that fusion type.
8466 If not, error (printing OPTION_NAME) and return zero. */
8467
8468 static unsigned int
8469 aarch64_parse_one_option_token (const char *token,
8470 size_t length,
8471 const struct aarch64_flag_desc *flag,
8472 const char *option_name)
8473 {
8474 for (; flag->name != NULL; flag++)
8475 {
8476 if (length == strlen (flag->name)
8477 && !strncmp (flag->name, token, length))
8478 return flag->flag;
8479 }
8480
8481 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8482 return 0;
8483 }
8484
8485 /* Parse OPTION which is a '.'-separated list of flags to enable.
8486 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8487 default state we inherit from the CPU tuning structures. OPTION_NAME
8488 gives the top-level option we are parsing in the -moverride string,
8489 for use in error messages. */
8490
8491 static unsigned int
8492 aarch64_parse_boolean_options (const char *option,
8493 const struct aarch64_flag_desc *flags,
8494 unsigned int initial_state,
8495 const char *option_name)
8496 {
8497 const char separator = '.';
8498 const char* specs = option;
8499 const char* ntoken = option;
8500 unsigned int found_flags = initial_state;
8501
8502 while ((ntoken = strchr (specs, separator)))
8503 {
8504 size_t token_length = ntoken - specs;
8505 unsigned token_ops = aarch64_parse_one_option_token (specs,
8506 token_length,
8507 flags,
8508 option_name);
8509 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8510 in the token stream, reset the supported operations. So:
8511
8512 adrp+add.cmp+branch.none.adrp+add
8513
8514 would have the result of turning on only adrp+add fusion. */
8515 if (!token_ops)
8516 found_flags = 0;
8517
8518 found_flags |= token_ops;
8519 specs = ++ntoken;
8520 }
8521
8522 /* The string ended with a trailing separator; report it as ill-formed. */
8523 if (!(*specs))
8524 {
8525 error ("%s string ill-formed\n", option_name);
8526 return 0;
8527 }
8528
8529 /* We still have one more token to parse. */
8530 size_t token_length = strlen (specs);
8531 unsigned token_ops = aarch64_parse_one_option_token (specs,
8532 token_length,
8533 flags,
8534 option_name);
8535 if (!token_ops)
8536 found_flags = 0;
8537
8538 found_flags |= token_ops;
8539 return found_flags;
8540 }
8541
8542 /* Support for overriding instruction fusion. */
8543
8544 static void
8545 aarch64_parse_fuse_string (const char *fuse_string,
8546 struct tune_params *tune)
8547 {
8548 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8549 aarch64_fusible_pairs,
8550 tune->fusible_ops,
8551 "fuse=");
8552 }
8553
8554 /* Support for overriding other tuning flags. */
8555
8556 static void
8557 aarch64_parse_tune_string (const char *tune_string,
8558 struct tune_params *tune)
8559 {
8560 tune->extra_tuning_flags
8561 = aarch64_parse_boolean_options (tune_string,
8562 aarch64_tuning_flags,
8563 tune->extra_tuning_flags,
8564 "tune=");
8565 }
8566
8567 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8568 we understand. If it is, extract the option string and hand it off to
8569 the appropriate function. */
8570
8571 void
8572 aarch64_parse_one_override_token (const char* token,
8573 size_t length,
8574 struct tune_params *tune)
8575 {
8576 const struct aarch64_tuning_override_function *fn
8577 = aarch64_tuning_override_functions;
8578
8579 const char *option_part = strchr (token, '=');
8580 if (!option_part)
8581 {
8582 error ("tuning string missing in option (%s)", token);
8583 return;
8584 }
8585
8586 /* Get the length of the option name. */
8587 length = option_part - token;
8588 /* Skip the '=' to get to the option string. */
8589 option_part++;
8590
8591 for (; fn->name != NULL; fn++)
8592 {
8593 if (!strncmp (fn->name, token, length))
8594 {
8595 fn->parse_override (option_part, tune);
8596 return;
8597 }
8598 }
8599
8600 error ("unknown tuning option (%s)",token);
8601 return;
8602 }
8603
8604 /* Validate and clamp the TLS size for the code model selected in OPTS. */
8605
8606 static void
8607 initialize_aarch64_tls_size (struct gcc_options *opts)
8608 {
8609 if (aarch64_tls_size == 0)
8610 aarch64_tls_size = 24;
8611
8612 switch (opts->x_aarch64_cmodel_var)
8613 {
8614 case AARCH64_CMODEL_TINY:
8615 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8616 needs two instructions to address, so we clamp the size to 24 bits. */
8617 if (aarch64_tls_size > 24)
8618 aarch64_tls_size = 24;
8619 break;
8620 case AARCH64_CMODEL_SMALL:
8621 /* The maximum TLS size allowed under small is 4G. */
8622 if (aarch64_tls_size > 32)
8623 aarch64_tls_size = 32;
8624 break;
8625 case AARCH64_CMODEL_LARGE:
8626 /* The maximum TLS size allowed under large is 16E.
8627 FIXME: 16E would need a 64-bit offset, but we only support 48-bit offsets now. */
8628 if (aarch64_tls_size > 48)
8629 aarch64_tls_size = 48;
8630 break;
8631 default:
8632 gcc_unreachable ();
8633 }
8634
8635 return;
8636 }
8637
8638 /* Parse STRING looking for options in the format:
8639 string :: option:string
8640 option :: name=substring
8641 name :: {a-z}
8642 substring :: defined by option. */
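/* For example (purely illustrative), an override string such as
     fuse=adrp+add.cmp+branch:tune=rename_fma_regs
   contains two options separated by ':', each dispatched to the matching
   entry in aarch64_tuning_override_functions.  */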
8643
8644 static void
8645 aarch64_parse_override_string (const char* input_string,
8646 struct tune_params* tune)
8647 {
8648 const char separator = ':';
8649 size_t string_length = strlen (input_string) + 1;
8650 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8651 char *string = string_root;
8652 strncpy (string, input_string, string_length);
8653 string[string_length - 1] = '\0';
8654
8655 char* ntoken = string;
8656
8657 while ((ntoken = strchr (string, separator)))
8658 {
8659 size_t token_length = ntoken - string;
8660 /* Make this substring look like a string. */
8661 *ntoken = '\0';
8662 aarch64_parse_one_override_token (string, token_length, tune);
8663 string = ++ntoken;
8664 }
8665
8666 /* One last option to parse. */
8667 aarch64_parse_one_override_token (string, strlen (string), tune);
8668 free (string_root);
8669 }
8670
8671
8672 static void
8673 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8674 {
8675 /* The logic here is that if we are disabling all frame pointer generation
8676 then we do not need to disable leaf frame pointer generation as a
8677 separate operation. But if we are *only* disabling leaf frame pointer
8678 generation then we set flag_omit_frame_pointer to true, but in
8679 aarch64_frame_pointer_required we return false only for leaf functions.
8680
8681 PR 70044: We have to be careful about being called multiple times for the
8682 same function. Once we have decided to set flag_omit_frame_pointer just
8683 so that we can omit leaf frame pointers, we must then not interpret a
8684 second call as meaning that all frame pointer generation should be
8685 omitted. We do this by setting flag_omit_frame_pointer to a special,
8686 non-zero value. */
8687 if (opts->x_flag_omit_frame_pointer == 2)
8688 opts->x_flag_omit_frame_pointer = 0;
8689
8690 if (opts->x_flag_omit_frame_pointer)
8691 opts->x_flag_omit_leaf_frame_pointer = false;
8692 else if (opts->x_flag_omit_leaf_frame_pointer)
8693 opts->x_flag_omit_frame_pointer = 2;
8694
8695 /* If not optimizing for size, set the default
8696 alignment to what the target wants. */
8697 if (!opts->x_optimize_size)
8698 {
8699 if (opts->x_align_loops <= 0)
8700 opts->x_align_loops = aarch64_tune_params.loop_align;
8701 if (opts->x_align_jumps <= 0)
8702 opts->x_align_jumps = aarch64_tune_params.jump_align;
8703 if (opts->x_align_functions <= 0)
8704 opts->x_align_functions = aarch64_tune_params.function_align;
8705 }
8706
8707 /* We default to no pc-relative literal loads. */
8708
8709 aarch64_pcrelative_literal_loads = false;
8710
8711 /* If -mpc-relative-literal-loads is set on the command line, this
8712 implies that the user asked for PC relative literal loads. */
8713 if (opts->x_pcrelative_literal_loads == 1)
8714 aarch64_pcrelative_literal_loads = true;
8715
8716 /* This is PR70113. When building the Linux kernel with
8717 CONFIG_ARM64_ERRATUM_843419, support for relocations
8718 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8719 removed from the kernel to avoid loading objects with possibly
8720 offending sequences. Without -mpc-relative-literal-loads we would
8721 generate such relocations, preventing the kernel build from
8722 succeeding. */
8723 if (opts->x_pcrelative_literal_loads == 2
8724 && TARGET_FIX_ERR_A53_843419)
8725 aarch64_pcrelative_literal_loads = true;
8726
8727 /* In the tiny memory model it makes no sense to disallow PC relative
8728 literal pool loads. */
8729 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8730 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8731 aarch64_pcrelative_literal_loads = true;
8732
8733 /* When enabling the lower precision Newton series for the square root, also
8734 enable it for the reciprocal square root, since the latter is an
8735 intermediary step for the former. */
8736 if (flag_mlow_precision_sqrt)
8737 flag_mrecip_low_precision_sqrt = true;
8738 }
8739
8740 /* 'Unpack' the internal tuning structs and update the options
8741 in OPTS. The caller must have set up selected_tune and selected_arch
8742 as all the other target-specific codegen decisions are
8743 derived from them. */
8744
8745 void
8746 aarch64_override_options_internal (struct gcc_options *opts)
8747 {
8748 aarch64_tune_flags = selected_tune->flags;
8749 aarch64_tune = selected_tune->sched_core;
8750 /* Make a copy of the tuning parameters attached to the core, which
8751 we may later overwrite. */
8752 aarch64_tune_params = *(selected_tune->tune);
8753 aarch64_architecture_version = selected_arch->architecture_version;
8754
8755 if (opts->x_aarch64_override_tune_string)
8756 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8757 &aarch64_tune_params);
8758
8759 /* This target defaults to strict volatile bitfields. */
8760 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8761 opts->x_flag_strict_volatile_bitfields = 1;
8762
8763 initialize_aarch64_code_model (opts);
8764 initialize_aarch64_tls_size (opts);
8765
8766 int queue_depth = 0;
8767 switch (aarch64_tune_params.autoprefetcher_model)
8768 {
8769 case tune_params::AUTOPREFETCHER_OFF:
8770 queue_depth = -1;
8771 break;
8772 case tune_params::AUTOPREFETCHER_WEAK:
8773 queue_depth = 0;
8774 break;
8775 case tune_params::AUTOPREFETCHER_STRONG:
8776 queue_depth = max_insn_queue_index + 1;
8777 break;
8778 default:
8779 gcc_unreachable ();
8780 }
8781
8782 /* We don't mind passing in global_options_set here as we don't use
8783 the *options_set structs anyway. */
8784 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8785 queue_depth,
8786 opts->x_param_values,
8787 global_options_set.x_param_values);
8788
8789 /* Set up parameters to be used in prefetching algorithm. Do not
8790 override the defaults unless we are tuning for a core we have
8791 researched values for. */
8792 if (aarch64_tune_params.prefetch->num_slots > 0)
8793 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
8794 aarch64_tune_params.prefetch->num_slots,
8795 opts->x_param_values,
8796 global_options_set.x_param_values);
8797 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
8798 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
8799 aarch64_tune_params.prefetch->l1_cache_size,
8800 opts->x_param_values,
8801 global_options_set.x_param_values);
8802 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
8803 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8804 aarch64_tune_params.prefetch->l1_cache_line_size,
8805 opts->x_param_values,
8806 global_options_set.x_param_values);
8807 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
8808 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
8809 aarch64_tune_params.prefetch->l2_cache_size,
8810 opts->x_param_values,
8811 global_options_set.x_param_values);
8812
8813 /* Enable software prefetching at the specified optimization level for
8814 CPUs that have prefetch tuning data. Lower the optimization level
8815 threshold by 1 when profiling is enabled. */
8816 if (opts->x_flag_prefetch_loop_arrays < 0
8817 && !opts->x_optimize_size
8818 && aarch64_tune_params.prefetch->default_opt_level >= 0
8819 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
8820 opts->x_flag_prefetch_loop_arrays = 1;
8821
8822 aarch64_override_options_after_change_1 (opts);
8823 }
8824
8825 /* Print a hint with a suggestion for a core or architecture name that
8826 most closely resembles what the user passed in STR. ARCH is true if
8827 the user is asking for an architecture name. ARCH is false if the user
8828 is asking for a core name. */
8829
8830 static void
8831 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8832 {
8833 auto_vec<const char *> candidates;
8834 const struct processor *entry = arch ? all_architectures : all_cores;
8835 for (; entry->name != NULL; entry++)
8836 candidates.safe_push (entry->name);
8837 char *s;
8838 const char *hint = candidates_list_and_hint (str, s, candidates);
8839 if (hint)
8840 inform (input_location, "valid arguments are: %s;"
8841 " did you mean %qs?", s, hint);
8842 XDELETEVEC (s);
8843 }
8844
8845 /* Print a hint with a suggestion for a core name that most closely resembles
8846 what the user passed in STR. */
8847
8848 inline static void
8849 aarch64_print_hint_for_core (const char *str)
8850 {
8851 aarch64_print_hint_for_core_or_arch (str, false);
8852 }
8853
8854 /* Print a hint with a suggestion for an architecture name that most closely
8855 resembles what the user passed in STR. */
8856
8857 inline static void
8858 aarch64_print_hint_for_arch (const char *str)
8859 {
8860 aarch64_print_hint_for_core_or_arch (str, true);
8861 }
8862
8863 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8864 specified in STR and throw errors if appropriate. Put the results, if
8865 they are valid, in RES and ISA_FLAGS. Return whether the option is
8866 valid. */
8867
8868 static bool
8869 aarch64_validate_mcpu (const char *str, const struct processor **res,
8870 unsigned long *isa_flags)
8871 {
8872 enum aarch64_parse_opt_result parse_res
8873 = aarch64_parse_cpu (str, res, isa_flags);
8874
8875 if (parse_res == AARCH64_PARSE_OK)
8876 return true;
8877
8878 switch (parse_res)
8879 {
8880 case AARCH64_PARSE_MISSING_ARG:
8881 error ("missing cpu name in %<-mcpu=%s%>", str);
8882 break;
8883 case AARCH64_PARSE_INVALID_ARG:
8884 error ("unknown value %qs for -mcpu", str);
8885 aarch64_print_hint_for_core (str);
8886 break;
8887 case AARCH64_PARSE_INVALID_FEATURE:
8888 error ("invalid feature modifier in %<-mcpu=%s%>", str);
8889 break;
8890 default:
8891 gcc_unreachable ();
8892 }
8893
8894 return false;
8895 }
8896
8897 /* Validate a command-line -march option. Parse the arch and extensions
8898 (if any) specified in STR and throw errors if appropriate. Put the
8899 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8900 option is valid. */
8901
8902 static bool
8903 aarch64_validate_march (const char *str, const struct processor **res,
8904 unsigned long *isa_flags)
8905 {
8906 enum aarch64_parse_opt_result parse_res
8907 = aarch64_parse_arch (str, res, isa_flags);
8908
8909 if (parse_res == AARCH64_PARSE_OK)
8910 return true;
8911
8912 switch (parse_res)
8913 {
8914 case AARCH64_PARSE_MISSING_ARG:
8915 error ("missing arch name in %<-march=%s%>", str);
8916 break;
8917 case AARCH64_PARSE_INVALID_ARG:
8918 error ("unknown value %qs for -march", str);
8919 aarch64_print_hint_for_arch (str);
8920 break;
8921 case AARCH64_PARSE_INVALID_FEATURE:
8922 error ("invalid feature modifier in %<-march=%s%>", str);
8923 break;
8924 default:
8925 gcc_unreachable ();
8926 }
8927
8928 return false;
8929 }
8930
8931 /* Validate a command-line -mtune option. Parse the cpu
8932 specified in STR and throw errors if appropriate. Put the
8933 result, if it is valid, in RES. Return whether the option is
8934 valid. */
8935
8936 static bool
8937 aarch64_validate_mtune (const char *str, const struct processor **res)
8938 {
8939 enum aarch64_parse_opt_result parse_res
8940 = aarch64_parse_tune (str, res);
8941
8942 if (parse_res == AARCH64_PARSE_OK)
8943 return true;
8944
8945 switch (parse_res)
8946 {
8947 case AARCH64_PARSE_MISSING_ARG:
8948 error ("missing cpu name in %<-mtune=%s%>", str);
8949 break;
8950 case AARCH64_PARSE_INVALID_ARG:
8951 error ("unknown value %qs for -mtune", str);
8952 aarch64_print_hint_for_core (str);
8953 break;
8954 default:
8955 gcc_unreachable ();
8956 }
8957 return false;
8958 }
8959
8960 /* Return the CPU corresponding to the enum CPU.
8961 If it doesn't specify a cpu, return the default. */
8962
8963 static const struct processor *
8964 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8965 {
8966 if (cpu != aarch64_none)
8967 return &all_cores[cpu];
8968
8969 /* The & 0x3f is to extract the bottom 6 bits that encode the
8970 default cpu as selected by the --with-cpu GCC configure option
8971 in config.gcc.
8972 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8973 flags mechanism should be reworked to make it more sane. */
8974 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8975 }
8976
8977 /* Return the architecture corresponding to the enum ARCH.
8978 If it doesn't specify a valid architecture, return the default. */
8979
8980 static const struct processor *
8981 aarch64_get_arch (enum aarch64_arch arch)
8982 {
8983 if (arch != aarch64_no_arch)
8984 return &all_architectures[arch];
8985
8986 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8987
8988 return &all_architectures[cpu->arch];
8989 }
8990
8991 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8992 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8993 tuning structs. In particular it must set selected_tune and
8994 aarch64_isa_flags that define the available ISA features and tuning
8995 decisions. It must also set selected_arch as this will be used to
8996 output the .arch asm tags for each function. */
8997
8998 static void
8999 aarch64_override_options (void)
9000 {
9001 unsigned long cpu_isa = 0;
9002 unsigned long arch_isa = 0;
9003 aarch64_isa_flags = 0;
9004
9005 bool valid_cpu = true;
9006 bool valid_tune = true;
9007 bool valid_arch = true;
9008
9009 selected_cpu = NULL;
9010 selected_arch = NULL;
9011 selected_tune = NULL;
9012
9013 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9014 If either of -march or -mtune is given, they override their
9015 respective component of -mcpu. */
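/* For example (illustrative only), "-mcpu=cortex-a57 -mtune=cortex-a72"
   takes the architecture and ISA features implied by cortex-a57 but tunes
   for cortex-a72.  */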
9016 if (aarch64_cpu_string)
9017 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9018 &cpu_isa);
9019
9020 if (aarch64_arch_string)
9021 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9022 &arch_isa);
9023
9024 if (aarch64_tune_string)
9025 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9026
9027 /* If the user did not specify a processor, choose the default
9028 one for them. This will be the CPU set during configuration using
9029 --with-cpu, otherwise it is "generic". */
9030 if (!selected_cpu)
9031 {
9032 if (selected_arch)
9033 {
9034 selected_cpu = &all_cores[selected_arch->ident];
9035 aarch64_isa_flags = arch_isa;
9036 explicit_arch = selected_arch->arch;
9037 }
9038 else
9039 {
9040 /* Get default configure-time CPU. */
9041 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9042 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9043 }
9044
9045 if (selected_tune)
9046 explicit_tune_core = selected_tune->ident;
9047 }
9048 /* If both -mcpu and -march are specified, check that they are architecturally
9049 compatible; warn if they are not, and prefer the -march ISA flags. */
9050 else if (selected_arch)
9051 {
9052 if (selected_arch->arch != selected_cpu->arch)
9053 {
9054 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9055 all_architectures[selected_cpu->arch].name,
9056 selected_arch->name);
9057 }
9058 aarch64_isa_flags = arch_isa;
9059 explicit_arch = selected_arch->arch;
9060 explicit_tune_core = selected_tune ? selected_tune->ident
9061 : selected_cpu->ident;
9062 }
9063 else
9064 {
9065 /* -mcpu but no -march. */
9066 aarch64_isa_flags = cpu_isa;
9067 explicit_tune_core = selected_tune ? selected_tune->ident
9068 : selected_cpu->ident;
9069 gcc_assert (selected_cpu);
9070 selected_arch = &all_architectures[selected_cpu->arch];
9071 explicit_arch = selected_arch->arch;
9072 }
9073
9074 /* Set the arch as well, as we will need it when outputting
9075 the .arch directive in assembly. */
9076 if (!selected_arch)
9077 {
9078 gcc_assert (selected_cpu);
9079 selected_arch = &all_architectures[selected_cpu->arch];
9080 }
9081
9082 if (!selected_tune)
9083 selected_tune = selected_cpu;
9084
9085 #ifndef HAVE_AS_MABI_OPTION
9086 /* The compiler may have been configured with 2.23.* binutils, which does
9087 not have support for ILP32. */
9088 if (TARGET_ILP32)
9089 error ("Assembler does not support -mabi=ilp32");
9090 #endif
9091
9092 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9093 sorry ("Return address signing is only supported for -mabi=lp64");
9094
9095 /* Make sure we properly set up the explicit options. */
9096 if ((aarch64_cpu_string && valid_cpu)
9097 || (aarch64_tune_string && valid_tune))
9098 gcc_assert (explicit_tune_core != aarch64_none);
9099
9100 if ((aarch64_cpu_string && valid_cpu)
9101 || (aarch64_arch_string && valid_arch))
9102 gcc_assert (explicit_arch != aarch64_no_arch);
9103
9104 aarch64_override_options_internal (&global_options);
9105
9106 /* Save these options as the default ones in case we push and pop them later
9107 while processing functions with potential target attributes. */
9108 target_option_default_node = target_option_current_node
9109 = build_target_option_node (&global_options);
9110 }
9111
9112 /* Implement targetm.override_options_after_change. */
9113
9114 static void
9115 aarch64_override_options_after_change (void)
9116 {
9117 aarch64_override_options_after_change_1 (&global_options);
9118 }
9119
9120 static struct machine_function *
9121 aarch64_init_machine_status (void)
9122 {
9123 struct machine_function *machine;
9124 machine = ggc_cleared_alloc<machine_function> ();
9125 return machine;
9126 }
9127
9128 void
9129 aarch64_init_expanders (void)
9130 {
9131 init_machine_status = aarch64_init_machine_status;
9132 }
9133
9134 /* Select and validate the code model to use, based on the options in OPTS. */
9135 static void
9136 initialize_aarch64_code_model (struct gcc_options *opts)
9137 {
9138 if (opts->x_flag_pic)
9139 {
9140 switch (opts->x_aarch64_cmodel_var)
9141 {
9142 case AARCH64_CMODEL_TINY:
9143 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9144 break;
9145 case AARCH64_CMODEL_SMALL:
9146 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9147 aarch64_cmodel = (flag_pic == 2
9148 ? AARCH64_CMODEL_SMALL_PIC
9149 : AARCH64_CMODEL_SMALL_SPIC);
9150 #else
9151 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9152 #endif
9153 break;
9154 case AARCH64_CMODEL_LARGE:
9155 sorry ("code model %qs with -f%s", "large",
9156 opts->x_flag_pic > 1 ? "PIC" : "pic");
9157 break;
9158 default:
9159 gcc_unreachable ();
9160 }
9161 }
9162 else
9163 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9164 }
9165
9166 /* Implement TARGET_OPTION_SAVE. */
9167
9168 static void
9169 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9170 {
9171 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9172 }
9173
9174 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9175 using the information saved in PTR. */
9176
9177 static void
9178 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9179 {
9180 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9181 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9182 opts->x_explicit_arch = ptr->x_explicit_arch;
9183 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9184 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9185
9186 aarch64_override_options_internal (opts);
9187 }
9188
9189 /* Implement TARGET_OPTION_PRINT. */
9190
9191 static void
9192 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9193 {
9194 const struct processor *cpu
9195 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9196 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9197 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9198 std::string extension
9199 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9200
9201 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9202 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9203 arch->name, extension.c_str ());
9204 }
9205
9206 static GTY(()) tree aarch64_previous_fndecl;
9207
9208 void
9209 aarch64_reset_previous_fndecl (void)
9210 {
9211 aarch64_previous_fndecl = NULL;
9212 }
9213
9214 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9215 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9216 make sure optab availability predicates are recomputed when necessary. */
9217
9218 void
9219 aarch64_save_restore_target_globals (tree new_tree)
9220 {
9221 if (TREE_TARGET_GLOBALS (new_tree))
9222 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9223 else if (new_tree == target_option_default_node)
9224 restore_target_globals (&default_target_globals);
9225 else
9226 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9227 }
9228
9229 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9230 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9231 of the function, if such exists. This function may be called multiple
9232 times on a single function so use aarch64_previous_fndecl to avoid
9233 setting up identical state. */
9234
9235 static void
9236 aarch64_set_current_function (tree fndecl)
9237 {
9238 if (!fndecl || fndecl == aarch64_previous_fndecl)
9239 return;
9240
9241 tree old_tree = (aarch64_previous_fndecl
9242 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9243 : NULL_TREE);
9244
9245 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9246
9247 /* If current function has no attributes but the previous one did,
9248 use the default node. */
9249 if (!new_tree && old_tree)
9250 new_tree = target_option_default_node;
9251
9252 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9253 the default have been handled by aarch64_save_restore_target_globals from
9254 aarch64_pragma_target_parse. */
9255 if (old_tree == new_tree)
9256 return;
9257
9258 aarch64_previous_fndecl = fndecl;
9259
9260 /* First set the target options. */
9261 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9262
9263 aarch64_save_restore_target_globals (new_tree);
9264 }
9265
9266 /* Enum describing the various ways we can handle attributes.
9267 In many cases we can reuse the generic option handling machinery. */
9268
9269 enum aarch64_attr_opt_type
9270 {
9271 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9272 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9273 aarch64_attr_enum, /* Attribute sets an enum variable. */
9274 aarch64_attr_custom /* Attribute requires a custom handling function. */
9275 };
9276
9277 /* All the information needed to handle a target attribute.
9278 NAME is the name of the attribute.
9279 ATTR_TYPE specifies the type of behavior of the attribute as described
9280 in the definition of enum aarch64_attr_opt_type.
9281 ALLOW_NEG is true if the attribute supports a "no-" form.
9282 HANDLER is the function that takes the attribute string and whether
9283 it is a pragma or attribute and handles the option. It is needed only
9284 when the ATTR_TYPE is aarch64_attr_custom.
9285 OPT_NUM is the enum specifying the option that the attribute modifies.
9286 This is needed for attributes that mirror the behavior of a command-line
9287 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9288 aarch64_attr_enum. */
9289
9290 struct aarch64_attribute_info
9291 {
9292 const char *name;
9293 enum aarch64_attr_opt_type attr_type;
9294 bool allow_neg;
9295 bool (*handler) (const char *, const char *);
9296 enum opt_code opt_num;
9297 };
9298
9299 /* Handle the ARCH_STR argument to the arch= target attribute.
9300 PRAGMA_OR_ATTR is used in potential error messages. */
9301
9302 static bool
9303 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9304 {
9305 const struct processor *tmp_arch = NULL;
9306 enum aarch64_parse_opt_result parse_res
9307 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9308
9309 if (parse_res == AARCH64_PARSE_OK)
9310 {
9311 gcc_assert (tmp_arch);
9312 selected_arch = tmp_arch;
9313 explicit_arch = selected_arch->arch;
9314 return true;
9315 }
9316
9317 switch (parse_res)
9318 {
9319 case AARCH64_PARSE_MISSING_ARG:
9320 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9321 break;
9322 case AARCH64_PARSE_INVALID_ARG:
9323 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9324 aarch64_print_hint_for_arch (str);
9325 break;
9326 case AARCH64_PARSE_INVALID_FEATURE:
9327 error ("invalid feature modifier %qs for 'arch' target %s",
9328 str, pragma_or_attr);
9329 break;
9330 default:
9331 gcc_unreachable ();
9332 }
9333
9334 return false;
9335 }
9336
9337 /* Handle the argument CPU_STR to the cpu= target attribute.
9338 PRAGMA_OR_ATTR is used in potential error messages. */
9339
9340 static bool
9341 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9342 {
9343 const struct processor *tmp_cpu = NULL;
9344 enum aarch64_parse_opt_result parse_res
9345 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9346
9347 if (parse_res == AARCH64_PARSE_OK)
9348 {
9349 gcc_assert (tmp_cpu);
9350 selected_tune = tmp_cpu;
9351 explicit_tune_core = selected_tune->ident;
9352
9353 selected_arch = &all_architectures[tmp_cpu->arch];
9354 explicit_arch = selected_arch->arch;
9355 return true;
9356 }
9357
9358 switch (parse_res)
9359 {
9360 case AARCH64_PARSE_MISSING_ARG:
9361 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9362 break;
9363 case AARCH64_PARSE_INVALID_ARG:
9364 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9365 aarch64_print_hint_for_core (str);
9366 break;
9367 case AARCH64_PARSE_INVALID_FEATURE:
9368 error ("invalid feature modifier %qs for 'cpu' target %s",
9369 str, pragma_or_attr);
9370 break;
9371 default:
9372 gcc_unreachable ();
9373 }
9374
9375 return false;
9376 }
9377
9378 /* Handle the argument STR to the tune= target attribute.
9379 PRAGMA_OR_ATTR is used in potential error messages. */
9380
9381 static bool
9382 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9383 {
9384 const struct processor *tmp_tune = NULL;
9385 enum aarch64_parse_opt_result parse_res
9386 = aarch64_parse_tune (str, &tmp_tune);
9387
9388 if (parse_res == AARCH64_PARSE_OK)
9389 {
9390 gcc_assert (tmp_tune);
9391 selected_tune = tmp_tune;
9392 explicit_tune_core = selected_tune->ident;
9393 return true;
9394 }
9395
9396 switch (parse_res)
9397 {
9398 case AARCH64_PARSE_INVALID_ARG:
9399 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9400 aarch64_print_hint_for_core (str);
9401 break;
9402 default:
9403 gcc_unreachable ();
9404 }
9405
9406 return false;
9407 }
9408
9409 /* Parse the architecture-extension target attribute string in STR,
9410 for example "+fp+nosimd". Report any errors if needed. Return TRUE
9411 if successful and update aarch64_isa_flags to reflect the ISA features
9412 that were modified.
9413 PRAGMA_OR_ATTR is used in potential error messages. */
9414
9415 static bool
9416 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9417 {
9418 enum aarch64_parse_opt_result parse_res;
9419 unsigned long isa_flags = aarch64_isa_flags;
9420
9421 /* We allow "+nothing" in the beginning to clear out all architectural
9422 features if the user wants to handpick specific features. */
9423 if (strncmp ("+nothing", str, 8) == 0)
9424 {
9425 isa_flags = 0;
9426 str += 8;
9427 }
9428
9429 parse_res = aarch64_parse_extension (str, &isa_flags);
9430
9431 if (parse_res == AARCH64_PARSE_OK)
9432 {
9433 aarch64_isa_flags = isa_flags;
9434 return true;
9435 }
9436
9437 switch (parse_res)
9438 {
9439 case AARCH64_PARSE_MISSING_ARG:
9440 error ("missing feature modifier in target %s %qs",
9441 pragma_or_attr, str);
9442 break;
9443
9444 case AARCH64_PARSE_INVALID_FEATURE:
9445 error ("invalid feature modifier in target %s %qs",
9446 pragma_or_attr, str);
9447 break;
9448
9449 default:
9450 gcc_unreachable ();
9451 }
9452
9453 return false;
9454 }
9455
9456 /* The target attributes that we support. On top of these we also support just
9457 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9458 handled explicitly in aarch64_process_one_target_attr. */
9459
9460 static const struct aarch64_attribute_info aarch64_attributes[] =
9461 {
9462 { "general-regs-only", aarch64_attr_mask, false, NULL,
9463 OPT_mgeneral_regs_only },
9464 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9465 OPT_mfix_cortex_a53_835769 },
9466 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9467 OPT_mfix_cortex_a53_843419 },
9468 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9469 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9470 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9471 OPT_momit_leaf_frame_pointer },
9472 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9473 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9474 OPT_march_ },
9475 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9476 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9477 OPT_mtune_ },
9478 { "sign-return-address", aarch64_attr_enum, false, NULL,
9479 OPT_msign_return_address_ },
9480 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9481 };
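
/* Editor's note: an illustrative sketch (not part of this file) of the kind
   of user-level code the table above is meant to handle.  The function names
   below are hypothetical; the attribute strings map onto the table entries
   and onto the "+..." ISA-extension form handled separately in
   aarch64_process_one_target_attr.  */
#if 0
/* Maps to the "arch" entry via aarch64_handle_attr_arch.  */
__attribute__ ((target ("arch=armv8-a+crc")))
int with_crc (int x) { return x; }

/* "omit-leaf-frame-pointer" allows a negated form (allow_neg == true);
   "cmodel" is an aarch64_attr_enum and therefore requires an argument.  */
__attribute__ ((target ("no-omit-leaf-frame-pointer,cmodel=small")))
int leaf (void) { return 0; }

/* Pure ISA-extension string, handled by aarch64_handle_attr_isa_flags;
   "+nothing" first clears all extension bits.  */
#pragma GCC target ("+nothing+fp+simd")
#endif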
9482
9483 /* Parse ARG_STR which contains the definition of one target attribute.
9484 Show appropriate errors if any or return true if the attribute is valid.
9485 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9486 we're processing a target attribute or pragma. */
9487
9488 static bool
9489 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9490 {
9491 bool invert = false;
9492
9493 size_t len = strlen (arg_str);
9494
9495 if (len == 0)
9496 {
9497 error ("malformed target %s", pragma_or_attr);
9498 return false;
9499 }
9500
9501 char *str_to_check = (char *) alloca (len + 1);
9502 strcpy (str_to_check, arg_str);
9503
9504 /* Skip leading whitespace. */
9505 while (*str_to_check == ' ' || *str_to_check == '\t')
9506 str_to_check++;
9507
9508 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9509 It is easier to detect and handle it explicitly here rather than going
9510 through the machinery for the rest of the target attributes in this
9511 function. */
9512 if (*str_to_check == '+')
9513 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9514
9515 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9516 {
9517 invert = true;
9518 str_to_check += 3;
9519 }
9520 char *arg = strchr (str_to_check, '=');
9521
9522 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9523 and point ARG to "foo". */
9524 if (arg)
9525 {
9526 *arg = '\0';
9527 arg++;
9528 }
9529 const struct aarch64_attribute_info *p_attr;
9530 bool found = false;
9531 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9532 {
9533 /* If the names don't match up, or the user has given an argument
9534 to an attribute that doesn't accept one, or didn't give an argument
9535 to an attribute that expects one, fail to match. */
9536 if (strcmp (str_to_check, p_attr->name) != 0)
9537 continue;
9538
9539 found = true;
9540 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9541 || p_attr->attr_type == aarch64_attr_enum;
9542
9543 if (attr_need_arg_p ^ (arg != NULL))
9544 {
9545 error ("target %s %qs does not accept an argument",
9546 pragma_or_attr, str_to_check);
9547 return false;
9548 }
9549
9550 /* If the name matches but the attribute does not allow "no-" versions
9551 then we can't match. */
9552 if (invert && !p_attr->allow_neg)
9553 {
9554 error ("target %s %qs does not allow a negated form",
9555 pragma_or_attr, str_to_check);
9556 return false;
9557 }
9558
9559 switch (p_attr->attr_type)
9560 {
9561 /* Has a custom handler registered.
9562 For example, cpu=, arch=, tune=. */
9563 case aarch64_attr_custom:
9564 gcc_assert (p_attr->handler);
9565 if (!p_attr->handler (arg, pragma_or_attr))
9566 return false;
9567 break;
9568
9569 /* Either set or unset a boolean option. */
9570 case aarch64_attr_bool:
9571 {
9572 struct cl_decoded_option decoded;
9573
9574 generate_option (p_attr->opt_num, NULL, !invert,
9575 CL_TARGET, &decoded);
9576 aarch64_handle_option (&global_options, &global_options_set,
9577 &decoded, input_location);
9578 break;
9579 }
9580 /* Set or unset a bit in the target_flags. aarch64_handle_option
9581 should know what mask to apply given the option number. */
9582 case aarch64_attr_mask:
9583 {
9584 struct cl_decoded_option decoded;
9585 /* We only need to specify the option number.
9586 aarch64_handle_option will know which mask to apply. */
9587 decoded.opt_index = p_attr->opt_num;
9588 decoded.value = !invert;
9589 aarch64_handle_option (&global_options, &global_options_set,
9590 &decoded, input_location);
9591 break;
9592 }
9593 /* Use the option setting machinery to set an option to an enum. */
9594 case aarch64_attr_enum:
9595 {
9596 gcc_assert (arg);
9597 bool valid;
9598 int value;
9599 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9600 &value, CL_TARGET);
9601 if (valid)
9602 {
9603 set_option (&global_options, NULL, p_attr->opt_num, value,
9604 NULL, DK_UNSPECIFIED, input_location,
9605 global_dc);
9606 }
9607 else
9608 {
9609 error ("target %s %s=%s is not valid",
9610 pragma_or_attr, str_to_check, arg);
9611 }
9612 break;
9613 }
9614 default:
9615 gcc_unreachable ();
9616 }
9617 }
9618
9619 /* If we reached here we either have found an attribute and validated
9620 it or didn't match any. If we matched an attribute but its arguments
9621 were malformed we will have returned false already. */
9622 return found;
9623 }
9624
9625 /* Count how many times the character C appears in
9626 NULL-terminated string STR. */
9627
9628 static unsigned int
9629 num_occurences_in_str (char c, char *str)
9630 {
9631 unsigned int res = 0;
9632 while (*str != '\0')
9633 {
9634 if (*str == c)
9635 res++;
9636
9637 str++;
9638 }
9639
9640 return res;
9641 }
9642
9643 /* Parse the tree in ARGS that contains the target attribute information
9644 and update the global target options space. PRAGMA_OR_ATTR is a string
9645 to be used in error messages, specifying whether this is processing
9646 a target attribute or a target pragma. */
9647
9648 bool
9649 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9650 {
9651 if (TREE_CODE (args) == TREE_LIST)
9652 {
9653 do
9654 {
9655 tree head = TREE_VALUE (args);
9656 if (head)
9657 {
9658 if (!aarch64_process_target_attr (head, pragma_or_attr))
9659 return false;
9660 }
9661 args = TREE_CHAIN (args);
9662 } while (args);
9663
9664 return true;
9665 }
9666
9667 if (TREE_CODE (args) != STRING_CST)
9668 {
9669 error ("attribute %<target%> argument not a string");
9670 return false;
9671 }
9672
9673 size_t len = strlen (TREE_STRING_POINTER (args));
9674 char *str_to_check = (char *) alloca (len + 1);
9675 strcpy (str_to_check, TREE_STRING_POINTER (args));
9676
9677 if (len == 0)
9678 {
9679 error ("malformed target %s value", pragma_or_attr);
9680 return false;
9681 }
9682
9683 /* Used to catch empty entries between commas, e.g.
9684 attribute ((target ("attr1,,attr2"))). */
9685 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9686
9687 /* Handle multiple target attributes separated by ','. */
9688 char *token = strtok (str_to_check, ",");
9689
9690 unsigned int num_attrs = 0;
9691 while (token)
9692 {
9693 num_attrs++;
9694 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9695 {
9696 error ("target %s %qs is invalid", pragma_or_attr, token);
9697 return false;
9698 }
9699
9700 token = strtok (NULL, ",");
9701 }
9702
9703 if (num_attrs != num_commas + 1)
9704 {
9705 error ("malformed target %s list %qs",
9706 pragma_or_attr, TREE_STRING_POINTER (args));
9707 return false;
9708 }
9709
9710 return true;
9711 }
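
/* Editor's note: a worked example of the comma handling above (the attribute
   strings are examples only).  For
   attribute ((target ("fix-cortex-a53-835769,tune=cortex-a57"))) strtok
   yields two tokens and the string contains one comma, so
   num_attrs == num_commas + 1 and both tokens are processed.  For
   attribute ((target ("strict-align,,arch=armv8-a"))) strtok skips the empty
   entry and yields only two tokens while the string contains two commas, so
   num_attrs (2) != num_commas + 1 (3) and the "malformed target ... list"
   error is reported.  */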
9712
9713 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9714 process attribute ((target ("..."))). */
9715
9716 static bool
9717 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9718 {
9719 struct cl_target_option cur_target;
9720 bool ret;
9721 tree old_optimize;
9722 tree new_target, new_optimize;
9723 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9724
9725 /* If what we're processing is the current pragma string then the
9726 target option node is already stored in target_option_current_node
9727 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9728 having to re-parse the string. This is especially useful to keep
9729 arm_neon.h compile times down since that header contains a lot
9730 of intrinsics enclosed in pragmas. */
9731 if (!existing_target && args == current_target_pragma)
9732 {
9733 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9734 return true;
9735 }
9736 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9737
9738 old_optimize = build_optimization_node (&global_options);
9739 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9740
9741 /* If the function changed the optimization levels as well as setting
9742 target options, start with the optimizations specified. */
9743 if (func_optimize && func_optimize != old_optimize)
9744 cl_optimization_restore (&global_options,
9745 TREE_OPTIMIZATION (func_optimize));
9746
9747 /* Save the current target options to restore at the end. */
9748 cl_target_option_save (&cur_target, &global_options);
9749
9750 /* If fndecl already has some target attributes applied to it, unpack
9751 them so that we add this attribute on top of them, rather than
9752 overwriting them. */
9753 if (existing_target)
9754 {
9755 struct cl_target_option *existing_options
9756 = TREE_TARGET_OPTION (existing_target);
9757
9758 if (existing_options)
9759 cl_target_option_restore (&global_options, existing_options);
9760 }
9761 else
9762 cl_target_option_restore (&global_options,
9763 TREE_TARGET_OPTION (target_option_current_node));
9764
9765
9766 ret = aarch64_process_target_attr (args, "attribute");
9767
9768 /* Set up any additional state. */
9769 if (ret)
9770 {
9771 aarch64_override_options_internal (&global_options);
9772 /* Initialize SIMD builtins if we haven't already.
9773 Set current_target_pragma to NULL for the duration so that
9774 the builtin initialization code doesn't try to tag the functions
9775 being built with the attributes specified by any current pragma, thus
9776 going into an infinite recursion. */
9777 if (TARGET_SIMD)
9778 {
9779 tree saved_current_target_pragma = current_target_pragma;
9780 current_target_pragma = NULL;
9781 aarch64_init_simd_builtins ();
9782 current_target_pragma = saved_current_target_pragma;
9783 }
9784 new_target = build_target_option_node (&global_options);
9785 }
9786 else
9787 new_target = NULL;
9788
9789 new_optimize = build_optimization_node (&global_options);
9790
9791 if (fndecl && ret)
9792 {
9793 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9794
9795 if (old_optimize != new_optimize)
9796 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9797 }
9798
9799 cl_target_option_restore (&global_options, &cur_target);
9800
9801 if (old_optimize != new_optimize)
9802 cl_optimization_restore (&global_options,
9803 TREE_OPTIMIZATION (old_optimize));
9804 return ret;
9805 }
9806
9807 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9808 tri-bool options (yes, no, don't care) and the default value is
9809 DEF, determine whether to reject inlining. */
9810
9811 static bool
9812 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9813 int dont_care, int def)
9814 {
9815 /* If the callee doesn't care, always allow inlining. */
9816 if (callee == dont_care)
9817 return true;
9818
9819 /* If the caller doesn't care, always allow inlining. */
9820 if (caller == dont_care)
9821 return true;
9822
9823 /* Otherwise, allow inlining if either the callee and caller values
9824 agree, or if the callee is using the default value. */
9825 return (callee == caller || callee == def);
9826 }
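
/* Editor's note: a small illustrative truth table for the helper above,
   using DONT_CARE == 2 as at the call sites below:
     caller == 1, callee == 2            -> true  (callee doesn't care)
     caller == 2, callee == 0            -> true  (caller doesn't care)
     caller == 1, callee == 1, def == 0  -> true  (values agree)
     caller == 1, callee == 0, def == 0  -> true  (callee uses the default)
     caller == 0, callee == 1, def == 0  -> false (explicit mismatch).  */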
9827
9828 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9829 to inline CALLEE into CALLER based on target-specific info.
9830 Make sure that the caller and callee have compatible architectural
9831 features. Then go through the other possible target attributes
9832 and see if they can block inlining. Try not to reject always_inline
9833 callees unless they are incompatible architecturally. */
9834
9835 static bool
9836 aarch64_can_inline_p (tree caller, tree callee)
9837 {
9838 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9839 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9840
9841 /* If callee has no option attributes, then it is ok to inline. */
9842 if (!callee_tree)
9843 return true;
9844
9845 struct cl_target_option *caller_opts
9846 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9847 : target_option_default_node);
9848
9849 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9850
9851
9852 /* Callee's ISA flags should be a subset of the caller's. */
9853 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9854 != callee_opts->x_aarch64_isa_flags)
9855 return false;
9856
9857 /* Allow non-strict aligned functions inlining into strict
9858 aligned ones. */
9859 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9860 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9861 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9862 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9863 return false;
9864
9865 bool always_inline = lookup_attribute ("always_inline",
9866 DECL_ATTRIBUTES (callee));
9867
9868 /* If the architectural features match up and the callee is always_inline
9869 then the other attributes don't matter. */
9870 if (always_inline)
9871 return true;
9872
9873 if (caller_opts->x_aarch64_cmodel_var
9874 != callee_opts->x_aarch64_cmodel_var)
9875 return false;
9876
9877 if (caller_opts->x_aarch64_tls_dialect
9878 != callee_opts->x_aarch64_tls_dialect)
9879 return false;
9880
9881 /* Honour explicit requests to work around errata. */
9882 if (!aarch64_tribools_ok_for_inlining_p (
9883 caller_opts->x_aarch64_fix_a53_err835769,
9884 callee_opts->x_aarch64_fix_a53_err835769,
9885 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9886 return false;
9887
9888 if (!aarch64_tribools_ok_for_inlining_p (
9889 caller_opts->x_aarch64_fix_a53_err843419,
9890 callee_opts->x_aarch64_fix_a53_err843419,
9891 2, TARGET_FIX_ERR_A53_843419))
9892 return false;
9893
9894 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9895 caller and callee and they don't match up, reject inlining. */
9896 if (!aarch64_tribools_ok_for_inlining_p (
9897 caller_opts->x_flag_omit_leaf_frame_pointer,
9898 callee_opts->x_flag_omit_leaf_frame_pointer,
9899 2, 1))
9900 return false;
9901
9902 /* If the callee has specific tuning overrides, respect them. */
9903 if (callee_opts->x_aarch64_override_tune_string != NULL
9904 && caller_opts->x_aarch64_override_tune_string == NULL)
9905 return false;
9906
9907 /* If the user specified tuning override strings for the
9908 caller and callee and they don't match up, reject inlining.
9909 We just do a string compare here, we don't analyze the meaning
9910 of the string, as it would be too costly for little gain. */
9911 if (callee_opts->x_aarch64_override_tune_string
9912 && caller_opts->x_aarch64_override_tune_string
9913 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9914 caller_opts->x_aarch64_override_tune_string) != 0))
9915 return false;
9916
9917 return true;
9918 }
9919
9920 /* Return true if SYMBOL_REF X binds locally. */
9921
9922 static bool
9923 aarch64_symbol_binds_local_p (const_rtx x)
9924 {
9925 return (SYMBOL_REF_DECL (x)
9926 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9927 : SYMBOL_REF_LOCAL_P (x));
9928 }
9929
9930 /* Return true if SYMBOL_REF X is thread local */
9931 static bool
9932 aarch64_tls_symbol_p (rtx x)
9933 {
9934 if (! TARGET_HAVE_TLS)
9935 return false;
9936
9937 if (GET_CODE (x) != SYMBOL_REF)
9938 return false;
9939
9940 return SYMBOL_REF_TLS_MODEL (x) != 0;
9941 }
9942
9943 /* Classify a TLS symbol into one of the TLS kinds. */
9944 enum aarch64_symbol_type
9945 aarch64_classify_tls_symbol (rtx x)
9946 {
9947 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9948
9949 switch (tls_kind)
9950 {
9951 case TLS_MODEL_GLOBAL_DYNAMIC:
9952 case TLS_MODEL_LOCAL_DYNAMIC:
9953 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9954
9955 case TLS_MODEL_INITIAL_EXEC:
9956 switch (aarch64_cmodel)
9957 {
9958 case AARCH64_CMODEL_TINY:
9959 case AARCH64_CMODEL_TINY_PIC:
9960 return SYMBOL_TINY_TLSIE;
9961 default:
9962 return SYMBOL_SMALL_TLSIE;
9963 }
9964
9965 case TLS_MODEL_LOCAL_EXEC:
9966 if (aarch64_tls_size == 12)
9967 return SYMBOL_TLSLE12;
9968 else if (aarch64_tls_size == 24)
9969 return SYMBOL_TLSLE24;
9970 else if (aarch64_tls_size == 32)
9971 return SYMBOL_TLSLE32;
9972 else if (aarch64_tls_size == 48)
9973 return SYMBOL_TLSLE48;
9974 else
9975 gcc_unreachable ();
9976
9977 case TLS_MODEL_EMULATED:
9978 case TLS_MODEL_NONE:
9979 return SYMBOL_FORCE_TO_MEM;
9980
9981 default:
9982 gcc_unreachable ();
9983 }
9984 }
9985
9986 /* Return the method that should be used to access SYMBOL_REF or
9987 LABEL_REF X. */
9988
9989 enum aarch64_symbol_type
9990 aarch64_classify_symbol (rtx x, rtx offset)
9991 {
9992 if (GET_CODE (x) == LABEL_REF)
9993 {
9994 switch (aarch64_cmodel)
9995 {
9996 case AARCH64_CMODEL_LARGE:
9997 return SYMBOL_FORCE_TO_MEM;
9998
9999 case AARCH64_CMODEL_TINY_PIC:
10000 case AARCH64_CMODEL_TINY:
10001 return SYMBOL_TINY_ABSOLUTE;
10002
10003 case AARCH64_CMODEL_SMALL_SPIC:
10004 case AARCH64_CMODEL_SMALL_PIC:
10005 case AARCH64_CMODEL_SMALL:
10006 return SYMBOL_SMALL_ABSOLUTE;
10007
10008 default:
10009 gcc_unreachable ();
10010 }
10011 }
10012
10013 if (GET_CODE (x) == SYMBOL_REF)
10014 {
10015 if (aarch64_tls_symbol_p (x))
10016 return aarch64_classify_tls_symbol (x);
10017
10018 switch (aarch64_cmodel)
10019 {
10020 case AARCH64_CMODEL_TINY:
10021 /* When we retrieve symbol + offset address, we have to make sure
10022 the offset does not cause overflow of the final address. But
10023 we have no way of knowing the address of symbol at compile time
10024 so we can't accurately say if the distance between the PC and
10025 symbol + offset is outside the addressable range of +/-1M in the
10026 TINY code model. So we rely on images not being greater than
10027 1M, cap the offset at 1M, and require anything beyond that to
10028 be loaded using an alternative mechanism. Furthermore, if the
10029 symbol is a weak reference to something that isn't known to
10030 resolve to a symbol in this module, then force to memory. */
10031 if ((SYMBOL_REF_WEAK (x)
10032 && !aarch64_symbol_binds_local_p (x))
10033 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10034 return SYMBOL_FORCE_TO_MEM;
10035 return SYMBOL_TINY_ABSOLUTE;
10036
10037 case AARCH64_CMODEL_SMALL:
10038 /* Same reasoning as the tiny code model, but the offset cap here is
10039 4G. */
10040 if ((SYMBOL_REF_WEAK (x)
10041 && !aarch64_symbol_binds_local_p (x))
10042 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10043 HOST_WIDE_INT_C (4294967264)))
10044 return SYMBOL_FORCE_TO_MEM;
10045 return SYMBOL_SMALL_ABSOLUTE;
10046
10047 case AARCH64_CMODEL_TINY_PIC:
10048 if (!aarch64_symbol_binds_local_p (x))
10049 return SYMBOL_TINY_GOT;
10050 return SYMBOL_TINY_ABSOLUTE;
10051
10052 case AARCH64_CMODEL_SMALL_SPIC:
10053 case AARCH64_CMODEL_SMALL_PIC:
10054 if (!aarch64_symbol_binds_local_p (x))
10055 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10056 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10057 return SYMBOL_SMALL_ABSOLUTE;
10058
10059 case AARCH64_CMODEL_LARGE:
10060 /* This is alright even in PIC code as the constant
10061 pool reference is always PC relative and within
10062 the same translation unit. */
10063 if (CONSTANT_POOL_ADDRESS_P (x))
10064 return SYMBOL_SMALL_ABSOLUTE;
10065 else
10066 return SYMBOL_FORCE_TO_MEM;
10067
10068 default:
10069 gcc_unreachable ();
10070 }
10071 }
10072
10073 /* By default push everything into the constant pool. */
10074 return SYMBOL_FORCE_TO_MEM;
10075 }
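
/* Editor's note: roughly, the classifications above correspond to the
   following addressing sequences emitted elsewhere in this backend
   (illustrative only; "sym" is a placeholder):
     SYMBOL_TINY_ABSOLUTE    adr  x0, sym
     SYMBOL_SMALL_ABSOLUTE   adrp x0, sym
                             add  x0, x0, :lo12:sym
     SYMBOL_SMALL_GOT_4G     adrp x0, :got:sym
                             ldr  x0, [x0, :got_lo12:sym]
     SYMBOL_FORCE_TO_MEM     the address is placed in the literal/constant
                             pool and loaded with a pc-relative ldr.  */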
10076
10077 bool
10078 aarch64_constant_address_p (rtx x)
10079 {
10080 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10081 }
10082
10083 bool
10084 aarch64_legitimate_pic_operand_p (rtx x)
10085 {
10086 if (GET_CODE (x) == SYMBOL_REF
10087 || (GET_CODE (x) == CONST
10088 && GET_CODE (XEXP (x, 0)) == PLUS
10089 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10090 return false;
10091
10092 return true;
10093 }
10094
10095 /* Return true if X holds either a quarter-precision or
10096 floating-point +0.0 constant. */
10097 static bool
10098 aarch64_valid_floating_const (machine_mode mode, rtx x)
10099 {
10100 if (!CONST_DOUBLE_P (x))
10101 return false;
10102
10103 if (aarch64_float_const_zero_rtx_p (x))
10104 return true;
10105
10106 /* For modes other than SFmode and DFmode, only the +0.0 case above is accepted. */
10107 if (!(mode == SFmode || mode == DFmode))
10108 return false;
10109
10110 return aarch64_float_const_representable_p (x);
10111 }
10112
10113 static bool
10114 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10115 {
10116 /* Do not allow vector struct mode constants. We could support
10117 0 and -1 easily, but they need support in aarch64-simd.md. */
10118 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10119 return false;
10120
10121 /* This could probably go away because
10122 we now decompose CONST_INTs according to expand_mov_immediate. */
10123 if ((GET_CODE (x) == CONST_VECTOR
10124 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10125 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
10126 return !targetm.cannot_force_const_mem (mode, x);
10127
10128 if (GET_CODE (x) == HIGH
10129 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10130 return true;
10131
10132 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10133 so spilling them is better than rematerialization. */
10134 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10135 return true;
10136
10137 return aarch64_constant_address_p (x);
10138 }
10139
10140 rtx
10141 aarch64_load_tp (rtx target)
10142 {
10143 if (!target
10144 || GET_MODE (target) != Pmode
10145 || !register_operand (target, Pmode))
10146 target = gen_reg_rtx (Pmode);
10147
10148 /* Can return in any reg. */
10149 emit_insn (gen_aarch64_load_tp_hard (target));
10150 return target;
10151 }
10152
10153 /* On AAPCS systems, this is the "struct __va_list". */
10154 static GTY(()) tree va_list_type;
10155
10156 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10157 Return the type to use as __builtin_va_list.
10158
10159 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10160
10161 struct __va_list
10162 {
10163 void *__stack;
10164 void *__gr_top;
10165 void *__vr_top;
10166 int __gr_offs;
10167 int __vr_offs;
10168 }; */
10169
10170 static tree
10171 aarch64_build_builtin_va_list (void)
10172 {
10173 tree va_list_name;
10174 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10175
10176 /* Create the type. */
10177 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10178 /* Give it the required name. */
10179 va_list_name = build_decl (BUILTINS_LOCATION,
10180 TYPE_DECL,
10181 get_identifier ("__va_list"),
10182 va_list_type);
10183 DECL_ARTIFICIAL (va_list_name) = 1;
10184 TYPE_NAME (va_list_type) = va_list_name;
10185 TYPE_STUB_DECL (va_list_type) = va_list_name;
10186
10187 /* Create the fields. */
10188 f_stack = build_decl (BUILTINS_LOCATION,
10189 FIELD_DECL, get_identifier ("__stack"),
10190 ptr_type_node);
10191 f_grtop = build_decl (BUILTINS_LOCATION,
10192 FIELD_DECL, get_identifier ("__gr_top"),
10193 ptr_type_node);
10194 f_vrtop = build_decl (BUILTINS_LOCATION,
10195 FIELD_DECL, get_identifier ("__vr_top"),
10196 ptr_type_node);
10197 f_groff = build_decl (BUILTINS_LOCATION,
10198 FIELD_DECL, get_identifier ("__gr_offs"),
10199 integer_type_node);
10200 f_vroff = build_decl (BUILTINS_LOCATION,
10201 FIELD_DECL, get_identifier ("__vr_offs"),
10202 integer_type_node);
10203
10204 /* Tell tree-stdarg pass about our internal offset fields.
10205 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10206 purposes, to identify whether the code is updating va_list internal
10207 offset fields in an irregular way. */
10208 va_list_gpr_counter_field = f_groff;
10209 va_list_fpr_counter_field = f_vroff;
10210
10211 DECL_ARTIFICIAL (f_stack) = 1;
10212 DECL_ARTIFICIAL (f_grtop) = 1;
10213 DECL_ARTIFICIAL (f_vrtop) = 1;
10214 DECL_ARTIFICIAL (f_groff) = 1;
10215 DECL_ARTIFICIAL (f_vroff) = 1;
10216
10217 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10218 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10219 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10220 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10221 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10222
10223 TYPE_FIELDS (va_list_type) = f_stack;
10224 DECL_CHAIN (f_stack) = f_grtop;
10225 DECL_CHAIN (f_grtop) = f_vrtop;
10226 DECL_CHAIN (f_vrtop) = f_groff;
10227 DECL_CHAIN (f_groff) = f_vroff;
10228
10229 /* Compute its layout. */
10230 layout_type (va_list_type);
10231
10232 return va_list_type;
10233 }
10234
10235 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10236 static void
10237 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10238 {
10239 const CUMULATIVE_ARGS *cum;
10240 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10241 tree stack, grtop, vrtop, groff, vroff;
10242 tree t;
10243 int gr_save_area_size = cfun->va_list_gpr_size;
10244 int vr_save_area_size = cfun->va_list_fpr_size;
10245 int vr_offset;
10246
10247 cum = &crtl->args.info;
10248 if (cfun->va_list_gpr_size)
10249 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10250 cfun->va_list_gpr_size);
10251 if (cfun->va_list_fpr_size)
10252 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10253 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10254
10255 if (!TARGET_FLOAT)
10256 {
10257 gcc_assert (cum->aapcs_nvrn == 0);
10258 vr_save_area_size = 0;
10259 }
10260
10261 f_stack = TYPE_FIELDS (va_list_type_node);
10262 f_grtop = DECL_CHAIN (f_stack);
10263 f_vrtop = DECL_CHAIN (f_grtop);
10264 f_groff = DECL_CHAIN (f_vrtop);
10265 f_vroff = DECL_CHAIN (f_groff);
10266
10267 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10268 NULL_TREE);
10269 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10270 NULL_TREE);
10271 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10272 NULL_TREE);
10273 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10274 NULL_TREE);
10275 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10276 NULL_TREE);
10277
10278 /* Emit code to initialize STACK, which points to the next varargs stack
10279 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10280 by named arguments. STACK is 8-byte aligned. */
10281 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10282 if (cum->aapcs_stack_size > 0)
10283 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10284 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10285 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10286
10287 /* Emit code to initialize GRTOP, the top of the GR save area.
10288 virtual_incoming_args_rtx should have been 16 byte aligned. */
10289 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10290 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10291 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10292
10293 /* Emit code to initialize VRTOP, the top of the VR save area.
10294 This address is gr_save_area_bytes below GRTOP, rounded
10295 down to the next 16-byte boundary. */
10296 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10297 vr_offset = ROUND_UP (gr_save_area_size,
10298 STACK_BOUNDARY / BITS_PER_UNIT);
10299
10300 if (vr_offset)
10301 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10302 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10303 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10304
10305 /* Emit code to initialize GROFF, the offset from GRTOP of the
10306 next GPR argument. */
10307 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10308 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10309 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10310
10311 /* Likewise emit code to initialize VROFF, the offset from FTOP
10312 of the next VR argument. */
10313 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10314 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10315 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10316 }
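
/* Editor's note: a worked example of the initialization above, assuming a
   definition such as "int f (int fixed, ...)" compiled with floating point
   enabled and the default va_list save sizes.  After the single named GPR
   argument, aapcs_ncrn == 1, aapcs_nvrn == 0 and aapcs_stack_size == 0, so
     gr_save_area_size = (8 - 1) * 8  = 56 bytes
     vr_save_area_size = (8 - 0) * 16 = 128 bytes
   and va_start sets
     __stack   = virtual_incoming_args_rtx (no named stack arguments)
     __gr_top  = virtual_incoming_args_rtx
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128.  */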
10317
10318 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10319
10320 static tree
10321 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10322 gimple_seq *post_p ATTRIBUTE_UNUSED)
10323 {
10324 tree addr;
10325 bool indirect_p;
10326 bool is_ha; /* is HFA or HVA. */
10327 bool dw_align; /* double-word align. */
10328 machine_mode ag_mode = VOIDmode;
10329 int nregs;
10330 machine_mode mode;
10331
10332 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10333 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10334 HOST_WIDE_INT size, rsize, adjust, align;
10335 tree t, u, cond1, cond2;
10336
10337 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10338 if (indirect_p)
10339 type = build_pointer_type (type);
10340
10341 mode = TYPE_MODE (type);
10342
10343 f_stack = TYPE_FIELDS (va_list_type_node);
10344 f_grtop = DECL_CHAIN (f_stack);
10345 f_vrtop = DECL_CHAIN (f_grtop);
10346 f_groff = DECL_CHAIN (f_vrtop);
10347 f_vroff = DECL_CHAIN (f_groff);
10348
10349 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10350 f_stack, NULL_TREE);
10351 size = int_size_in_bytes (type);
10352 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10353
10354 dw_align = false;
10355 adjust = 0;
10356 if (aarch64_vfp_is_call_or_return_candidate (mode,
10357 type,
10358 &ag_mode,
10359 &nregs,
10360 &is_ha))
10361 {
10362 /* TYPE passed in fp/simd registers. */
10363 if (!TARGET_FLOAT)
10364 aarch64_err_no_fpadvsimd (mode, "varargs");
10365
10366 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10367 unshare_expr (valist), f_vrtop, NULL_TREE);
10368 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10369 unshare_expr (valist), f_vroff, NULL_TREE);
10370
10371 rsize = nregs * UNITS_PER_VREG;
10372
10373 if (is_ha)
10374 {
10375 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10376 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10377 }
10378 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10379 && size < UNITS_PER_VREG)
10380 {
10381 adjust = UNITS_PER_VREG - size;
10382 }
10383 }
10384 else
10385 {
10386 /* TYPE passed in general registers. */
10387 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10388 unshare_expr (valist), f_grtop, NULL_TREE);
10389 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10390 unshare_expr (valist), f_groff, NULL_TREE);
10391 rsize = ROUND_UP (size, UNITS_PER_WORD);
10392 nregs = rsize / UNITS_PER_WORD;
10393
10394 if (align > 8)
10395 dw_align = true;
10396
10397 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10398 && size < UNITS_PER_WORD)
10399 {
10400 adjust = UNITS_PER_WORD - size;
10401 }
10402 }
10403
10404 /* Get a local temporary for the field value. */
10405 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10406
10407 /* Emit code to branch if off >= 0. */
10408 t = build2 (GE_EXPR, boolean_type_node, off,
10409 build_int_cst (TREE_TYPE (off), 0));
10410 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10411
10412 if (dw_align)
10413 {
10414 /* Emit: offs = (offs + 15) & -16. */
10415 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10416 build_int_cst (TREE_TYPE (off), 15));
10417 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10418 build_int_cst (TREE_TYPE (off), -16));
10419 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10420 }
10421 else
10422 roundup = NULL;
10423
10424 /* Update ap.__[g|v]r_offs */
10425 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10426 build_int_cst (TREE_TYPE (off), rsize));
10427 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10428
10429 /* String up. */
10430 if (roundup)
10431 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10432
10433 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10434 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10435 build_int_cst (TREE_TYPE (f_off), 0));
10436 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10437
10438 /* String up: make sure the assignment happens before the use. */
10439 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10440 COND_EXPR_ELSE (cond1) = t;
10441
10442 /* Prepare the trees handling the argument that is passed on the stack;
10443 the top-level node will be stored in ON_STACK. */
10444 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10445 if (align > 8)
10446 {
10447 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10448 t = fold_convert (intDI_type_node, arg);
10449 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10450 build_int_cst (TREE_TYPE (t), 15));
10451 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10452 build_int_cst (TREE_TYPE (t), -16));
10453 t = fold_convert (TREE_TYPE (arg), t);
10454 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10455 }
10456 else
10457 roundup = NULL;
10458 /* Advance ap.__stack */
10459 t = fold_convert (intDI_type_node, arg);
10460 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10461 build_int_cst (TREE_TYPE (t), size + 7));
10462 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10463 build_int_cst (TREE_TYPE (t), -8));
10464 t = fold_convert (TREE_TYPE (arg), t);
10465 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10466 /* String up roundup and advance. */
10467 if (roundup)
10468 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10469 /* String up with arg */
10470 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10471 /* Big-endianness related address adjustment. */
10472 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10473 && size < UNITS_PER_WORD)
10474 {
10475 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10476 size_int (UNITS_PER_WORD - size));
10477 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10478 }
10479
10480 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10481 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10482
10483 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10484 t = off;
10485 if (adjust)
10486 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10487 build_int_cst (TREE_TYPE (off), adjust));
10488
10489 t = fold_convert (sizetype, t);
10490 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10491
10492 if (is_ha)
10493 {
10494 /* type ha; // treat as "struct {ftype field[n];}"
10495 ... [computing offs]
10496 for (i = 0; i < nregs; ++i, offs += 16)
10497 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10498 return ha; */
10499 int i;
10500 tree tmp_ha, field_t, field_ptr_t;
10501
10502 /* Declare a local variable. */
10503 tmp_ha = create_tmp_var_raw (type, "ha");
10504 gimple_add_tmp_var (tmp_ha);
10505
10506 /* Establish the base type. */
10507 switch (ag_mode)
10508 {
10509 case SFmode:
10510 field_t = float_type_node;
10511 field_ptr_t = float_ptr_type_node;
10512 break;
10513 case DFmode:
10514 field_t = double_type_node;
10515 field_ptr_t = double_ptr_type_node;
10516 break;
10517 case TFmode:
10518 field_t = long_double_type_node;
10519 field_ptr_t = long_double_ptr_type_node;
10520 break;
10521 case HFmode:
10522 field_t = aarch64_fp16_type_node;
10523 field_ptr_t = aarch64_fp16_ptr_type_node;
10524 break;
10525 case V2SImode:
10526 case V4SImode:
10527 {
10528 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10529 field_t = build_vector_type_for_mode (innertype, ag_mode);
10530 field_ptr_t = build_pointer_type (field_t);
10531 }
10532 break;
10533 default:
10534 gcc_assert (0);
10535 }
10536
10537 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
10538 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10539 addr = t;
10540 t = fold_convert (field_ptr_t, addr);
10541 t = build2 (MODIFY_EXPR, field_t,
10542 build1 (INDIRECT_REF, field_t, tmp_ha),
10543 build1 (INDIRECT_REF, field_t, t));
10544
10545 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10546 for (i = 1; i < nregs; ++i)
10547 {
10548 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10549 u = fold_convert (field_ptr_t, addr);
10550 u = build2 (MODIFY_EXPR, field_t,
10551 build2 (MEM_REF, field_t, tmp_ha,
10552 build_int_cst (field_ptr_t,
10553 (i *
10554 int_size_in_bytes (field_t)))),
10555 build1 (INDIRECT_REF, field_t, u));
10556 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10557 }
10558
10559 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10560 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10561 }
10562
10563 COND_EXPR_ELSE (cond2) = t;
10564 addr = fold_convert (build_pointer_type (type), cond1);
10565 addr = build_va_arg_indirect_ref (addr);
10566
10567 if (indirect_p)
10568 addr = build_va_arg_indirect_ref (addr);
10569
10570 return addr;
10571 }
10572
10573 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10574
10575 static void
10576 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10577 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10578 int no_rtl)
10579 {
10580 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10581 CUMULATIVE_ARGS local_cum;
10582 int gr_saved = cfun->va_list_gpr_size;
10583 int vr_saved = cfun->va_list_fpr_size;
10584
10585 /* The caller has advanced CUM up to, but not beyond, the last named
10586 argument. Advance a local copy of CUM past the last "real" named
10587 argument, to find out how many registers are left over. */
10588 local_cum = *cum;
10589 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10590
10591 /* Find out how many registers we need to save.
10592 Honor tree-stdarg analysis results. */
10593 if (cfun->va_list_gpr_size)
10594 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10595 cfun->va_list_gpr_size / UNITS_PER_WORD);
10596 if (cfun->va_list_fpr_size)
10597 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10598 cfun->va_list_fpr_size / UNITS_PER_VREG);
10599
10600 if (!TARGET_FLOAT)
10601 {
10602 gcc_assert (local_cum.aapcs_nvrn == 0);
10603 vr_saved = 0;
10604 }
10605
10606 if (!no_rtl)
10607 {
10608 if (gr_saved > 0)
10609 {
10610 rtx ptr, mem;
10611
10612 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10613 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10614 - gr_saved * UNITS_PER_WORD);
10615 mem = gen_frame_mem (BLKmode, ptr);
10616 set_mem_alias_set (mem, get_varargs_alias_set ());
10617
10618 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10619 mem, gr_saved);
10620 }
10621 if (vr_saved > 0)
10622 {
10623 /* We can't use move_block_from_reg, because it will use
10624 the wrong mode, storing D regs only. */
10625 machine_mode mode = TImode;
10626 int off, i, vr_start;
10627
10628 /* Set OFF to the offset from virtual_incoming_args_rtx of
10629 the first vector register. The VR save area lies below
10630 the GR one, and is aligned to 16 bytes. */
10631 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10632 STACK_BOUNDARY / BITS_PER_UNIT);
10633 off -= vr_saved * UNITS_PER_VREG;
10634
10635 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10636 for (i = 0; i < vr_saved; ++i)
10637 {
10638 rtx ptr, mem;
10639
10640 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10641 mem = gen_frame_mem (mode, ptr);
10642 set_mem_alias_set (mem, get_varargs_alias_set ());
10643 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10644 off += UNITS_PER_VREG;
10645 }
10646 }
10647 }
10648
10649 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10650 any complication of having crtl->args.pretend_args_size changed. */
10651 cfun->machine->frame.saved_varargs_size
10652 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10653 STACK_BOUNDARY / BITS_PER_UNIT)
10654 + vr_saved * UNITS_PER_VREG);
10655 }
10656
10657 static void
10658 aarch64_conditional_register_usage (void)
10659 {
10660 int i;
10661 if (!TARGET_FLOAT)
10662 {
10663 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10664 {
10665 fixed_regs[i] = 1;
10666 call_used_regs[i] = 1;
10667 }
10668 }
10669 }
10670
10671 /* Walk down the type tree of TYPE counting consecutive base elements.
10672 If *MODEP is VOIDmode, then set it to the first valid floating point
10673 type. If a non-floating point type is found, or if a floating point
10674 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10675 otherwise return the count in the sub-tree. */
10676 static int
10677 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10678 {
10679 machine_mode mode;
10680 HOST_WIDE_INT size;
10681
10682 switch (TREE_CODE (type))
10683 {
10684 case REAL_TYPE:
10685 mode = TYPE_MODE (type);
10686 if (mode != DFmode && mode != SFmode
10687 && mode != TFmode && mode != HFmode)
10688 return -1;
10689
10690 if (*modep == VOIDmode)
10691 *modep = mode;
10692
10693 if (*modep == mode)
10694 return 1;
10695
10696 break;
10697
10698 case COMPLEX_TYPE:
10699 mode = TYPE_MODE (TREE_TYPE (type));
10700 if (mode != DFmode && mode != SFmode
10701 && mode != TFmode && mode != HFmode)
10702 return -1;
10703
10704 if (*modep == VOIDmode)
10705 *modep = mode;
10706
10707 if (*modep == mode)
10708 return 2;
10709
10710 break;
10711
10712 case VECTOR_TYPE:
10713 /* Use V2SImode and V4SImode as representatives of all 64-bit
10714 and 128-bit vector types. */
10715 size = int_size_in_bytes (type);
10716 switch (size)
10717 {
10718 case 8:
10719 mode = V2SImode;
10720 break;
10721 case 16:
10722 mode = V4SImode;
10723 break;
10724 default:
10725 return -1;
10726 }
10727
10728 if (*modep == VOIDmode)
10729 *modep = mode;
10730
10731 /* Vector modes are considered to be opaque: two vectors are
10732 equivalent for the purposes of being homogeneous aggregates
10733 if they are the same size. */
10734 if (*modep == mode)
10735 return 1;
10736
10737 break;
10738
10739 case ARRAY_TYPE:
10740 {
10741 int count;
10742 tree index = TYPE_DOMAIN (type);
10743
10744 /* Can't handle incomplete types nor sizes that are not
10745 fixed. */
10746 if (!COMPLETE_TYPE_P (type)
10747 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10748 return -1;
10749
10750 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10751 if (count == -1
10752 || !index
10753 || !TYPE_MAX_VALUE (index)
10754 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10755 || !TYPE_MIN_VALUE (index)
10756 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10757 || count < 0)
10758 return -1;
10759
10760 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10761 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10762
10763 /* There must be no padding. */
10764 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10765 return -1;
10766
10767 return count;
10768 }
10769
10770 case RECORD_TYPE:
10771 {
10772 int count = 0;
10773 int sub_count;
10774 tree field;
10775
10776 /* Can't handle incomplete types nor sizes that are not
10777 fixed. */
10778 if (!COMPLETE_TYPE_P (type)
10779 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10780 return -1;
10781
10782 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10783 {
10784 if (TREE_CODE (field) != FIELD_DECL)
10785 continue;
10786
10787 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10788 if (sub_count < 0)
10789 return -1;
10790 count += sub_count;
10791 }
10792
10793 /* There must be no padding. */
10794 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10795 return -1;
10796
10797 return count;
10798 }
10799
10800 case UNION_TYPE:
10801 case QUAL_UNION_TYPE:
10802 {
10803 /* These aren't very interesting except in a degenerate case. */
10804 int count = 0;
10805 int sub_count;
10806 tree field;
10807
10808 /* Can't handle incomplete types nor sizes that are not
10809 fixed. */
10810 if (!COMPLETE_TYPE_P (type)
10811 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10812 return -1;
10813
10814 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10815 {
10816 if (TREE_CODE (field) != FIELD_DECL)
10817 continue;
10818
10819 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10820 if (sub_count < 0)
10821 return -1;
10822 count = count > sub_count ? count : sub_count;
10823 }
10824
10825 /* There must be no padding. */
10826 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10827 return -1;
10828
10829 return count;
10830 }
10831
10832 default:
10833 break;
10834 }
10835
10836 return -1;
10837 }
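
/* Editor's note: illustrative results of the walk above (the types are
   examples only):
     struct { float x, y, z; }        -> 3, *modep == SFmode
     struct { double d[4]; }          -> 4, *modep == DFmode
     _Complex double                  -> 2, *modep == DFmode
     struct { float f; double d; }    -> -1 (mixed element modes)
     struct { float f; int i; }       -> -1 (non-floating-point member).  */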
10838
10839 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10840 type as described in AAPCS64 \S 4.1.2.
10841
10842 See the comment above aarch64_composite_type_p for the notes on MODE. */
10843
10844 static bool
10845 aarch64_short_vector_p (const_tree type,
10846 machine_mode mode)
10847 {
10848 HOST_WIDE_INT size = -1;
10849
10850 if (type && TREE_CODE (type) == VECTOR_TYPE)
10851 size = int_size_in_bytes (type);
10852 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10853 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10854 size = GET_MODE_SIZE (mode);
10855
10856 return (size == 8 || size == 16);
10857 }
10858
10859 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10860 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10861 array types. The C99 floating-point complex types are also considered
10862 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10863 types, which are GCC extensions and out of the scope of AAPCS64, are
10864 treated as composite types here as well.
10865
10866 Note that MODE itself is not sufficient in determining whether a type
10867 is such a composite type or not. This is because
10868 stor-layout.c:compute_record_mode may have already changed the MODE
10869 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10870 structure with only one field may have its MODE set to the mode of the
10871 field. Also an integer mode whose size matches the size of the
10872 RECORD_TYPE type may be used to substitute the original mode
10873 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10874 solely relied on. */
10875
10876 static bool
10877 aarch64_composite_type_p (const_tree type,
10878 machine_mode mode)
10879 {
10880 if (aarch64_short_vector_p (type, mode))
10881 return false;
10882
10883 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10884 return true;
10885
10886 if (mode == BLKmode
10887 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10888 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10889 return true;
10890
10891 return false;
10892 }
10893
10894 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10895 shall be passed or returned in simd/fp register(s) (providing these
10896 parameter passing registers are available).
10897
10898 Upon successful return, *COUNT returns the number of needed registers,
10899 *BASE_MODE returns the mode of the individual register and, when IS_HA
10900 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10901 floating-point aggregate or a homogeneous short-vector aggregate. */
10902
10903 static bool
10904 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10905 const_tree type,
10906 machine_mode *base_mode,
10907 int *count,
10908 bool *is_ha)
10909 {
10910 machine_mode new_mode = VOIDmode;
10911 bool composite_p = aarch64_composite_type_p (type, mode);
10912
10913 if (is_ha != NULL) *is_ha = false;
10914
10915 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10916 || aarch64_short_vector_p (type, mode))
10917 {
10918 *count = 1;
10919 new_mode = mode;
10920 }
10921 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10922 {
10923 if (is_ha != NULL) *is_ha = true;
10924 *count = 2;
10925 new_mode = GET_MODE_INNER (mode);
10926 }
10927 else if (type && composite_p)
10928 {
10929 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10930
10931 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10932 {
10933 if (is_ha != NULL) *is_ha = true;
10934 *count = ag_count;
10935 }
10936 else
10937 return false;
10938 }
10939 else
10940 return false;
10941
10942 *base_mode = new_mode;
10943 return true;
10944 }
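
/* Editor's note: a sketch of how the predicate above behaves for a few
   example types (assuming the FP/SIMD registers are available):
     double                       -> true,  *count == 1, *base_mode == DFmode
     _Complex float               -> true,  *count == 2, *base_mode == SFmode,
                                    *is_ha == true
     struct { double d[3]; }      -> true,  *count == 3, *base_mode == DFmode,
                                    *is_ha == true
     struct { double d[5]; }      -> false (more than HA_MAX_NUM_FLDS members)
     struct { double d; int i; }  -> false (not homogeneous).  */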
10945
10946 /* Implement TARGET_STRUCT_VALUE_RTX. */
10947
10948 static rtx
10949 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10950 int incoming ATTRIBUTE_UNUSED)
10951 {
10952 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10953 }
10954
10955 /* Implements target hook vector_mode_supported_p. */
10956 static bool
10957 aarch64_vector_mode_supported_p (machine_mode mode)
10958 {
10959 if (TARGET_SIMD
10960 && (mode == V4SImode || mode == V8HImode
10961 || mode == V16QImode || mode == V2DImode
10962 || mode == V2SImode || mode == V4HImode
10963 || mode == V8QImode || mode == V2SFmode
10964 || mode == V4SFmode || mode == V2DFmode
10965 || mode == V4HFmode || mode == V8HFmode
10966 || mode == V1DFmode))
10967 return true;
10968
10969 return false;
10970 }
10971
10972 /* Return appropriate SIMD container
10973 for MODE within a vector of WIDTH bits. */
10974 static machine_mode
10975 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10976 {
10977 gcc_assert (width == 64 || width == 128);
10978 if (TARGET_SIMD)
10979 {
10980 if (width == 128)
10981 switch (mode)
10982 {
10983 case DFmode:
10984 return V2DFmode;
10985 case SFmode:
10986 return V4SFmode;
10987 case HFmode:
10988 return V8HFmode;
10989 case SImode:
10990 return V4SImode;
10991 case HImode:
10992 return V8HImode;
10993 case QImode:
10994 return V16QImode;
10995 case DImode:
10996 return V2DImode;
10997 default:
10998 break;
10999 }
11000 else
11001 switch (mode)
11002 {
11003 case SFmode:
11004 return V2SFmode;
11005 case HFmode:
11006 return V4HFmode;
11007 case SImode:
11008 return V2SImode;
11009 case HImode:
11010 return V4HImode;
11011 case QImode:
11012 return V8QImode;
11013 default:
11014 break;
11015 }
11016 }
11017 return word_mode;
11018 }
11019
11020 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11021 static machine_mode
11022 aarch64_preferred_simd_mode (machine_mode mode)
11023 {
11024 return aarch64_simd_container_mode (mode, 128);
11025 }
11026
11027 /* Return the bitmask of possible vector sizes for the vectorizer
11028 to iterate over. */
11029 static unsigned int
11030 aarch64_autovectorize_vector_sizes (void)
11031 {
11032 return (16 | 8);
11033 }
11034
11035 /* Implement TARGET_MANGLE_TYPE. */
11036
11037 static const char *
11038 aarch64_mangle_type (const_tree type)
11039 {
11040 /* The AArch64 ABI documents say that "__va_list" has to be
11041 mangled as if it were in the "std" namespace. */
11042 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11043 return "St9__va_list";
11044
11045 /* Half-precision float. */
11046 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11047 return "Dh";
11048
11049 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11050 builtin types. */
11051 if (TYPE_NAME (type) != NULL)
11052 return aarch64_mangle_builtin_type (type);
11053
11054 /* Use the default mangling. */
11055 return NULL;
11056 }
11057
11058 /* Find the first rtx_insn before insn that will generate an assembly
11059 instruction. */
11060
11061 static rtx_insn *
11062 aarch64_prev_real_insn (rtx_insn *insn)
11063 {
11064 if (!insn)
11065 return NULL;
11066
11067 do
11068 {
11069 insn = prev_real_insn (insn);
11070 }
11071 while (insn && recog_memoized (insn) < 0);
11072
11073 return insn;
11074 }
11075
11076 static bool
11077 is_madd_op (enum attr_type t1)
11078 {
11079 unsigned int i;
11080 /* A number of these may be AArch32 only. */
11081 enum attr_type mlatypes[] = {
11082 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11083 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11084 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11085 };
11086
11087 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11088 {
11089 if (t1 == mlatypes[i])
11090 return true;
11091 }
11092
11093 return false;
11094 }
11095
11096 /* Check if there is a register dependency between a load and the insn
11097 for which we hold recog_data. */
11098
11099 static bool
11100 dep_between_memop_and_curr (rtx memop)
11101 {
11102 rtx load_reg;
11103 int opno;
11104
11105 gcc_assert (GET_CODE (memop) == SET);
11106
11107 if (!REG_P (SET_DEST (memop)))
11108 return false;
11109
11110 load_reg = SET_DEST (memop);
11111 for (opno = 1; opno < recog_data.n_operands; opno++)
11112 {
11113 rtx operand = recog_data.operand[opno];
11114 if (REG_P (operand)
11115 && reg_overlap_mentioned_p (load_reg, operand))
11116 return true;
11117
11118 }
11119 return false;
11120 }
11121
11122
11123 /* When working around the Cortex-A53 erratum 835769,
11124 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11125 instruction and has a preceding memory instruction such that a NOP
11126 should be inserted between them. */
11127
11128 bool
11129 aarch64_madd_needs_nop (rtx_insn* insn)
11130 {
11131 enum attr_type attr_type;
11132 rtx_insn *prev;
11133 rtx body;
11134
11135 if (!TARGET_FIX_ERR_A53_835769)
11136 return false;
11137
11138 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11139 return false;
11140
11141 attr_type = get_attr_type (insn);
11142 if (!is_madd_op (attr_type))
11143 return false;
11144
11145 prev = aarch64_prev_real_insn (insn);
11146 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11147 Restore recog state to INSN to avoid state corruption. */
11148 extract_constrain_insn_cached (insn);
11149
11150 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11151 return false;
11152
11153 body = single_set (prev);
11154
11155 /* If the previous insn is a memory op and there is no dependency between
11156 it and the DImode madd, emit a NOP between them. If body is NULL then we
11157 have a complex memory operation, probably a load/store pair.
11158 Be conservative for now and emit a NOP. */
11159 if (GET_MODE (recog_data.operand[0]) == DImode
11160 && (!body || !dep_between_memop_and_curr (body)))
11161 return true;
11162
11163 return false;
11164
11165 }
11166
11167
11168 /* Implement FINAL_PRESCAN_INSN. */
11169
11170 void
11171 aarch64_final_prescan_insn (rtx_insn *insn)
11172 {
11173 if (aarch64_madd_needs_nop (insn))
11174 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11175 }
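
/* Editor's note: an illustrative (not verbatim) fragment of the output when
   the erratum 835769 workaround fires, i.e. a 64-bit multiply-accumulate
   directly preceded by a memory operation:
       ldr   x3, [x2]
       nop // between mem op and mult-accumulate
       madd  x0, x1, x4, x5
   When the workaround is disabled the nop is not emitted.  */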
11176
11177
11178 /* Return the equivalent letter for size. */
11179 static char
11180 sizetochar (int size)
11181 {
11182 switch (size)
11183 {
11184 case 64: return 'd';
11185 case 32: return 's';
11186 case 16: return 'h';
11187 case 8 : return 'b';
11188 default: gcc_unreachable ();
11189 }
11190 }
11191
11192 /* Return true iff X is a uniform vector of floating-point
11193 constants, and the constant can be represented in
11194 quarter-precision form. Note, as aarch64_float_const_representable_p
11195 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11196 static bool
11197 aarch64_vect_float_const_representable_p (rtx x)
11198 {
11199 rtx elt;
11200 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11201 && const_vec_duplicate_p (x, &elt)
11202 && aarch64_float_const_representable_p (elt));
11203 }
11204
11205 /* Return true if OP is a valid AdvSIMD immediate for MODE; if INFO is nonnull, describe how to generate it. */
11206 bool
11207 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11208 struct simd_immediate_info *info)
11209 {
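  /* Helper for the checks below: test whether every STRIDE-th group of bytes
     matches TEST; if so, record the immediate class CLASS, element size
     ELSIZE, shift amount SHIFT and whether the constant is in negated (MVN)
     form NEG, then stop trying further encodings.  */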
11210 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11211 matches = 1; \
11212 for (i = 0; i < idx; i += (STRIDE)) \
11213 if (!(TEST)) \
11214 matches = 0; \
11215 if (matches) \
11216 { \
11217 immtype = (CLASS); \
11218 elsize = (ELSIZE); \
11219 eshift = (SHIFT); \
11220 emvn = (NEG); \
11221 break; \
11222 }
11223
11224 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11225 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11226 unsigned char bytes[16];
11227 int immtype = -1, matches;
11228 unsigned int invmask = inverse ? 0xff : 0;
11229 int eshift, emvn;
11230
11231 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11232 {
11233 if (! (aarch64_simd_imm_zero_p (op, mode)
11234 || aarch64_vect_float_const_representable_p (op)))
11235 return false;
11236
11237 if (info)
11238 {
11239 info->value = CONST_VECTOR_ELT (op, 0);
11240 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11241 info->mvn = false;
11242 info->shift = 0;
11243 }
11244
11245 return true;
11246 }
11247
11248 /* Splat vector constant out into a byte vector. */
11249 for (i = 0; i < n_elts; i++)
11250 {
11251 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11252 it must be laid out in the vector register in reverse order. */
11253 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11254 unsigned HOST_WIDE_INT elpart;
11255
11256 gcc_assert (CONST_INT_P (el));
11257 elpart = INTVAL (el);
11258
11259 for (unsigned int byte = 0; byte < innersize; byte++)
11260 {
11261 bytes[idx++] = (elpart & 0xff) ^ invmask;
11262 elpart >>= BITS_PER_UNIT;
11263 }
11264
11265 }
11266
11267 /* Sanity check. */
11268 gcc_assert (idx == GET_MODE_SIZE (mode));
11269
11270 do
11271 {
11272 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11273 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11274
11275 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11276 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11277
11278 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11279 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11280
11281 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11282 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11283
11284 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11285
11286 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11287
11288 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11289 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11290
11291 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11292 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11293
11294 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11295 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11296
11297 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11298 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11299
11300 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11301
11302 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11303
11304 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11305 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11306
11307 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11308 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11309
11310 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11311 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11312
11313 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11314 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11315
11316 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11317
11318 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11319 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11320 }
11321 while (0);
11322
11323 if (immtype == -1)
11324 return false;
11325
11326 if (info)
11327 {
11328 info->element_width = elsize;
11329 info->mvn = emvn != 0;
11330 info->shift = eshift;
11331
11332 unsigned HOST_WIDE_INT imm = 0;
11333
11334 if (immtype >= 12 && immtype <= 15)
11335 info->msl = true;
11336
11337 /* Un-invert bytes of recognized vector, if necessary. */
11338 if (invmask != 0)
11339 for (i = 0; i < idx; i++)
11340 bytes[i] ^= invmask;
11341
11342 if (immtype == 17)
11343 {
11344 /* FIXME: Broken on 32-bit H_W_I hosts. */
11345 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11346
11347 for (i = 0; i < 8; i++)
11348 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11349 << (i * BITS_PER_UNIT);
11350
11351
11352 info->value = GEN_INT (imm);
11353 }
11354 else
11355 {
11356 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11357 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11358
11359 /* Construct 'abcdefgh' because the assembler cannot handle
11360 generic constants. */
11361 if (info->mvn)
11362 imm = ~imm;
11363 imm = (imm >> info->shift) & 0xff;
11364 info->value = GEN_INT (imm);
11365 }
11366 }
11367
11368 return true;
11369 #undef CHECK
11370 }
11371
11372 /* Check that immediate shift constants are within range. */
11373 bool
11374 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11375 {
11376 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11377 if (left)
11378 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11379 else
11380 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11381 }
11382
11383 /* Return true if X is a uniform vector where all elements
11384 are either the floating-point constant 0.0 or the
11385 integer constant 0. */
11386 bool
11387 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11388 {
11389 return x == CONST0_RTX (mode);
11390 }
11391
11392
11393 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11394 operation of width WIDTH at bit position POS. */
11395
11396 rtx
11397 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11398 {
11399 gcc_assert (CONST_INT_P (width));
11400 gcc_assert (CONST_INT_P (pos));
11401
11402 unsigned HOST_WIDE_INT mask
11403 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11404 return GEN_INT (mask << UINTVAL (pos));
11405 }
11406
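/* Return true if X is a 64-bit integer immediate in which every byte is
   either 0x00 or 0xff, i.e. a candidate for a single scalar MOVI onto an
   FP/SIMD register.  */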
11407 bool
11408 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11409 {
11410 HOST_WIDE_INT imm = INTVAL (x);
11411 int i;
11412
11413 for (i = 0; i < 8; i++)
11414 {
11415 unsigned int byte = imm & 0xff;
11416 if (byte != 0xff && byte != 0)
11417 return false;
11418 imm >>= 8;
11419 }
11420
11421 return true;
11422 }
11423
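/* Return true if X is a legitimate constant operand for a scalar MOV in
   MODE: a HIGH of a valid symbolic reference, any CONST_INT, a constant
   DImode SYMBOL_REF address, or a symbol classified as SYMBOL_TINY_ABSOLUTE.  */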
11424 bool
11425 aarch64_mov_operand_p (rtx x, machine_mode mode)
11426 {
11427 if (GET_CODE (x) == HIGH
11428 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11429 return true;
11430
11431 if (CONST_INT_P (x))
11432 return true;
11433
11434 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11435 return true;
11436
11437 return aarch64_classify_symbolic_expression (x)
11438 == SYMBOL_TINY_ABSOLUTE;
11439 }
11440
11441 /* Return a CONST_VECTOR of MODE in which every element is VAL. */
11442 rtx
11443 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11444 {
11445 int nunits = GET_MODE_NUNITS (mode);
11446 rtvec v = rtvec_alloc (nunits);
11447 int i;
11448
11449 rtx cache = GEN_INT (val);
11450
11451 for (i = 0; i < nunits; i++)
11452 RTVEC_ELT (v, i) = cache;
11453
11454 return gen_rtx_CONST_VECTOR (mode, v);
11455 }
11456
11457 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11458
11459 bool
11460 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11461 {
11462 machine_mode vmode;
11463
11464 gcc_assert (!VECTOR_MODE_P (mode));
11465 vmode = aarch64_preferred_simd_mode (mode);
11466 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11467 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11468 }
11469
11470 /* Construct and return a PARALLEL RTX vector with elements numbering the
11471 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11472 the vector - from the perspective of the architecture. This does not
11473 line up with GCC's perspective on lane numbers, so we end up with
11474 different masks depending on our target endianness. The diagram
11475 below may help. We must draw the distinction when building masks
11476 which select one half of the vector. An instruction selecting
11477 architectural low-lanes for a big-endian target must be described using
11478 a mask selecting GCC high-lanes.
11479
11480 Big-Endian Little-Endian
11481
11482 GCC 0 1 2 3 3 2 1 0
11483 | x | x | x | x | | x | x | x | x |
11484 Architecture 3 2 1 0 3 2 1 0
11485
11486 Low Mask: { 2, 3 } { 0, 1 }
11487 High Mask: { 0, 1 } { 2, 3 }
11488 */
11489
11490 rtx
11491 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11492 {
11493 int nunits = GET_MODE_NUNITS (mode);
11494 rtvec v = rtvec_alloc (nunits / 2);
11495 int high_base = nunits / 2;
11496 int low_base = 0;
11497 int base;
11498 rtx t1;
11499 int i;
11500
11501 if (BYTES_BIG_ENDIAN)
11502 base = high ? low_base : high_base;
11503 else
11504 base = high ? high_base : low_base;
11505
11506 for (i = 0; i < nunits / 2; i++)
11507 RTVEC_ELT (v, i) = GEN_INT (base + i);
11508
11509 t1 = gen_rtx_PARALLEL (mode, v);
11510 return t1;
11511 }
11512
11513 /* Check OP for validity as a PARALLEL RTX vector with elements
11514 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11515 from the perspective of the architecture. See the diagram above
11516 aarch64_simd_vect_par_cnst_half for more details. */
11517
11518 bool
11519 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11520 bool high)
11521 {
11522 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11523 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11524 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11525 int i = 0;
11526
11527 if (!VECTOR_MODE_P (mode))
11528 return false;
11529
11530 if (count_op != count_ideal)
11531 return false;
11532
11533 for (i = 0; i < count_ideal; i++)
11534 {
11535 rtx elt_op = XVECEXP (op, 0, i);
11536 rtx elt_ideal = XVECEXP (ideal, 0, i);
11537
11538 if (!CONST_INT_P (elt_op)
11539 || INTVAL (elt_ideal) != INTVAL (elt_op))
11540 return false;
11541 }
11542 return true;
11543 }
11544
11545 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11546 HIGH (exclusive). */
11547 void
11548 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11549 const_tree exp)
11550 {
11551 HOST_WIDE_INT lane;
11552 gcc_assert (CONST_INT_P (operand));
11553 lane = INTVAL (operand);
11554
11555 if (lane < low || lane >= high)
11556 {
11557 if (exp)
11558 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11559 else
11560 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11561 }
11562 }
11563
11564 /* Return TRUE if OP is a valid vector addressing mode. */
11565 bool
11566 aarch64_simd_mem_operand_p (rtx op)
11567 {
11568 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11569 || REG_P (XEXP (op, 0)));
11570 }
11571
11572 /* Emit a register copy from operands[1] to operands[0], taking care not to
11573 early-clobber source registers in the process.
11574
11575 COUNT is the number of components into which the copy needs to be
11576 decomposed. */
11577 void
11578 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11579 unsigned int count)
11580 {
11581 unsigned int i;
11582 int rdest = REGNO (operands[0]);
11583 int rsrc = REGNO (operands[1]);
11584
11585 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11586 || rdest < rsrc)
11587 for (i = 0; i < count; i++)
11588 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11589 gen_rtx_REG (mode, rsrc + i));
11590 else
11591 for (i = 0; i < count; i++)
11592 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11593 gen_rtx_REG (mode, rsrc + count - i - 1));
11594 }
11595
11596 /* Compute and return the length in bytes of aarch64_simd_reglist<mode>, where <mode> is
11597 one of VSTRUCT modes: OI, CI, or XI. */
11598 int
11599 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11600 {
11601 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11602 }
11603
11604 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11605 alignment of a vector to 128 bits. */
11606 static HOST_WIDE_INT
11607 aarch64_simd_vector_alignment (const_tree type)
11608 {
11609 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11610 return MIN (align, 128);
11611 }
11612
11613 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11614 static bool
11615 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11616 {
11617 if (is_packed)
11618 return false;
11619
11620 /* We guarantee alignment for vectors up to 128 bits. */
11621 if (tree_int_cst_compare (TYPE_SIZE (type),
11622 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11623 return false;
11624
11625 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11626 return true;
11627 }
11628
11629 /* Return true if the vector misalignment factor is supported by the
11630 target. */
11631 static bool
11632 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11633 const_tree type, int misalignment,
11634 bool is_packed)
11635 {
11636 if (TARGET_SIMD && STRICT_ALIGNMENT)
11637 {
11638 /* Return false if the movmisalign pattern is not supported for this mode. */
11639 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11640 return false;
11641
11642 if (misalignment == -1)
11643 {
11644 /* Misalignment factor is unknown at compile time but we know
11645 it's word aligned. */
11646 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11647 {
11648 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11649
11650 if (element_size != 64)
11651 return true;
11652 }
11653 return false;
11654 }
11655 }
11656 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11657 is_packed);
11658 }
11659
11660 /* If VALS is a vector constant that can be loaded into a register
11661 using DUP, generate instructions to do so and return an RTX to
11662 assign to the register. Otherwise return NULL_RTX. */
11663 static rtx
11664 aarch64_simd_dup_constant (rtx vals)
11665 {
11666 machine_mode mode = GET_MODE (vals);
11667 machine_mode inner_mode = GET_MODE_INNER (mode);
11668 rtx x;
11669
11670 if (!const_vec_duplicate_p (vals, &x))
11671 return NULL_RTX;
11672
11673 /* We can load this constant by using DUP and a constant in a
11674 single ARM register. This will be cheaper than a vector
11675 load. */
11676 x = copy_to_mode_reg (inner_mode, x);
11677 return gen_rtx_VEC_DUPLICATE (mode, x);
11678 }
11679
11680
11681 /* Generate code to load VALS, which is a PARALLEL containing only
11682 constants (for vec_init) or CONST_VECTOR, efficiently into a
11683 register. Returns an RTX to copy into the register, or NULL_RTX
11684 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11685 static rtx
11686 aarch64_simd_make_constant (rtx vals)
11687 {
11688 machine_mode mode = GET_MODE (vals);
11689 rtx const_dup;
11690 rtx const_vec = NULL_RTX;
11691 int n_elts = GET_MODE_NUNITS (mode);
11692 int n_const = 0;
11693 int i;
11694
11695 if (GET_CODE (vals) == CONST_VECTOR)
11696 const_vec = vals;
11697 else if (GET_CODE (vals) == PARALLEL)
11698 {
11699 /* A CONST_VECTOR must contain only CONST_INTs and
11700 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11701 Only store valid constants in a CONST_VECTOR. */
11702 for (i = 0; i < n_elts; ++i)
11703 {
11704 rtx x = XVECEXP (vals, 0, i);
11705 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11706 n_const++;
11707 }
11708 if (n_const == n_elts)
11709 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11710 }
11711 else
11712 gcc_unreachable ();
11713
11714 if (const_vec != NULL_RTX
11715 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11716 /* Load using MOVI/MVNI. */
11717 return const_vec;
11718 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11719 /* Loaded using DUP. */
11720 return const_dup;
11721 else if (const_vec != NULL_RTX)
11722 /* Load from constant pool. We cannot take advantage of single-cycle
11723 LD1 because we need a PC-relative addressing mode. */
11724 return const_vec;
11725 else
11726 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11727 We cannot construct an initializer. */
11728 return NULL_RTX;
11729 }
11730
11731 /* Expand a vector initialisation sequence, such that TARGET is
11732 initialised to contain VALS. */
11733
11734 void
11735 aarch64_expand_vector_init (rtx target, rtx vals)
11736 {
11737 machine_mode mode = GET_MODE (target);
11738 machine_mode inner_mode = GET_MODE_INNER (mode);
11739 /* The number of vector elements. */
11740 int n_elts = GET_MODE_NUNITS (mode);
11741 /* The number of vector elements which are not constant. */
11742 int n_var = 0;
11743 rtx any_const = NULL_RTX;
11744 /* The first element of vals. */
11745 rtx v0 = XVECEXP (vals, 0, 0);
11746 bool all_same = true;
11747
11748 /* Count the number of variable elements to initialise. */
11749 for (int i = 0; i < n_elts; ++i)
11750 {
11751 rtx x = XVECEXP (vals, 0, i);
11752 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11753 ++n_var;
11754 else
11755 any_const = x;
11756
11757 all_same &= rtx_equal_p (x, v0);
11758 }
11759
11760 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11761 how best to handle this. */
11762 if (n_var == 0)
11763 {
11764 rtx constant = aarch64_simd_make_constant (vals);
11765 if (constant != NULL_RTX)
11766 {
11767 emit_move_insn (target, constant);
11768 return;
11769 }
11770 }
11771
11772 /* Splat a single non-constant element if we can. */
11773 if (all_same)
11774 {
11775 rtx x = copy_to_mode_reg (inner_mode, v0);
11776 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11777 return;
11778 }
11779
11780 enum insn_code icode = optab_handler (vec_set_optab, mode);
11781 gcc_assert (icode != CODE_FOR_nothing);
11782
11783 /* If there are only variable elements, try to optimize
11784 the insertion using dup for the most common element
11785 followed by insertions. */
11786
11787 /* The algorithm will fill matches[*][0] with the earliest matching element,
11788 and matches[X][1] with the count of duplicate elements (if X is the
11789 earliest element which has duplicates). */
11790
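  /* A purely illustrative example: for VALS = { a, b, a, c } the loop below
     produces matches[0] = { 0, 2 }, matches[1] = { 1, 1 },
     matches[2] = { 0, 0 } and matches[3] = { 3, 1 }; element 0 is the most
     common, so 'a' is DUPed into every lane and lanes 1 and 3 are then
     inserted individually.  */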
11791 if (n_var == n_elts && n_elts <= 16)
11792 {
11793 int matches[16][2] = {0};
11794 for (int i = 0; i < n_elts; i++)
11795 {
11796 for (int j = 0; j <= i; j++)
11797 {
11798 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
11799 {
11800 matches[i][0] = j;
11801 matches[j][1]++;
11802 break;
11803 }
11804 }
11805 }
11806 int maxelement = 0;
11807 int maxv = 0;
11808 for (int i = 0; i < n_elts; i++)
11809 if (matches[i][1] > maxv)
11810 {
11811 maxelement = i;
11812 maxv = matches[i][1];
11813 }
11814
11815 /* Create a duplicate of the most common element. */
11816 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
11817 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11818
11819 /* Insert the rest. */
11820 for (int i = 0; i < n_elts; i++)
11821 {
11822 rtx x = XVECEXP (vals, 0, i);
11823 if (matches[i][0] == maxelement)
11824 continue;
11825 x = copy_to_mode_reg (inner_mode, x);
11826 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11827 }
11828 return;
11829 }
11830
11831 /* Initialise a vector which is part-variable. We want to first try
11832 to build those lanes which are constant in the most efficient way we
11833 can. */
11834 if (n_var != n_elts)
11835 {
11836 rtx copy = copy_rtx (vals);
11837
11838 /* Load constant part of vector. We really don't care what goes into the
11839 parts we will overwrite, but we're more likely to be able to load the
11840 constant efficiently if it has fewer, larger, repeating parts
11841 (see aarch64_simd_valid_immediate). */
11842 for (int i = 0; i < n_elts; i++)
11843 {
11844 rtx x = XVECEXP (vals, 0, i);
11845 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11846 continue;
11847 rtx subst = any_const;
11848 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11849 {
11850 /* Look in the copied vector, as more elements are const. */
11851 rtx test = XVECEXP (copy, 0, i ^ bit);
11852 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11853 {
11854 subst = test;
11855 break;
11856 }
11857 }
11858 XVECEXP (copy, 0, i) = subst;
11859 }
11860 aarch64_expand_vector_init (target, copy);
11861 }
11862
11863 /* Insert the variable lanes directly. */
11864 for (int i = 0; i < n_elts; i++)
11865 {
11866 rtx x = XVECEXP (vals, 0, i);
11867 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11868 continue;
11869 x = copy_to_mode_reg (inner_mode, x);
11870 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11871 }
11872 }
11873
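/* Implement TARGET_SHIFT_TRUNCATION_MASK.  Return the mask applied to a
   shift count, or 0 if shift counts are not truncated: vector shifts never
   truncate, and scalar shifts only do so when SHIFT_COUNT_TRUNCATED is in
   effect.  */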
11874 static unsigned HOST_WIDE_INT
11875 aarch64_shift_truncation_mask (machine_mode mode)
11876 {
11877 return
11878 (!SHIFT_COUNT_TRUNCATED
11879 || aarch64_vector_mode_supported_p (mode)
11880 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11881 }
11882
11883 /* Select a format to encode pointers in exception handling data. */
11884 int
11885 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11886 {
11887 int type;
11888 switch (aarch64_cmodel)
11889 {
11890 case AARCH64_CMODEL_TINY:
11891 case AARCH64_CMODEL_TINY_PIC:
11892 case AARCH64_CMODEL_SMALL:
11893 case AARCH64_CMODEL_SMALL_PIC:
11894 case AARCH64_CMODEL_SMALL_SPIC:
11895 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11896 for everything. */
11897 type = DW_EH_PE_sdata4;
11898 break;
11899 default:
11900 /* No assumptions here. 8-byte relocs required. */
11901 type = DW_EH_PE_sdata8;
11902 break;
11903 }
11904 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11905 }
11906
11907 /* The last .arch and .tune assembly strings that we printed. */
11908 static std::string aarch64_last_printed_arch_string;
11909 static std::string aarch64_last_printed_tune_string;
11910
11911 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11912 by the function fndecl. */
11913
11914 void
11915 aarch64_declare_function_name (FILE *stream, const char* name,
11916 tree fndecl)
11917 {
11918 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11919
11920 struct cl_target_option *targ_options;
11921 if (target_parts)
11922 targ_options = TREE_TARGET_OPTION (target_parts);
11923 else
11924 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11925 gcc_assert (targ_options);
11926
11927 const struct processor *this_arch
11928 = aarch64_get_arch (targ_options->x_explicit_arch);
11929
11930 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11931 std::string extension
11932 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11933 this_arch->flags);
11934 /* Only update the assembler .arch string if it is distinct from the last
11935 such string we printed. */
11936 std::string to_print = this_arch->name + extension;
11937 if (to_print != aarch64_last_printed_arch_string)
11938 {
11939 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11940 aarch64_last_printed_arch_string = to_print;
11941 }
11942
11943 /* Print the CPU name we're tuning for in a comment; it might be
11944 useful to readers of the generated asm. Do it only when it changes
11945 from function to function and verbose assembly is requested. */
11946 const struct processor *this_tune
11947 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11948
11949 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11950 {
11951 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11952 this_tune->name);
11953 aarch64_last_printed_tune_string = this_tune->name;
11954 }
11955
11956 /* Don't forget the type directive for ELF. */
11957 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11958 ASM_OUTPUT_LABEL (stream, name);
11959 }
11960
11961 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11962
11963 static void
11964 aarch64_start_file (void)
11965 {
11966 struct cl_target_option *default_options
11967 = TREE_TARGET_OPTION (target_option_default_node);
11968
11969 const struct processor *default_arch
11970 = aarch64_get_arch (default_options->x_explicit_arch);
11971 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11972 std::string extension
11973 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11974 default_arch->flags);
11975
11976 aarch64_last_printed_arch_string = default_arch->name + extension;
11977 aarch64_last_printed_tune_string = "";
11978 asm_fprintf (asm_out_file, "\t.arch %s\n",
11979 aarch64_last_printed_arch_string.c_str ());
11980
11981 default_file_start ();
11982 }
11983
11984 /* Emit load exclusive. */
11985
11986 static void
11987 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11988 rtx mem, rtx model_rtx)
11989 {
11990 rtx (*gen) (rtx, rtx, rtx);
11991
11992 switch (mode)
11993 {
11994 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11995 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11996 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11997 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11998 default:
11999 gcc_unreachable ();
12000 }
12001
12002 emit_insn (gen (rval, mem, model_rtx));
12003 }
12004
12005 /* Emit store exclusive. */
12006
12007 static void
12008 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12009 rtx rval, rtx mem, rtx model_rtx)
12010 {
12011 rtx (*gen) (rtx, rtx, rtx, rtx);
12012
12013 switch (mode)
12014 {
12015 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
12016 case HImode: gen = gen_aarch64_store_exclusivehi; break;
12017 case SImode: gen = gen_aarch64_store_exclusivesi; break;
12018 case DImode: gen = gen_aarch64_store_exclusivedi; break;
12019 default:
12020 gcc_unreachable ();
12021 }
12022
12023 emit_insn (gen (bval, rval, mem, model_rtx));
12024 }
12025
12026 /* Emit jump instruction INSN and mark it as very unlikely to be taken. */
12027
12028 static void
12029 aarch64_emit_unlikely_jump (rtx insn)
12030 {
12031 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
12032
12033 rtx_insn *jump = emit_jump_insn (insn);
12034 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
12035 }
12036
12037 /* Expand a compare and swap pattern. */
12038
12039 void
12040 aarch64_expand_compare_and_swap (rtx operands[])
12041 {
12042 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12043 machine_mode mode, cmp_mode;
12044 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12045 int idx;
12046 gen_cas_fn gen;
12047 const gen_cas_fn split_cas[] =
12048 {
12049 gen_aarch64_compare_and_swapqi,
12050 gen_aarch64_compare_and_swaphi,
12051 gen_aarch64_compare_and_swapsi,
12052 gen_aarch64_compare_and_swapdi
12053 };
12054 const gen_cas_fn atomic_cas[] =
12055 {
12056 gen_aarch64_compare_and_swapqi_lse,
12057 gen_aarch64_compare_and_swaphi_lse,
12058 gen_aarch64_compare_and_swapsi_lse,
12059 gen_aarch64_compare_and_swapdi_lse
12060 };
12061
12062 bval = operands[0];
12063 rval = operands[1];
12064 mem = operands[2];
12065 oldval = operands[3];
12066 newval = operands[4];
12067 is_weak = operands[5];
12068 mod_s = operands[6];
12069 mod_f = operands[7];
12070 mode = GET_MODE (mem);
12071 cmp_mode = mode;
12072
12073 /* Normally the succ memory model must be stronger than fail, but in the
12074 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12075 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12076
12077 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12078 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12079 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12080
12081 switch (mode)
12082 {
12083 case QImode:
12084 case HImode:
12085 /* For short modes, we're going to perform the comparison in SImode,
12086 so do the zero-extension now. */
12087 cmp_mode = SImode;
12088 rval = gen_reg_rtx (SImode);
12089 oldval = convert_modes (SImode, mode, oldval, true);
12090 /* Fall through. */
12091
12092 case SImode:
12093 case DImode:
12094 /* Force the value into a register if needed. */
12095 if (!aarch64_plus_operand (oldval, mode))
12096 oldval = force_reg (cmp_mode, oldval);
12097 break;
12098
12099 default:
12100 gcc_unreachable ();
12101 }
12102
12103 switch (mode)
12104 {
12105 case QImode: idx = 0; break;
12106 case HImode: idx = 1; break;
12107 case SImode: idx = 2; break;
12108 case DImode: idx = 3; break;
12109 default:
12110 gcc_unreachable ();
12111 }
12112 if (TARGET_LSE)
12113 gen = atomic_cas[idx];
12114 else
12115 gen = split_cas[idx];
12116
12117 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12118
12119 if (mode == QImode || mode == HImode)
12120 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12121
12122 x = gen_rtx_REG (CCmode, CC_REGNUM);
12123 x = gen_rtx_EQ (SImode, x, const0_rtx);
12124 emit_insn (gen_rtx_SET (bval, x));
12125 }
12126
12127 /* Test whether the target supports using an atomic load-operate instruction.
12128 CODE is the operation and AFTER is TRUE if the data in memory after the
12129 operation should be returned and FALSE if the data before the operation
12130 should be returned. Returns FALSE if the operation isn't supported by the
12131 architecture. */
12132
12133 bool
12134 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12135 {
12136 if (!TARGET_LSE)
12137 return false;
12138
12139 switch (code)
12140 {
12141 case SET:
12142 case AND:
12143 case IOR:
12144 case XOR:
12145 case MINUS:
12146 case PLUS:
12147 return true;
12148 default:
12149 return false;
12150 }
12151 }
12152
12153 /* Emit a barrier appropriate for memory model MODEL at the end of a
12154 sequence implementing an atomic operation. */
12155
12156 static void
12157 aarch64_emit_post_barrier (enum memmodel model)
12158 {
12159 const enum memmodel base_model = memmodel_base (model);
12160
12161 if (is_mm_sync (model)
12162 && (base_model == MEMMODEL_ACQUIRE
12163 || base_model == MEMMODEL_ACQ_REL
12164 || base_model == MEMMODEL_SEQ_CST))
12165 {
12166 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12167 }
12168 }
12169
12170 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12171 for the data in memory. EXPECTED is the value expected to be in memory.
12172 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12173 is the memory ordering to use. */
12174
12175 void
12176 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12177 rtx expected, rtx desired,
12178 rtx model)
12179 {
12180 rtx (*gen) (rtx, rtx, rtx, rtx);
12181 machine_mode mode;
12182
12183 mode = GET_MODE (mem);
12184
12185 switch (mode)
12186 {
12187 case QImode: gen = gen_aarch64_atomic_casqi; break;
12188 case HImode: gen = gen_aarch64_atomic_cashi; break;
12189 case SImode: gen = gen_aarch64_atomic_cassi; break;
12190 case DImode: gen = gen_aarch64_atomic_casdi; break;
12191 default:
12192 gcc_unreachable ();
12193 }
12194
12195 /* Move the expected value into the CAS destination register. */
12196 emit_insn (gen_rtx_SET (rval, expected));
12197
12198 /* Emit the CAS. */
12199 emit_insn (gen (rval, mem, desired, model));
12200
12201 /* Compare the expected value with the value loaded by the CAS, to establish
12202 whether the swap was made. */
12203 aarch64_gen_compare_reg (EQ, rval, expected);
12204 }
12205
12206 /* Split a compare and swap pattern. */
12207
12208 void
12209 aarch64_split_compare_and_swap (rtx operands[])
12210 {
12211 rtx rval, mem, oldval, newval, scratch;
12212 machine_mode mode;
12213 bool is_weak;
12214 rtx_code_label *label1, *label2;
12215 rtx x, cond;
12216 enum memmodel model;
12217 rtx model_rtx;
12218
12219 rval = operands[0];
12220 mem = operands[1];
12221 oldval = operands[2];
12222 newval = operands[3];
12223 is_weak = (operands[4] != const0_rtx);
12224 model_rtx = operands[5];
12225 scratch = operands[7];
12226 mode = GET_MODE (mem);
12227 model = memmodel_from_int (INTVAL (model_rtx));
12228
12229 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12230 loop:
12231 .label1:
12232 LD[A]XR rval, [mem]
12233 CBNZ rval, .label2
12234 ST[L]XR scratch, newval, [mem]
12235 CBNZ scratch, .label1
12236 .label2:
12237 CMP rval, 0. */
12238 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12239
12240 label1 = NULL;
12241 if (!is_weak)
12242 {
12243 label1 = gen_label_rtx ();
12244 emit_label (label1);
12245 }
12246 label2 = gen_label_rtx ();
12247
12248 /* The initial load can be relaxed for a __sync operation since a final
12249 barrier will be emitted to stop code hoisting. */
12250 if (is_mm_sync (model))
12251 aarch64_emit_load_exclusive (mode, rval, mem,
12252 GEN_INT (MEMMODEL_RELAXED));
12253 else
12254 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12255
12256 if (strong_zero_p)
12257 {
12258 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12259 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12260 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12261 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12262 }
12263 else
12264 {
12265 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12266 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12267 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12268 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12269 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12270 }
12271
12272 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12273
12274 if (!is_weak)
12275 {
12276 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12277 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12278 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12279 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12280 }
12281 else
12282 {
12283 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12284 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12285 emit_insn (gen_rtx_SET (cond, x));
12286 }
12287
12288 emit_label (label2);
12289 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
12290 to set the condition flags. If this is not used it will be removed by
12291 later passes. */
12292 if (strong_zero_p)
12293 {
12294 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12295 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12296 emit_insn (gen_rtx_SET (cond, x));
12297 }
12298 /* Emit any final barrier needed for a __sync operation. */
12299 if (is_mm_sync (model))
12300 aarch64_emit_post_barrier (model);
12301 }
12302
12303 /* Emit a BIC instruction. */
12304
12305 static void
12306 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12307 {
12308 rtx shift_rtx = GEN_INT (shift);
12309 rtx (*gen) (rtx, rtx, rtx, rtx);
12310
12311 switch (mode)
12312 {
12313 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12314 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12315 default:
12316 gcc_unreachable ();
12317 }
12318
12319 emit_insn (gen (dst, s2, shift_rtx, s1));
12320 }
12321
12322 /* Emit an atomic swap. */
12323
12324 static void
12325 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12326 rtx mem, rtx model)
12327 {
12328 rtx (*gen) (rtx, rtx, rtx, rtx);
12329
12330 switch (mode)
12331 {
12332 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12333 case HImode: gen = gen_aarch64_atomic_swphi; break;
12334 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12335 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12336 default:
12337 gcc_unreachable ();
12338 }
12339
12340 emit_insn (gen (dst, mem, value, model));
12341 }
12342
12343 /* Operations supported by aarch64_emit_atomic_load_op. */
12344
12345 enum aarch64_atomic_load_op_code
12346 {
12347 AARCH64_LDOP_PLUS, /* A + B */
12348 AARCH64_LDOP_XOR, /* A ^ B */
12349 AARCH64_LDOP_OR, /* A | B */
12350 AARCH64_LDOP_BIC /* A & ~B */
12351 };
12352
12353 /* Emit an atomic load-operate. */
12354
12355 static void
12356 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12357 machine_mode mode, rtx dst, rtx src,
12358 rtx mem, rtx model)
12359 {
12360 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12361 const aarch64_atomic_load_op_fn plus[] =
12362 {
12363 gen_aarch64_atomic_loadaddqi,
12364 gen_aarch64_atomic_loadaddhi,
12365 gen_aarch64_atomic_loadaddsi,
12366 gen_aarch64_atomic_loadadddi
12367 };
12368 const aarch64_atomic_load_op_fn eor[] =
12369 {
12370 gen_aarch64_atomic_loadeorqi,
12371 gen_aarch64_atomic_loadeorhi,
12372 gen_aarch64_atomic_loadeorsi,
12373 gen_aarch64_atomic_loadeordi
12374 };
12375 const aarch64_atomic_load_op_fn ior[] =
12376 {
12377 gen_aarch64_atomic_loadsetqi,
12378 gen_aarch64_atomic_loadsethi,
12379 gen_aarch64_atomic_loadsetsi,
12380 gen_aarch64_atomic_loadsetdi
12381 };
12382 const aarch64_atomic_load_op_fn bic[] =
12383 {
12384 gen_aarch64_atomic_loadclrqi,
12385 gen_aarch64_atomic_loadclrhi,
12386 gen_aarch64_atomic_loadclrsi,
12387 gen_aarch64_atomic_loadclrdi
12388 };
12389 aarch64_atomic_load_op_fn gen;
12390 int idx = 0;
12391
12392 switch (mode)
12393 {
12394 case QImode: idx = 0; break;
12395 case HImode: idx = 1; break;
12396 case SImode: idx = 2; break;
12397 case DImode: idx = 3; break;
12398 default:
12399 gcc_unreachable ();
12400 }
12401
12402 switch (code)
12403 {
12404 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12405 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12406 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12407 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12408 default:
12409 gcc_unreachable ();
12410 }
12411
12412 emit_insn (gen (dst, mem, src, model));
12413 }
12414
12415 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12416 location to store the data read from memory. OUT_RESULT is the location to
12417 store the result of the operation. MEM is the memory location to read and
12418 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12419 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12420 be NULL. */
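/* For illustration: with CODE == MINUS the value is negated and an atomic
   LDADD is issued, and with CODE == AND the value is complemented and an
   atomic LDCLR (bit clear, A & ~B) is issued, since the LSE instructions
   provide no direct atomic subtract or atomic AND forms.  */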
12421
12422 void
12423 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12424 rtx mem, rtx value, rtx model_rtx)
12425 {
12426 machine_mode mode = GET_MODE (mem);
12427 machine_mode wmode = (mode == DImode ? DImode : SImode);
12428 const bool short_mode = (mode < SImode);
12429 aarch64_atomic_load_op_code ldop_code;
12430 rtx src;
12431 rtx x;
12432
12433 if (out_data)
12434 out_data = gen_lowpart (mode, out_data);
12435
12436 if (out_result)
12437 out_result = gen_lowpart (mode, out_result);
12438
12439 /* Make sure the value is in a register, putting it into a destination
12440 register if it needs to be manipulated. */
12441 if (!register_operand (value, mode)
12442 || code == AND || code == MINUS)
12443 {
12444 src = out_result ? out_result : out_data;
12445 emit_move_insn (src, gen_lowpart (mode, value));
12446 }
12447 else
12448 src = value;
12449 gcc_assert (register_operand (src, mode));
12450
12451 /* Preprocess the data for the operation as necessary. If the operation is
12452 a SET then emit a swap instruction and finish. */
12453 switch (code)
12454 {
12455 case SET:
12456 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12457 return;
12458
12459 case MINUS:
12460 /* Negate the value and treat it as a PLUS. */
12461 {
12462 rtx neg_src;
12463
12464 /* Resize the value if necessary. */
12465 if (short_mode)
12466 src = gen_lowpart (wmode, src);
12467
12468 neg_src = gen_rtx_NEG (wmode, src);
12469 emit_insn (gen_rtx_SET (src, neg_src));
12470
12471 if (short_mode)
12472 src = gen_lowpart (mode, src);
12473 }
12474 /* Fall-through. */
12475 case PLUS:
12476 ldop_code = AARCH64_LDOP_PLUS;
12477 break;
12478
12479 case IOR:
12480 ldop_code = AARCH64_LDOP_OR;
12481 break;
12482
12483 case XOR:
12484 ldop_code = AARCH64_LDOP_XOR;
12485 break;
12486
12487 case AND:
12488 {
12489 rtx not_src;
12490
12491 /* Resize the value if necessary. */
12492 if (short_mode)
12493 src = gen_lowpart (wmode, src);
12494
12495 not_src = gen_rtx_NOT (wmode, src);
12496 emit_insn (gen_rtx_SET (src, not_src));
12497
12498 if (short_mode)
12499 src = gen_lowpart (mode, src);
12500 }
12501 ldop_code = AARCH64_LDOP_BIC;
12502 break;
12503
12504 default:
12505 /* The operation can't be done with atomic instructions. */
12506 gcc_unreachable ();
12507 }
12508
12509 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12510
12511 /* If necessary, calculate the data in memory after the update by redoing the
12512 operation from values in registers. */
12513 if (!out_result)
12514 return;
12515
12516 if (short_mode)
12517 {
12518 src = gen_lowpart (wmode, src);
12519 out_data = gen_lowpart (wmode, out_data);
12520 out_result = gen_lowpart (wmode, out_result);
12521 }
12522
12523 x = NULL_RTX;
12524
12525 switch (code)
12526 {
12527 case MINUS:
12528 case PLUS:
12529 x = gen_rtx_PLUS (wmode, out_data, src);
12530 break;
12531 case IOR:
12532 x = gen_rtx_IOR (wmode, out_data, src);
12533 break;
12534 case XOR:
12535 x = gen_rtx_XOR (wmode, out_data, src);
12536 break;
12537 case AND:
12538 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12539 return;
12540 default:
12541 gcc_unreachable ();
12542 }
12543
12544 emit_set_insn (out_result, x);
12545
12546 return;
12547 }
12548
12549 /* Split an atomic operation. */
12550
12551 void
12552 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12553 rtx value, rtx model_rtx, rtx cond)
12554 {
12555 machine_mode mode = GET_MODE (mem);
12556 machine_mode wmode = (mode == DImode ? DImode : SImode);
12557 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12558 const bool is_sync = is_mm_sync (model);
12559 rtx_code_label *label;
12560 rtx x;
12561
12562 /* Split the atomic operation into a sequence. */
12563 label = gen_label_rtx ();
12564 emit_label (label);
12565
12566 if (new_out)
12567 new_out = gen_lowpart (wmode, new_out);
12568 if (old_out)
12569 old_out = gen_lowpart (wmode, old_out);
12570 else
12571 old_out = new_out;
12572 value = simplify_gen_subreg (wmode, value, mode, 0);
12573
12574 /* The initial load can be relaxed for a __sync operation since a final
12575 barrier will be emitted to stop code hoisting. */
12576 if (is_sync)
12577 aarch64_emit_load_exclusive (mode, old_out, mem,
12578 GEN_INT (MEMMODEL_RELAXED));
12579 else
12580 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12581
12582 switch (code)
12583 {
12584 case SET:
12585 new_out = value;
12586 break;
12587
12588 case NOT:
12589 x = gen_rtx_AND (wmode, old_out, value);
12590 emit_insn (gen_rtx_SET (new_out, x));
12591 x = gen_rtx_NOT (wmode, new_out);
12592 emit_insn (gen_rtx_SET (new_out, x));
12593 break;
12594
12595 case MINUS:
12596 if (CONST_INT_P (value))
12597 {
12598 value = GEN_INT (-INTVAL (value));
12599 code = PLUS;
12600 }
12601 /* Fall through. */
12602
12603 default:
12604 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12605 emit_insn (gen_rtx_SET (new_out, x));
12606 break;
12607 }
12608
12609 aarch64_emit_store_exclusive (mode, cond, mem,
12610 gen_lowpart (mode, new_out), model_rtx);
12611
12612 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12613 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12614 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12615 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12616
12617 /* Emit any final barrier needed for a __sync operation. */
12618 if (is_sync)
12619 aarch64_emit_post_barrier (model);
12620 }
12621
12622 static void
12623 aarch64_init_libfuncs (void)
12624 {
12625 /* Half-precision float operations. The compiler handles all operations
12626 with NULL libfuncs by converting to SFmode. */
12627
12628 /* Conversions. */
12629 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12630 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12631
12632 /* Arithmetic. */
12633 set_optab_libfunc (add_optab, HFmode, NULL);
12634 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12635 set_optab_libfunc (smul_optab, HFmode, NULL);
12636 set_optab_libfunc (neg_optab, HFmode, NULL);
12637 set_optab_libfunc (sub_optab, HFmode, NULL);
12638
12639 /* Comparisons. */
12640 set_optab_libfunc (eq_optab, HFmode, NULL);
12641 set_optab_libfunc (ne_optab, HFmode, NULL);
12642 set_optab_libfunc (lt_optab, HFmode, NULL);
12643 set_optab_libfunc (le_optab, HFmode, NULL);
12644 set_optab_libfunc (ge_optab, HFmode, NULL);
12645 set_optab_libfunc (gt_optab, HFmode, NULL);
12646 set_optab_libfunc (unord_optab, HFmode, NULL);
12647 }
12648
12649 /* Target hook for c_mode_for_suffix. */
12650 static machine_mode
12651 aarch64_c_mode_for_suffix (char suffix)
12652 {
12653 if (suffix == 'q')
12654 return TFmode;
12655
12656 return VOIDmode;
12657 }
12658
12659 /* We can only represent floating point constants which will fit in
12660 "quarter-precision" values. These values are characterised by
12661 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
12662 by:
12663
12664 (-1)^s * (n/16) * 2^r
12665
12666 Where:
12667 's' is the sign bit.
12668 'n' is an integer in the range 16 <= n <= 31.
12669 'r' is an integer in the range -3 <= r <= 4. */
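/* For example (illustrative values only): 1.5 = (24/16) * 2^0 and
   0.25 = (16/16) * 2^-2 are representable, whereas 48.0 would need
   (24/16) * 2^5, i.e. r = 5, which is out of range.  The representable
   magnitudes therefore run from 0.125 (16/16 * 2^-3) up to 31.0
   (31/16 * 2^4).  */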
12670
12671 /* Return true iff X can be represented by a quarter-precision
12672 floating point immediate operand. Note, we cannot represent 0.0. */
12673 bool
12674 aarch64_float_const_representable_p (rtx x)
12675 {
12676 /* This represents our current view of how many bits
12677 make up the mantissa. */
12678 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12679 int exponent;
12680 unsigned HOST_WIDE_INT mantissa, mask;
12681 REAL_VALUE_TYPE r, m;
12682 bool fail;
12683
12684 if (!CONST_DOUBLE_P (x))
12685 return false;
12686
12687 /* We don't support HFmode constants yet. */
12688 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12689 return false;
12690
12691 r = *CONST_DOUBLE_REAL_VALUE (x);
12692
12693 /* We cannot represent infinities, NaNs or +/-zero. We won't
12694 know if we have +zero until we analyse the mantissa, but we
12695 can reject the other invalid values. */
12696 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12697 || REAL_VALUE_MINUS_ZERO (r))
12698 return false;
12699
12700 /* Extract exponent. */
12701 r = real_value_abs (&r);
12702 exponent = REAL_EXP (&r);
12703
12704 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12705 highest (sign) bit, with a fixed binary point at bit point_pos.
12706 the low element of W holds the low part of the mantissa, its high element the high part.
12707 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12708 bits for the mantissa, this can fail (low bits will be lost). */
12709 real_ldexp (&m, &r, point_pos - exponent);
12710 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12711
12712 /* If the low part of the mantissa has bits set we cannot represent
12713 the value. */
12714 if (w.ulow () != 0)
12715 return false;
12716 /* We have rejected the lower HOST_WIDE_INT, so update our
12717 understanding of how many bits lie in the mantissa and
12718 look only at the high HOST_WIDE_INT. */
12719 mantissa = w.elt (1);
12720 point_pos -= HOST_BITS_PER_WIDE_INT;
12721
12722 /* We can only represent values with a mantissa of the form 1.xxxx. */
12723 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12724 if ((mantissa & mask) != 0)
12725 return false;
12726
12727 /* Having filtered unrepresentable values, we may now remove all
12728 but the highest 5 bits. */
12729 mantissa >>= point_pos - 5;
12730
12731 /* We cannot represent the value 0.0, so reject it. This is handled
12732 elsewhere. */
12733 if (mantissa == 0)
12734 return false;
12735
12736 /* Then, as bit 4 is always set, we can mask it off, leaving
12737 the mantissa in the range [0, 15]. */
12738 mantissa &= ~(1 << 4);
12739 gcc_assert (mantissa <= 15);
12740
12741 /* GCC internally does not use IEEE754-like encoding (where normalized
12742 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12743 Our mantissa values are shifted 4 places to the left relative to
12744 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12745 by 5 places to correct for GCC's representation. */
12746 exponent = 5 - exponent;
12747
12748 return (exponent >= 0 && exponent <= 7);
12749 }
12750
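/* Return the assembly template for a SIMD immediate move of CONST_VECTOR,
   which has mode MODE and total width WIDTH bits.  The result is written
   into a static buffer, so it must be consumed before the next call.  */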
12751 char*
12752 aarch64_output_simd_mov_immediate (rtx const_vector,
12753 machine_mode mode,
12754 unsigned width)
12755 {
12756 bool is_valid;
12757 static char templ[40];
12758 const char *mnemonic;
12759 const char *shift_op;
12760 unsigned int lane_count = 0;
12761 char element_char;
12762
12763 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12764
12765 /* This will return true to show const_vector is legal for use as an
12766 AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12767 also update INFO to show how the immediate should be generated. */
12768 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12769 gcc_assert (is_valid);
12770
12771 element_char = sizetochar (info.element_width);
12772 lane_count = width / info.element_width;
12773
12774 mode = GET_MODE_INNER (mode);
12775 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12776 {
12777 gcc_assert (info.shift == 0 && ! info.mvn);
12778 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12779 move immediate path. */
12780 if (aarch64_float_const_zero_rtx_p (info.value))
12781 info.value = GEN_INT (0);
12782 else
12783 {
12784 const unsigned int buf_size = 20;
12785 char float_buf[buf_size] = {'\0'};
12786 real_to_decimal_for_mode (float_buf,
12787 CONST_DOUBLE_REAL_VALUE (info.value),
12788 buf_size, buf_size, 1, mode);
12789
12790 if (lane_count == 1)
12791 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12792 else
12793 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12794 lane_count, element_char, float_buf);
12795 return templ;
12796 }
12797 }
12798
12799 mnemonic = info.mvn ? "mvni" : "movi";
12800 shift_op = info.msl ? "msl" : "lsl";
12801
12802 gcc_assert (CONST_INT_P (info.value));
12803 if (lane_count == 1)
12804 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12805 mnemonic, UINTVAL (info.value));
12806 else if (info.shift)
12807 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12808 ", %s %d", mnemonic, lane_count, element_char,
12809 UINTVAL (info.value), shift_op, info.shift);
12810 else
12811 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12812 mnemonic, lane_count, element_char, UINTVAL (info.value));
12813 return templ;
12814 }
12815
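/* As above, but for a scalar IMMEDIATE, which is first duplicated into a
   64-bit vector of the container mode for MODE before the template is
   built.  */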
12816 char*
12817 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12818 machine_mode mode)
12819 {
12820 machine_mode vmode;
12821
12822 gcc_assert (!VECTOR_MODE_P (mode));
12823 vmode = aarch64_simd_container_mode (mode, 64);
12824 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12825 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12826 }
12827
12828 /* Split operands into moves from op[1] + op[2] into op[0]. */
12829
12830 void
12831 aarch64_split_combinev16qi (rtx operands[3])
12832 {
12833 unsigned int dest = REGNO (operands[0]);
12834 unsigned int src1 = REGNO (operands[1]);
12835 unsigned int src2 = REGNO (operands[2]);
12836 machine_mode halfmode = GET_MODE (operands[1]);
12837 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12838 rtx destlo, desthi;
12839
12840 gcc_assert (halfmode == V16QImode);
12841
12842 if (src1 == dest && src2 == dest + halfregs)
12843 {
12844 /* No-op move. Can't split to nothing; emit something. */
12845 emit_note (NOTE_INSN_DELETED);
12846 return;
12847 }
12848
12849 /* Preserve register attributes for variable tracking. */
12850 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12851 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12852 GET_MODE_SIZE (halfmode));
12853
12854 /* Special case of reversed high/low parts. */
12855 if (reg_overlap_mentioned_p (operands[2], destlo)
12856 && reg_overlap_mentioned_p (operands[1], desthi))
12857 {
12858 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12859 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12860 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12861 }
12862 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12863 {
12864 /* Try to avoid unnecessary moves if part of the result
12865 is in the right place already. */
12866 if (src1 != dest)
12867 emit_move_insn (destlo, operands[1]);
12868 if (src2 != dest + halfregs)
12869 emit_move_insn (desthi, operands[2]);
12870 }
12871 else
12872 {
12873 if (src2 != dest + halfregs)
12874 emit_move_insn (desthi, operands[2]);
12875 if (src1 != dest)
12876 emit_move_insn (destlo, operands[1]);
12877 }
12878 }
12879
12880 /* vec_perm support. */
12881
12882 #define MAX_VECT_LEN 16
12883
12884 struct expand_vec_perm_d
12885 {
12886 rtx target, op0, op1;
12887 unsigned char perm[MAX_VECT_LEN];
12888 machine_mode vmode;
12889 unsigned char nelt;
12890 bool one_vector_p;
12891 bool testing_p;
12892 };
12893
12894 /* Generate a variable permutation. */
12895
12896 static void
12897 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12898 {
12899 machine_mode vmode = GET_MODE (target);
12900 bool one_vector_p = rtx_equal_p (op0, op1);
12901
12902 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12903 gcc_checking_assert (GET_MODE (op0) == vmode);
12904 gcc_checking_assert (GET_MODE (op1) == vmode);
12905 gcc_checking_assert (GET_MODE (sel) == vmode);
12906 gcc_checking_assert (TARGET_SIMD);
12907
12908 if (one_vector_p)
12909 {
12910 if (vmode == V8QImode)
12911 {
12912 /* Expand the argument to a V16QI mode by duplicating it. */
12913 rtx pair = gen_reg_rtx (V16QImode);
12914 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12915 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12916 }
12917 else
12918 {
12919 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12920 }
12921 }
12922 else
12923 {
12924 rtx pair;
12925
12926 if (vmode == V8QImode)
12927 {
12928 pair = gen_reg_rtx (V16QImode);
12929 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12930 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12931 }
12932 else
12933 {
12934 pair = gen_reg_rtx (OImode);
12935 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12936 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12937 }
12938 }
12939 }
12940
12941 void
12942 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12943 {
12944 machine_mode vmode = GET_MODE (target);
12945 unsigned int nelt = GET_MODE_NUNITS (vmode);
12946 bool one_vector_p = rtx_equal_p (op0, op1);
12947 rtx mask;
12948
12949 /* The TBL instruction does not use a modulo index, so we must take care
12950 of that ourselves. */
12951 mask = aarch64_simd_gen_const_vector_dup (vmode,
12952 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12953 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12954
12955 /* For big-endian, we also need to reverse the index within the vector
12956 (but not which vector). */
12957 if (BYTES_BIG_ENDIAN)
12958 {
12959 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12960 if (!one_vector_p)
12961 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12962 sel = expand_simple_binop (vmode, XOR, sel, mask,
12963 NULL, 0, OPTAB_LIB_WIDEN);
12964 }
12965 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12966 }
12967
12968 /* Recognize patterns suitable for the TRN instructions. */
12969 static bool
12970 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12971 {
12972 unsigned int i, odd, mask, nelt = d->nelt;
12973 rtx out, in0, in1, x;
12974 rtx (*gen) (rtx, rtx, rtx);
12975 machine_mode vmode = d->vmode;
12976
12977 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12978 return false;
12979
12980 /* Note that these are little-endian tests.
12981 We correct for big-endian later. */
12982 if (d->perm[0] == 0)
12983 odd = 0;
12984 else if (d->perm[0] == 1)
12985 odd = 1;
12986 else
12987 return false;
12988 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12989
12990 for (i = 0; i < nelt; i += 2)
12991 {
12992 if (d->perm[i] != i + odd)
12993 return false;
12994 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12995 return false;
12996 }
12997
12998 /* Success! */
12999 if (d->testing_p)
13000 return true;
13001
13002 in0 = d->op0;
13003 in1 = d->op1;
13004 if (BYTES_BIG_ENDIAN)
13005 {
13006 x = in0, in0 = in1, in1 = x;
13007 odd = !odd;
13008 }
13009 out = d->target;
13010
13011 if (odd)
13012 {
13013 switch (vmode)
13014 {
13015 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
13016 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
13017 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
13018 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
13019 case V4SImode: gen = gen_aarch64_trn2v4si; break;
13020 case V2SImode: gen = gen_aarch64_trn2v2si; break;
13021 case V2DImode: gen = gen_aarch64_trn2v2di; break;
13022 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13023 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13024 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13025 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13026 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
13027 default:
13028 return false;
13029 }
13030 }
13031 else
13032 {
13033 switch (vmode)
13034 {
13035 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
13036 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
13037 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
13038 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
13039 case V4SImode: gen = gen_aarch64_trn1v4si; break;
13040 case V2SImode: gen = gen_aarch64_trn1v2si; break;
13041 case V2DImode: gen = gen_aarch64_trn1v2di; break;
13042 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13043 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13044 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13045 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13046 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
13047 default:
13048 return false;
13049 }
13050 }
13051
13052 emit_insn (gen (out, in0, in1));
13053 return true;
13054 }
13055
13056 /* Recognize patterns suitable for the UZP instructions. */
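/* For example, for V4SI inputs A = {a0, a1, a2, a3} and B = {b0, b1, b2, b3},
   UZP1 gives {a0, a2, b0, b2} (selector 0, 2, 4, 6) and UZP2 gives
   {a1, a3, b1, b3} (selector 1, 3, 5, 7).  */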
13057 static bool
13058 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13059 {
13060 unsigned int i, odd, mask, nelt = d->nelt;
13061 rtx out, in0, in1, x;
13062 rtx (*gen) (rtx, rtx, rtx);
13063 machine_mode vmode = d->vmode;
13064
13065 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13066 return false;
13067
13068 /* Note that these are little-endian tests.
13069 We correct for big-endian later. */
13070 if (d->perm[0] == 0)
13071 odd = 0;
13072 else if (d->perm[0] == 1)
13073 odd = 1;
13074 else
13075 return false;
13076 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13077
13078 for (i = 0; i < nelt; i++)
13079 {
13080 unsigned elt = (i * 2 + odd) & mask;
13081 if (d->perm[i] != elt)
13082 return false;
13083 }
13084
13085 /* Success! */
13086 if (d->testing_p)
13087 return true;
13088
13089 in0 = d->op0;
13090 in1 = d->op1;
13091 if (BYTES_BIG_ENDIAN)
13092 {
13093 x = in0, in0 = in1, in1 = x;
13094 odd = !odd;
13095 }
13096 out = d->target;
13097
13098 if (odd)
13099 {
13100 switch (vmode)
13101 {
13102 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13103 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13104 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13105 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13106 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
13107 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
13108 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
13109 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13110 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13111 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13112 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13113 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13114 default:
13115 return false;
13116 }
13117 }
13118 else
13119 {
13120 switch (vmode)
13121 {
13122 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13123 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13124 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13125 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13126 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
13127 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
13128 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
13129 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13130 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13131 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13132 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13133 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13134 default:
13135 return false;
13136 }
13137 }
13138
13139 emit_insn (gen (out, in0, in1));
13140 return true;
13141 }
13142
13143 /* Recognize patterns suitable for the ZIP instructions. */
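/* For example, for V4SI inputs A = {a0, a1, a2, a3} and B = {b0, b1, b2, b3},
   ZIP1 interleaves the low halves to give {a0, b0, a1, b1} (selector
   0, 4, 1, 5) and ZIP2 interleaves the high halves to give {a2, b2, a3, b3}
   (selector 2, 6, 3, 7).  */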
13144 static bool
13145 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13146 {
13147 unsigned int i, high, mask, nelt = d->nelt;
13148 rtx out, in0, in1, x;
13149 rtx (*gen) (rtx, rtx, rtx);
13150 machine_mode vmode = d->vmode;
13151
13152 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13153 return false;
13154
13155 /* Note that these are little-endian tests.
13156 We correct for big-endian later. */
13157 high = nelt / 2;
13158 if (d->perm[0] == high)
13159 /* Do Nothing. */
13160 ;
13161 else if (d->perm[0] == 0)
13162 high = 0;
13163 else
13164 return false;
13165 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13166
13167 for (i = 0; i < nelt / 2; i++)
13168 {
13169 unsigned elt = (i + high) & mask;
13170 if (d->perm[i * 2] != elt)
13171 return false;
13172 elt = (elt + nelt) & mask;
13173 if (d->perm[i * 2 + 1] != elt)
13174 return false;
13175 }
13176
13177 /* Success! */
13178 if (d->testing_p)
13179 return true;
13180
13181 in0 = d->op0;
13182 in1 = d->op1;
13183 if (BYTES_BIG_ENDIAN)
13184 {
13185 x = in0, in0 = in1, in1 = x;
13186 high = !high;
13187 }
13188 out = d->target;
13189
13190 if (high)
13191 {
13192 switch (vmode)
13193 {
13194 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13195 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13196 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13197 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13198 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13199 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13200 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13201 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13202 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13203 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13204 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13205 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13206 default:
13207 return false;
13208 }
13209 }
13210 else
13211 {
13212 switch (vmode)
13213 {
13214 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13215 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13216 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13217 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13218 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13219 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13220 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13221 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13222 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13223 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13224 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13225 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13226 default:
13227 return false;
13228 }
13229 }
13230
13231 emit_insn (gen (out, in0, in1));
13232 return true;
13233 }
13234
13235 /* Recognize patterns for the EXT insn. */
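/* EXT with immediate N selects NELT consecutive elements starting at element
   N of the concatenated inputs, so the permutation must be N, N+1, ...,
   N+NELT-1, wrapping modulo NELT when both inputs are the same vector.  */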
13236
13237 static bool
13238 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13239 {
13240 unsigned int i, nelt = d->nelt;
13241 rtx (*gen) (rtx, rtx, rtx, rtx);
13242 rtx offset;
13243
13244 unsigned int location = d->perm[0]; /* Always < nelt. */
13245
13246 /* Check if the extracted indices are increasing by one. */
13247 for (i = 1; i < nelt; i++)
13248 {
13249 unsigned int required = location + i;
13250 if (d->one_vector_p)
13251 {
13252 /* We'll pass the same vector in twice, so allow indices to wrap. */
13253 required &= (nelt - 1);
13254 }
13255 if (d->perm[i] != required)
13256 return false;
13257 }
13258
13259 switch (d->vmode)
13260 {
13261 case V16QImode: gen = gen_aarch64_extv16qi; break;
13262 case V8QImode: gen = gen_aarch64_extv8qi; break;
13263 case V4HImode: gen = gen_aarch64_extv4hi; break;
13264 case V8HImode: gen = gen_aarch64_extv8hi; break;
13265 case V2SImode: gen = gen_aarch64_extv2si; break;
13266 case V4SImode: gen = gen_aarch64_extv4si; break;
13267 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13268 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13269 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13270 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13271 case V2DImode: gen = gen_aarch64_extv2di; break;
13272 case V2DFmode: gen = gen_aarch64_extv2df; break;
13273 default:
13274 return false;
13275 }
13276
13277 /* Success! */
13278 if (d->testing_p)
13279 return true;
13280
13281 /* The case where (location == 0) is a no-op for both big- and little-endian,
13282 and is removed by the mid-end at optimization levels -O1 and higher. */
13283
13284 if (BYTES_BIG_ENDIAN && (location != 0))
13285 {
13286 /* After setup, we want the high elements of the first vector (stored
13287 at the LSB end of the register), and the low elements of the second
13288 vector (stored at the MSB end of the register). So swap. */
13289 std::swap (d->op0, d->op1);
13290 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13291 location = nelt - location;
13292 }
13293
13294 offset = GEN_INT (location);
13295 emit_insn (gen (d->target, d->op0, d->op1, offset));
13296 return true;
13297 }
13298
13299 /* Recognize patterns for the REV insns. */
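/* A REV permutation reverses the elements within each group of a fixed
   width, so D->PERM[0] identifies the variant: for example REV64 on V8HI
   is {3, 2, 1, 0, 7, 6, 5, 4} (diff == 3), REV16 on V16QI has diff == 1
   and REV64 on V16QI has diff == 7.  */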
13300
13301 static bool
13302 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13303 {
13304 unsigned int i, j, diff, nelt = d->nelt;
13305 rtx (*gen) (rtx, rtx);
13306
13307 if (!d->one_vector_p)
13308 return false;
13309
13310 diff = d->perm[0];
13311 switch (diff)
13312 {
13313 case 7:
13314 switch (d->vmode)
13315 {
13316 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13317 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13318 default:
13319 return false;
13320 }
13321 break;
13322 case 3:
13323 switch (d->vmode)
13324 {
13325 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13326 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13327 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13328 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13329 default:
13330 return false;
13331 }
13332 break;
13333 case 1:
13334 switch (d->vmode)
13335 {
13336 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13337 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13338 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13339 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13340 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13341 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13342 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13343 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13344 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13345 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13346 default:
13347 return false;
13348 }
13349 break;
13350 default:
13351 return false;
13352 }
13353
13354 for (i = 0; i < nelt ; i += diff + 1)
13355 for (j = 0; j <= diff; j += 1)
13356 {
13357 /* This is guaranteed to be true as the value of diff
13358 is 7, 3 or 1, and we should have enough elements in the
13359 queue to generate this. Getting a vector mask with a
13360 value of diff other than these values implies that
13361 something is wrong by the time we get here. */
13362 gcc_assert (i + j < nelt);
13363 if (d->perm[i + j] != i + diff - j)
13364 return false;
13365 }
13366
13367 /* Success! */
13368 if (d->testing_p)
13369 return true;
13370
13371 emit_insn (gen (d->target, d->op0));
13372 return true;
13373 }
13374
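/* Recognize permutations that broadcast a single element, which map to the
   DUP (element) instruction: every index in D->PERM must be the same.  */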
13375 static bool
13376 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13377 {
13378 rtx (*gen) (rtx, rtx, rtx);
13379 rtx out = d->target;
13380 rtx in0;
13381 machine_mode vmode = d->vmode;
13382 unsigned int i, elt, nelt = d->nelt;
13383 rtx lane;
13384
13385 elt = d->perm[0];
13386 for (i = 1; i < nelt; i++)
13387 {
13388 if (elt != d->perm[i])
13389 return false;
13390 }
13391
13392 /* The generic preparation in aarch64_expand_vec_perm_const_1
13393 swaps the operand order and the permute indices if it finds
13394 d->perm[0] to be in the second operand. Thus, we can always
13395 use d->op0 and need not do any extra arithmetic to get the
13396 correct lane number. */
13397 in0 = d->op0;
13398 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13399
13400 switch (vmode)
13401 {
13402 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13403 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13404 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13405 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13406 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13407 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13408 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13409 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13410 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13411 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13412 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13413 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13414 default:
13415 return false;
13416 }
13417
13418 emit_insn (gen (out, in0, lane));
13419 return true;
13420 }
13421
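/* Fall back to a full table lookup: force D->PERM into a constant selector
   vector and expand through TBL.  This handles any permutation, but is
   usually more expensive than the single-instruction cases above.  */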
13422 static bool
13423 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13424 {
13425 rtx rperm[MAX_VECT_LEN], sel;
13426 machine_mode vmode = d->vmode;
13427 unsigned int i, nelt = d->nelt;
13428
13429 if (d->testing_p)
13430 return true;
13431
13432 /* Generic code will try constant permutation twice: once with the
13433 original mode and again with the elements lowered to QImode.
13434 So wait and don't do the selector expansion ourselves. */
13435 if (vmode != V8QImode && vmode != V16QImode)
13436 return false;
13437
13438 for (i = 0; i < nelt; ++i)
13439 {
13440 int nunits = GET_MODE_NUNITS (vmode);
13441
13442 /* If big-endian and two vectors we end up with a weird mixed-endian
13443 mode on NEON. Reverse the index within each word but not the word
13444 itself. */
13445 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13446 : d->perm[i]);
13447 }
13448 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13449 sel = force_reg (vmode, sel);
13450
13451 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13452 return true;
13453 }
13454
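/* Try to expand the constant permutation described by D, preferring the
   single-instruction patterns (REV, EXT, DUP, ZIP, UZP, TRN) and falling
   back to a generic TBL sequence.  Return true on success.  */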
13455 static bool
13456 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13457 {
13458 /* The pattern matching functions above are written to look for a small
13459 number to begin the sequence (0, 1, N/2). If we begin with an index
13460 from the second operand, we can swap the operands. */
13461 if (d->perm[0] >= d->nelt)
13462 {
13463 unsigned i, nelt = d->nelt;
13464
13465 gcc_assert (nelt == (nelt & -nelt));
13466 for (i = 0; i < nelt; ++i)
13467 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13468
13469 std::swap (d->op0, d->op1);
13470 }
13471
13472 if (TARGET_SIMD)
13473 {
13474 if (aarch64_evpc_rev (d))
13475 return true;
13476 else if (aarch64_evpc_ext (d))
13477 return true;
13478 else if (aarch64_evpc_dup (d))
13479 return true;
13480 else if (aarch64_evpc_zip (d))
13481 return true;
13482 else if (aarch64_evpc_uzp (d))
13483 return true;
13484 else if (aarch64_evpc_trn (d))
13485 return true;
13486 return aarch64_evpc_tbl (d);
13487 }
13488 return false;
13489 }
13490
13491 /* Expand a vec_perm_const pattern. */
13492
13493 bool
13494 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13495 {
13496 struct expand_vec_perm_d d;
13497 int i, nelt, which;
13498
13499 d.target = target;
13500 d.op0 = op0;
13501 d.op1 = op1;
13502
13503 d.vmode = GET_MODE (target);
13504 gcc_assert (VECTOR_MODE_P (d.vmode));
13505 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13506 d.testing_p = false;
13507
13508 for (i = which = 0; i < nelt; ++i)
13509 {
13510 rtx e = XVECEXP (sel, 0, i);
13511 int ei = INTVAL (e) & (2 * nelt - 1);
13512 which |= (ei < nelt ? 1 : 2);
13513 d.perm[i] = ei;
13514 }
13515
13516 switch (which)
13517 {
13518 default:
13519 gcc_unreachable ();
13520
13521 case 3:
13522 d.one_vector_p = false;
13523 if (!rtx_equal_p (op0, op1))
13524 break;
13525
13526 /* The elements of PERM do not suggest that only the first operand
13527 is used, but both operands are identical. Allow easier matching
13528 of the permutation by folding the permutation into the single
13529 input vector. */
13530 /* Fall Through. */
13531 case 2:
13532 for (i = 0; i < nelt; ++i)
13533 d.perm[i] &= nelt - 1;
13534 d.op0 = op1;
13535 d.one_vector_p = true;
13536 break;
13537
13538 case 1:
13539 d.op1 = op0;
13540 d.one_vector_p = true;
13541 break;
13542 }
13543
13544 return aarch64_expand_vec_perm_const_1 (&d);
13545 }
13546
13547 static bool
13548 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13549 const unsigned char *sel)
13550 {
13551 struct expand_vec_perm_d d;
13552 unsigned int i, nelt, which;
13553 bool ret;
13554
13555 d.vmode = vmode;
13556 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13557 d.testing_p = true;
13558 memcpy (d.perm, sel, nelt);
13559
13560 /* Calculate whether all elements are in one vector. */
13561 for (i = which = 0; i < nelt; ++i)
13562 {
13563 unsigned char e = d.perm[i];
13564 gcc_assert (e < 2 * nelt);
13565 which |= (e < nelt ? 1 : 2);
13566 }
13567
13568 /* If all elements are from the second vector, reindex as if from the
13569 first vector. */
13570 if (which == 2)
13571 for (i = 0; i < nelt; ++i)
13572 d.perm[i] -= nelt;
13573
13574 /* Check whether the mask can be applied to a single vector. */
13575 d.one_vector_p = (which != 3);
13576
13577 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13578 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13579 if (!d.one_vector_p)
13580 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13581
13582 start_sequence ();
13583 ret = aarch64_expand_vec_perm_const_1 (&d);
13584 end_sequence ();
13585
13586 return ret;
13587 }
13588
13589 rtx
13590 aarch64_reverse_mask (enum machine_mode mode)
13591 {
13592 /* We have to reverse each vector because we don't have
13593 a permuted load that can reverse-load according to ABI rules. */
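/* For example, for V4SImode the mask is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. a byte reversal within each 32-bit element.  */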
13594 rtx mask;
13595 rtvec v = rtvec_alloc (16);
13596 int i, j;
13597 int nunits = GET_MODE_NUNITS (mode);
13598 int usize = GET_MODE_UNIT_SIZE (mode);
13599
13600 gcc_assert (BYTES_BIG_ENDIAN);
13601 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13602
13603 for (i = 0; i < nunits; i++)
13604 for (j = 0; j < usize; j++)
13605 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13606 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13607 return force_reg (V16QImode, mask);
13608 }
13609
13610 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13611 However, due to issues with register allocation it is preferable to avoid
13612 tying integer scalar and FP scalar modes. Executing integer operations
13613 in general registers is better than treating them as scalar vector
13614 operations. This reduces latency and avoids redundant int<->FP moves.
13615 So tie modes if they are either the same class, or vector modes with
13616 other vector modes, vector structs or any scalar mode.
13617 */
13618
13619 bool
13620 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13621 {
13622 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13623 return true;
13624
13625 /* We specifically want to allow elements of "structure" modes to
13626 be tieable to the structure. This more general condition allows
13627 other rarer situations too. */
13628 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13629 return true;
13630
13631 /* Also allow any scalar modes with vectors. */
13632 if (aarch64_vector_mode_supported_p (mode1)
13633 || aarch64_vector_mode_supported_p (mode2))
13634 return true;
13635
13636 return false;
13637 }
13638
13639 /* Return a new RTX holding the result of moving POINTER forward by
13640 AMOUNT bytes. */
13641
13642 static rtx
13643 aarch64_move_pointer (rtx pointer, int amount)
13644 {
13645 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13646
13647 return adjust_automodify_address (pointer, GET_MODE (pointer),
13648 next, amount);
13649 }
13650
13651 /* Return a new RTX holding the result of moving POINTER forward by the
13652 size of the mode it points to. */
13653
13654 static rtx
13655 aarch64_progress_pointer (rtx pointer)
13656 {
13657 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13658
13659 return aarch64_move_pointer (pointer, amount);
13660 }
13661
13662 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13663 MODE bytes. */
13664
13665 static void
13666 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13667 machine_mode mode)
13668 {
13669 rtx reg = gen_reg_rtx (mode);
13670
13671 /* "Cast" the pointers to the correct mode. */
13672 *src = adjust_address (*src, mode, 0);
13673 *dst = adjust_address (*dst, mode, 0);
13674 /* Emit the memcpy. */
13675 emit_move_insn (reg, *src);
13676 emit_move_insn (*dst, reg);
13677 /* Move the pointers forward. */
13678 *src = aarch64_progress_pointer (*src);
13679 *dst = aarch64_progress_pointer (*dst);
13680 }
13681
13682 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13683 we succeed, otherwise return false. */
13684
13685 bool
13686 aarch64_expand_movmem (rtx *operands)
13687 {
13688 unsigned int n;
13689 rtx dst = operands[0];
13690 rtx src = operands[1];
13691 rtx base;
13692 bool speed_p = !optimize_function_for_size_p (cfun);
13693
13694 /* When optimizing for size, give a better estimate of the length of a
13695 memcpy call, but use the default otherwise. */
13696 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13697
13698 /* We can't do anything smart if the amount to copy is not constant. */
13699 if (!CONST_INT_P (operands[2]))
13700 return false;
13701
13702 n = UINTVAL (operands[2]);
13703
13704 /* Try to keep the number of instructions low. For cases below 16 bytes we
13705 need to make at most two moves. For cases above 16 bytes it will be one
13706 move for each 16 byte chunk, then at most two additional moves. */
13707 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13708 return false;
13709
13710 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13711 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13712
13713 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13714 src = adjust_automodify_address (src, VOIDmode, base, 0);
13715
13716 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13717 1-byte chunk. */
13718 if (n < 4)
13719 {
13720 if (n >= 2)
13721 {
13722 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13723 n -= 2;
13724 }
13725
13726 if (n == 1)
13727 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13728
13729 return true;
13730 }
13731
13732 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13733 4-byte chunk, partially overlapping with the previously copied chunk. */
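/* For example, a 7-byte copy becomes a 4-byte move of bytes 0-3 followed by
   a 4-byte move of bytes 3-6; byte 3 is simply written twice with the same
   value.  */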
13734 if (n < 8)
13735 {
13736 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13737 n -= 4;
13738 if (n > 0)
13739 {
13740 int move = n - 4;
13741
13742 src = aarch64_move_pointer (src, move);
13743 dst = aarch64_move_pointer (dst, move);
13744 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13745 }
13746 return true;
13747 }
13748
13749 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13750 them, then (if applicable) an 8-byte chunk. */
13751 while (n >= 8)
13752 {
13753 if (n / 16)
13754 {
13755 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13756 n -= 16;
13757 }
13758 else
13759 {
13760 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13761 n -= 8;
13762 }
13763 }
13764
13765 /* Finish the final bytes of the copy. We can always do this in one
13766 instruction. We either copy the exact amount we need, or partially
13767 overlap with the previous chunk we copied and copy 8 bytes. */
13768 if (n == 0)
13769 return true;
13770 else if (n == 1)
13771 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13772 else if (n == 2)
13773 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13774 else if (n == 4)
13775 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13776 else
13777 {
13778 if (n == 3)
13779 {
13780 src = aarch64_move_pointer (src, -1);
13781 dst = aarch64_move_pointer (dst, -1);
13782 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13783 }
13784 else
13785 {
13786 int move = n - 8;
13787
13788 src = aarch64_move_pointer (src, move);
13789 dst = aarch64_move_pointer (dst, move);
13790 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13791 }
13792 }
13793
13794 return true;
13795 }
13796
13797 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13798 SImode stores. Handle the case when the constant has identical
13799 bottom and top halves. This is beneficial when the two stores can be
13800 merged into an STP and we avoid synthesising potentially expensive
13801 immediates twice. Return true if such a split is possible. */
13802
13803 bool
13804 aarch64_split_dimode_const_store (rtx dst, rtx src)
13805 {
13806 rtx lo = gen_lowpart (SImode, src);
13807 rtx hi = gen_highpart_mode (SImode, DImode, src);
13808
13809 bool size_p = optimize_function_for_size_p (cfun);
13810
13811 if (!rtx_equal_p (lo, hi))
13812 return false;
13813
13814 unsigned int orig_cost
13815 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13816 unsigned int lo_cost
13817 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13818
13819 /* We want to transform:
13820 MOV x1, 49370
13821 MOVK x1, 0x140, lsl 16
13822 MOVK x1, 0xc0da, lsl 32
13823 MOVK x1, 0x140, lsl 48
13824 STR x1, [x0]
13825 into:
13826 MOV w1, 49370
13827 MOVK w1, 0x140, lsl 16
13828 STP w1, w1, [x0]
13829 So we want to perform this only when we save two instructions
13830 or more. When optimizing for size, however, accept any code size
13831 savings we can. */
13832 if (size_p && orig_cost <= lo_cost)
13833 return false;
13834
13835 if (!size_p
13836 && (orig_cost <= lo_cost + 1))
13837 return false;
13838
13839 rtx mem_lo = adjust_address (dst, SImode, 0);
13840 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13841 return false;
13842
13843 rtx tmp_reg = gen_reg_rtx (SImode);
13844 aarch64_expand_mov_immediate (tmp_reg, lo);
13845 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13846 /* Don't emit an explicit store pair as this may not always be profitable.
13847 Let the sched-fusion logic decide whether to merge them. */
13848 emit_move_insn (mem_lo, tmp_reg);
13849 emit_move_insn (mem_hi, tmp_reg);
13850
13851 return true;
13852 }
13853
13854 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
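/* AddressSanitizer maps an application address A to shadow memory at
   (A >> 3) + this offset, so the AArch64 shadow region starts at 1 << 36.  */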
13855
13856 static unsigned HOST_WIDE_INT
13857 aarch64_asan_shadow_offset (void)
13858 {
13859 return (HOST_WIDE_INT_1 << 36);
13860 }
13861
13862 static bool
13863 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13864 unsigned int align,
13865 enum by_pieces_operation op,
13866 bool speed_p)
13867 {
13868 /* STORE_BY_PIECES can be used when copying a constant string, but
13869 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13870 For now we always fail this and let the move_by_pieces code copy
13871 the string from read-only memory. */
13872 if (op == STORE_BY_PIECES)
13873 return false;
13874
13875 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13876 }
13877
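/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison (CODE applied
   to TREEOP0 and TREEOP1) of a conditional-compare chain: PREP_SEQ receives
   the insns that prepare the operands and GEN_SEQ the compare itself.  Return
   the comparison against the CC register, or NULL_RTX if the operands cannot
   be handled.  */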
13878 static rtx
13879 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13880 int code, tree treeop0, tree treeop1)
13881 {
13882 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13883 rtx op0, op1;
13884 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13885 insn_code icode;
13886 struct expand_operand ops[4];
13887
13888 start_sequence ();
13889 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13890
13891 op_mode = GET_MODE (op0);
13892 if (op_mode == VOIDmode)
13893 op_mode = GET_MODE (op1);
13894
13895 switch (op_mode)
13896 {
13897 case QImode:
13898 case HImode:
13899 case SImode:
13900 cmp_mode = SImode;
13901 icode = CODE_FOR_cmpsi;
13902 break;
13903
13904 case DImode:
13905 cmp_mode = DImode;
13906 icode = CODE_FOR_cmpdi;
13907 break;
13908
13909 case SFmode:
13910 cmp_mode = SFmode;
13911 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13912 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13913 break;
13914
13915 case DFmode:
13916 cmp_mode = DFmode;
13917 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13918 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13919 break;
13920
13921 default:
13922 end_sequence ();
13923 return NULL_RTX;
13924 }
13925
13926 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13927 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13928 if (!op0 || !op1)
13929 {
13930 end_sequence ();
13931 return NULL_RTX;
13932 }
13933 *prep_seq = get_insns ();
13934 end_sequence ();
13935
13936 create_fixed_operand (&ops[0], op0);
13937 create_fixed_operand (&ops[1], op1);
13938
13939 start_sequence ();
13940 if (!maybe_expand_insn (icode, 2, ops))
13941 {
13942 end_sequence ();
13943 return NULL_RTX;
13944 }
13945 *gen_seq = get_insns ();
13946 end_sequence ();
13947
13948 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13949 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13950 }
13951
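/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent compare of a
   conditional-compare chain: PREV is the comparison generated so far,
   CMP_CODE the new comparison of TREEOP0 and TREEOP1, and BIT_CODE the way
   the two are combined (AND or IOR).  For IOR the previous condition and the
   AArch64 condition code are inverted before emitting the CCMP pattern.  */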
13952 static rtx
13953 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13954 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13955 {
13956 rtx op0, op1, target;
13957 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13958 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13959 insn_code icode;
13960 struct expand_operand ops[6];
13961 int aarch64_cond;
13962
13963 push_to_sequence (*prep_seq);
13964 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13965
13966 op_mode = GET_MODE (op0);
13967 if (op_mode == VOIDmode)
13968 op_mode = GET_MODE (op1);
13969
13970 switch (op_mode)
13971 {
13972 case QImode:
13973 case HImode:
13974 case SImode:
13975 cmp_mode = SImode;
13976 icode = CODE_FOR_ccmpsi;
13977 break;
13978
13979 case DImode:
13980 cmp_mode = DImode;
13981 icode = CODE_FOR_ccmpdi;
13982 break;
13983
13984 case SFmode:
13985 cmp_mode = SFmode;
13986 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13987 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13988 break;
13989
13990 case DFmode:
13991 cmp_mode = DFmode;
13992 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13993 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13994 break;
13995
13996 default:
13997 end_sequence ();
13998 return NULL_RTX;
13999 }
14000
14001 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14002 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14003 if (!op0 || !op1)
14004 {
14005 end_sequence ();
14006 return NULL_RTX;
14007 }
14008 *prep_seq = get_insns ();
14009 end_sequence ();
14010
14011 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14012 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14013
14014 if (bit_code != AND)
14015 {
14016 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14017 GET_MODE (XEXP (prev, 0))),
14018 VOIDmode, XEXP (prev, 0), const0_rtx);
14019 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14020 }
14021
14022 create_fixed_operand (&ops[0], XEXP (prev, 0));
14023 create_fixed_operand (&ops[1], target);
14024 create_fixed_operand (&ops[2], op0);
14025 create_fixed_operand (&ops[3], op1);
14026 create_fixed_operand (&ops[4], prev);
14027 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14028
14029 push_to_sequence (*gen_seq);
14030 if (!maybe_expand_insn (icode, 6, ops))
14031 {
14032 end_sequence ();
14033 return NULL_RTX;
14034 }
14035
14036 *gen_seq = get_insns ();
14037 end_sequence ();
14038
14039 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14040 }
14041
14042 #undef TARGET_GEN_CCMP_FIRST
14043 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14044
14045 #undef TARGET_GEN_CCMP_NEXT
14046 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14047
14048 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14049 instruction fusion of some sort. */
14050
14051 static bool
14052 aarch64_macro_fusion_p (void)
14053 {
14054 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14055 }
14056
14057
14058 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14059 should be kept together during scheduling. */
14060
14061 static bool
14062 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14063 {
14064 rtx set_dest;
14065 rtx prev_set = single_set (prev);
14066 rtx curr_set = single_set (curr);
14067 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
14068 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14069
14070 if (!aarch64_macro_fusion_p ())
14071 return false;
14072
14073 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14074 {
14075 /* We are trying to match:
14076 prev (mov) == (set (reg r0) (const_int imm16))
14077 curr (movk) == (set (zero_extract (reg r0)
14078 (const_int 16)
14079 (const_int 16))
14080 (const_int imm16_1)) */
14081
14082 set_dest = SET_DEST (curr_set);
14083
14084 if (GET_CODE (set_dest) == ZERO_EXTRACT
14085 && CONST_INT_P (SET_SRC (curr_set))
14086 && CONST_INT_P (SET_SRC (prev_set))
14087 && CONST_INT_P (XEXP (set_dest, 2))
14088 && INTVAL (XEXP (set_dest, 2)) == 16
14089 && REG_P (XEXP (set_dest, 0))
14090 && REG_P (SET_DEST (prev_set))
14091 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14092 {
14093 return true;
14094 }
14095 }
14096
14097 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14098 {
14099
14100 /* We're trying to match:
14101 prev (adrp) == (set (reg r1)
14102 (high (symbol_ref ("SYM"))))
14103 curr (add) == (set (reg r0)
14104 (lo_sum (reg r1)
14105 (symbol_ref ("SYM"))))
14106 Note that r0 need not necessarily be the same as r1, especially
14107 during pre-regalloc scheduling. */
14108
14109 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14110 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14111 {
14112 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14113 && REG_P (XEXP (SET_SRC (curr_set), 0))
14114 && REGNO (XEXP (SET_SRC (curr_set), 0))
14115 == REGNO (SET_DEST (prev_set))
14116 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14117 XEXP (SET_SRC (curr_set), 1)))
14118 return true;
14119 }
14120 }
14121
14122 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14123 {
14124
14125 /* We're trying to match:
14126 prev (movk) == (set (zero_extract (reg r0)
14127 (const_int 16)
14128 (const_int 32))
14129 (const_int imm16_1))
14130 curr (movk) == (set (zero_extract (reg r0)
14131 (const_int 16)
14132 (const_int 48))
14133 (const_int imm16_2)) */
14134
14135 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14136 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14137 && REG_P (XEXP (SET_DEST (prev_set), 0))
14138 && REG_P (XEXP (SET_DEST (curr_set), 0))
14139 && REGNO (XEXP (SET_DEST (prev_set), 0))
14140 == REGNO (XEXP (SET_DEST (curr_set), 0))
14141 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14142 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14143 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14144 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14145 && CONST_INT_P (SET_SRC (prev_set))
14146 && CONST_INT_P (SET_SRC (curr_set)))
14147 return true;
14148
14149 }
14150 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14151 {
14152 /* We're trying to match:
14153 prev (adrp) == (set (reg r0)
14154 (high (symbol_ref ("SYM"))))
14155 curr (ldr) == (set (reg r1)
14156 (mem (lo_sum (reg r0)
14157 (symbol_ref ("SYM")))))
14158 or
14159 curr (ldr) == (set (reg r1)
14160 (zero_extend (mem
14161 (lo_sum (reg r0)
14162 (symbol_ref ("SYM")))))) */
14163 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14164 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14165 {
14166 rtx curr_src = SET_SRC (curr_set);
14167
14168 if (GET_CODE (curr_src) == ZERO_EXTEND)
14169 curr_src = XEXP (curr_src, 0);
14170
14171 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14172 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14173 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14174 == REGNO (SET_DEST (prev_set))
14175 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14176 XEXP (SET_SRC (prev_set), 0)))
14177 return true;
14178 }
14179 }
14180
14181 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14182 && aarch_crypto_can_dual_issue (prev, curr))
14183 return true;
14184
14185 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14186 && any_condjump_p (curr))
14187 {
14188 enum attr_type prev_type = get_attr_type (prev);
14189
14190 /* FIXME: this misses some cases that ThunderX considers simple
14191 arithmetic instructions. Simple shifts are missed here. */
14192 if (prev_type == TYPE_ALUS_SREG
14193 || prev_type == TYPE_ALUS_IMM
14194 || prev_type == TYPE_LOGICS_REG
14195 || prev_type == TYPE_LOGICS_IMM)
14196 return true;
14197 }
14198
14199 return false;
14200 }
14201
14202 /* Return true iff the instruction fusion described by OP is enabled. */
14203
14204 bool
14205 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14206 {
14207 return (aarch64_tune_params.fusible_ops & op) != 0;
14208 }
14209
14210 /* If MEM is in the form of [base+offset], extract the two parts
14211 of the address into BASE and OFFSET, otherwise return false
14212 after clearing BASE and OFFSET. */
14213
14214 bool
14215 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14216 {
14217 rtx addr;
14218
14219 gcc_assert (MEM_P (mem));
14220
14221 addr = XEXP (mem, 0);
14222
14223 if (REG_P (addr))
14224 {
14225 *base = addr;
14226 *offset = const0_rtx;
14227 return true;
14228 }
14229
14230 if (GET_CODE (addr) == PLUS
14231 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14232 {
14233 *base = XEXP (addr, 0);
14234 *offset = XEXP (addr, 1);
14235 return true;
14236 }
14237
14238 *base = NULL_RTX;
14239 *offset = NULL_RTX;
14240
14241 return false;
14242 }
14243
14244 /* Types for scheduling fusion. */
14245 enum sched_fusion_type
14246 {
14247 SCHED_FUSION_NONE = 0,
14248 SCHED_FUSION_LD_SIGN_EXTEND,
14249 SCHED_FUSION_LD_ZERO_EXTEND,
14250 SCHED_FUSION_LD,
14251 SCHED_FUSION_ST,
14252 SCHED_FUSION_NUM
14253 };
14254
14255 /* If INSN is a load or store with an address in the form of [base+offset],
14256 extract the two parts into BASE and OFFSET. Return the scheduling
14257 fusion type of this INSN. */
14258
14259 static enum sched_fusion_type
14260 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14261 {
14262 rtx x, dest, src;
14263 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14264
14265 gcc_assert (INSN_P (insn));
14266 x = PATTERN (insn);
14267 if (GET_CODE (x) != SET)
14268 return SCHED_FUSION_NONE;
14269
14270 src = SET_SRC (x);
14271 dest = SET_DEST (x);
14272
14273 machine_mode dest_mode = GET_MODE (dest);
14274
14275 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14276 return SCHED_FUSION_NONE;
14277
14278 if (GET_CODE (src) == SIGN_EXTEND)
14279 {
14280 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14281 src = XEXP (src, 0);
14282 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14283 return SCHED_FUSION_NONE;
14284 }
14285 else if (GET_CODE (src) == ZERO_EXTEND)
14286 {
14287 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14288 src = XEXP (src, 0);
14289 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14290 return SCHED_FUSION_NONE;
14291 }
14292
14293 if (GET_CODE (src) == MEM && REG_P (dest))
14294 extract_base_offset_in_addr (src, base, offset);
14295 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14296 {
14297 fusion = SCHED_FUSION_ST;
14298 extract_base_offset_in_addr (dest, base, offset);
14299 }
14300 else
14301 return SCHED_FUSION_NONE;
14302
14303 if (*base == NULL_RTX || *offset == NULL_RTX)
14304 fusion = SCHED_FUSION_NONE;
14305
14306 return fusion;
14307 }
14308
14309 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14310
14311 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14312 and PRI are only calculated for these instructions. For other instructions,
14313 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14314 types of instruction fusion can be added by returning different priorities.
14315
14316 It's important that irrelevant instructions get the largest FUSION_PRI. */
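/* For example, two SImode loads from [x1, 8] and [x1, 12] get the same
   FUSION_PRI (same fusion type and base register) but PRI values that order
   them by offset, so the scheduler can keep them adjacent for the ldp
   peephole patterns.  */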
14317
14318 static void
14319 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14320 int *fusion_pri, int *pri)
14321 {
14322 int tmp, off_val;
14323 rtx base, offset;
14324 enum sched_fusion_type fusion;
14325
14326 gcc_assert (INSN_P (insn));
14327
14328 tmp = max_pri - 1;
14329 fusion = fusion_load_store (insn, &base, &offset);
14330 if (fusion == SCHED_FUSION_NONE)
14331 {
14332 *pri = tmp;
14333 *fusion_pri = tmp;
14334 return;
14335 }
14336
14337 /* Set FUSION_PRI according to fusion type and base register. */
14338 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14339
14340 /* Calculate PRI. */
14341 tmp /= 2;
14342
14343 /* INSN with smaller offset goes first. */
14344 off_val = (int)(INTVAL (offset));
14345 if (off_val >= 0)
14346 tmp -= (off_val & 0xfffff);
14347 else
14348 tmp += ((- off_val) & 0xfffff);
14349
14350 *pri = tmp;
14351 return;
14352 }
14353
14354 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14355 Adjust priority of sha1h instructions so they are scheduled before
14356 other SHA1 instructions. */
14357
14358 static int
14359 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14360 {
14361 rtx x = PATTERN (insn);
14362
14363 if (GET_CODE (x) == SET)
14364 {
14365 x = SET_SRC (x);
14366
14367 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14368 return priority + 10;
14369 }
14370
14371 return priority;
14372 }
14373
14374 /* Given OPERANDS of consecutive load/store, check if we can merge
14375 them into ldp/stp. LOAD is true if they are load instructions.
14376 MODE is the mode of memory operands. */
14377
14378 bool
14379 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14380 enum machine_mode mode)
14381 {
14382 HOST_WIDE_INT offval_1, offval_2, msize;
14383 enum reg_class rclass_1, rclass_2;
14384 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14385
14386 if (load)
14387 {
14388 mem_1 = operands[1];
14389 mem_2 = operands[3];
14390 reg_1 = operands[0];
14391 reg_2 = operands[2];
14392 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14393 if (REGNO (reg_1) == REGNO (reg_2))
14394 return false;
14395 }
14396 else
14397 {
14398 mem_1 = operands[0];
14399 mem_2 = operands[2];
14400 reg_1 = operands[1];
14401 reg_2 = operands[3];
14402 }
14403
14404 /* The mems cannot be volatile. */
14405 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14406 return false;
14407
14408 /* If we have SImode and slow unaligned ldp,
14409 check that the alignment is at least 8 bytes. */
14410 if (mode == SImode
14411 && (aarch64_tune_params.extra_tuning_flags
14412 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14413 && !optimize_size
14414 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14415 return false;
14416
14417 /* Check if the addresses are in the form of [base+offset]. */
14418 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14419 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14420 return false;
14421 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14422 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14423 return false;
14424
14425 /* Check if the bases are same. */
14426 if (!rtx_equal_p (base_1, base_2))
14427 return false;
14428
14429 offval_1 = INTVAL (offset_1);
14430 offval_2 = INTVAL (offset_2);
14431 msize = GET_MODE_SIZE (mode);
14432 /* Check if the offsets are consecutive. */
14433 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14434 return false;
14435
14436 /* Check if the addresses are clobbered by load. */
14437 if (load)
14438 {
14439 if (reg_mentioned_p (reg_1, mem_1))
14440 return false;
14441
14442 /* In increasing order, the last load can clobber the address. */
14443 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14444 return false;
14445 }
14446
14447 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14448 rclass_1 = FP_REGS;
14449 else
14450 rclass_1 = GENERAL_REGS;
14451
14452 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14453 rclass_2 = FP_REGS;
14454 else
14455 rclass_2 = GENERAL_REGS;
14456
14457 /* Check if the registers are of same class. */
14458 if (rclass_1 != rclass_2)
14459 return false;
14460
14461 return true;
14462 }
14463
14464 /* Given OPERANDS of consecutive load/store, check if we can merge
14465 them into ldp/stp by adjusting the offset. LOAD is true if they
14466 are load instructions. MODE is the mode of memory operands.
14467
14468 Given below consecutive stores:
14469
14470 str w1, [xb, 0x100]
14471 str w1, [xb, 0x104]
14472 str w1, [xb, 0x108]
14473 str w1, [xb, 0x10c]
14474
14475 Though the offsets are out of the range supported by stp, we can
14476 still pair them after adjusting the offset, like:
14477
14478 add scratch, xb, 0x100
14479 stp w1, w1, [scratch]
14480 stp w1, w1, [scratch, 0x8]
14481
14482 The peephole patterns detecting this opportunity should guarantee
14483 the scratch register is available. */
14484
14485 bool
14486 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14487 enum machine_mode mode)
14488 {
14489 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14490 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14491 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14492 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14493
14494 if (load)
14495 {
14496 reg_1 = operands[0];
14497 mem_1 = operands[1];
14498 reg_2 = operands[2];
14499 mem_2 = operands[3];
14500 reg_3 = operands[4];
14501 mem_3 = operands[5];
14502 reg_4 = operands[6];
14503 mem_4 = operands[7];
14504 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14505 && REG_P (reg_3) && REG_P (reg_4));
14506 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14507 return false;
14508 }
14509 else
14510 {
14511 mem_1 = operands[0];
14512 reg_1 = operands[1];
14513 mem_2 = operands[2];
14514 reg_2 = operands[3];
14515 mem_3 = operands[4];
14516 reg_3 = operands[5];
14517 mem_4 = operands[6];
14518 reg_4 = operands[7];
14519 }
14520 /* Skip if memory operand is by itself valid for ldp/stp. */
14521 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14522 return false;
14523
14524 /* The mems cannot be volatile. */
14525 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14526 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14527 return false;
14528
14529 /* Check if the addresses are in the form of [base+offset]. */
14530 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14531 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14532 return false;
14533 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14534 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14535 return false;
14536 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14537 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14538 return false;
14539 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14540 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14541 return false;
14542
14543 /* Check if the bases are same. */
14544 if (!rtx_equal_p (base_1, base_2)
14545 || !rtx_equal_p (base_2, base_3)
14546 || !rtx_equal_p (base_3, base_4))
14547 return false;
14548
14549 offval_1 = INTVAL (offset_1);
14550 offval_2 = INTVAL (offset_2);
14551 offval_3 = INTVAL (offset_3);
14552 offval_4 = INTVAL (offset_4);
14553 msize = GET_MODE_SIZE (mode);
14554 /* Check if the offsets are consecutive. */
14555 if ((offval_1 != (offval_2 + msize)
14556 || offval_1 != (offval_3 + msize * 2)
14557 || offval_1 != (offval_4 + msize * 3))
14558 && (offval_4 != (offval_3 + msize)
14559 || offval_4 != (offval_2 + msize * 2)
14560 || offval_4 != (offval_1 + msize * 3)))
14561 return false;
14562
14563 /* Check if the addresses are clobbered by load. */
14564 if (load)
14565 {
14566 if (reg_mentioned_p (reg_1, mem_1)
14567 || reg_mentioned_p (reg_2, mem_2)
14568 || reg_mentioned_p (reg_3, mem_3))
14569 return false;
14570
14571 /* In increasing order, the last load can clobber the address. */
14572 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14573 return false;
14574 }
14575
14576 /* If we have SImode and slow unaligned ldp,
14577 check that the alignment is at least 8 bytes. */
14578 if (mode == SImode
14579 && (aarch64_tune_params.extra_tuning_flags
14580 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14581 && !optimize_size
14582 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14583 return false;
14584
14585 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14586 rclass_1 = FP_REGS;
14587 else
14588 rclass_1 = GENERAL_REGS;
14589
14590 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14591 rclass_2 = FP_REGS;
14592 else
14593 rclass_2 = GENERAL_REGS;
14594
14595 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14596 rclass_3 = FP_REGS;
14597 else
14598 rclass_3 = GENERAL_REGS;
14599
14600 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14601 rclass_4 = FP_REGS;
14602 else
14603 rclass_4 = GENERAL_REGS;
14604
14605 /* Check if the registers are of same class. */
14606 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14607 return false;
14608
14609 return true;
14610 }
14611
14612 /* Given OPERANDS of consecutive load/store, this function pairs them
14613 into ldp/stp after adjusting the offset. It depends on the fact
14614 that addresses of load/store instructions are in increasing order.
14615 MODE is the mode of memory operands. CODE is the rtl operator
14616 which should be applied to all memory operands, it's SIGN_EXTEND,
14617 ZERO_EXTEND or UNKNOWN. */
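/* For example, for the SImode stores at offsets 0x100-0x10c shown above
   aarch64_operands_adjust_ok_for_ldpstp, the limit is 0x40 * 4 == 0x100,
   so adj_off becomes 0x100 and new_off 0, giving the add/stp sequence in
   that comment.  */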
14618
14619 bool
14620 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14621 enum machine_mode mode, RTX_CODE code)
14622 {
14623 rtx base, offset, t1, t2;
14624 rtx mem_1, mem_2, mem_3, mem_4;
14625 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14626
14627 if (load)
14628 {
14629 mem_1 = operands[1];
14630 mem_2 = operands[3];
14631 mem_3 = operands[5];
14632 mem_4 = operands[7];
14633 }
14634 else
14635 {
14636 mem_1 = operands[0];
14637 mem_2 = operands[2];
14638 mem_3 = operands[4];
14639 mem_4 = operands[6];
14640 gcc_assert (code == UNKNOWN);
14641 }
14642
14643 extract_base_offset_in_addr (mem_1, &base, &offset);
14644 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14645
14646 /* Adjust the offset so it can fit in an ldp/stp instruction. */
14647 msize = GET_MODE_SIZE (mode);
14648 stp_off_limit = msize * 0x40;
14649 off_val = INTVAL (offset);
14650 abs_off = (off_val < 0) ? -off_val : off_val;
14651 new_off = abs_off % stp_off_limit;
14652 adj_off = abs_off - new_off;
14653
14654 /* Further adjust to make sure all offsets are OK. */
14655 if ((new_off + msize * 2) >= stp_off_limit)
14656 {
14657 adj_off += stp_off_limit;
14658 new_off -= stp_off_limit;
14659 }
14660
14661 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14662 if (adj_off >= 0x1000)
14663 return false;
14664
14665 if (off_val < 0)
14666 {
14667 adj_off = -adj_off;
14668 new_off = -new_off;
14669 }
14670
14671 /* Create new memory references. */
14672 mem_1 = change_address (mem_1, VOIDmode,
14673 plus_constant (DImode, operands[8], new_off));
14674
14675 /* Check if the adjusted address is OK for ldp/stp. */
14676 if (!aarch64_mem_pair_operand (mem_1, mode))
14677 return false;
14678
14679 msize = GET_MODE_SIZE (mode);
14680 mem_2 = change_address (mem_2, VOIDmode,
14681 plus_constant (DImode,
14682 operands[8],
14683 new_off + msize));
14684 mem_3 = change_address (mem_3, VOIDmode,
14685 plus_constant (DImode,
14686 operands[8],
14687 new_off + msize * 2));
14688 mem_4 = change_address (mem_4, VOIDmode,
14689 plus_constant (DImode,
14690 operands[8],
14691 new_off + msize * 3));
14692
14693 if (code == ZERO_EXTEND)
14694 {
14695 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14696 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14697 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14698 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14699 }
14700 else if (code == SIGN_EXTEND)
14701 {
14702 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14703 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14704 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14705 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14706 }
14707
14708 if (load)
14709 {
14710 operands[1] = mem_1;
14711 operands[3] = mem_2;
14712 operands[5] = mem_3;
14713 operands[7] = mem_4;
14714 }
14715 else
14716 {
14717 operands[0] = mem_1;
14718 operands[2] = mem_2;
14719 operands[4] = mem_3;
14720 operands[6] = mem_4;
14721 }
14722
14723 /* Emit adjusting instruction. */
14724 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14725 /* Emit ldp/stp instructions. */
14726 t1 = gen_rtx_SET (operands[0], operands[1]);
14727 t2 = gen_rtx_SET (operands[2], operands[3]);
14728 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14729 t1 = gen_rtx_SET (operands[4], operands[5]);
14730 t2 = gen_rtx_SET (operands[6], operands[7]);
14731 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14732 return true;
14733 }
14734
14735 /* Return 1 if a pseudo register should be created and used to hold
14736 the GOT address for PIC code. */
14737
14738 bool
14739 aarch64_use_pseudo_pic_reg (void)
14740 {
14741 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14742 }
14743
14744 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14745
14746 static int
14747 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14748 {
14749 switch (XINT (x, 1))
14750 {
14751 case UNSPEC_GOTSMALLPIC:
14752 case UNSPEC_GOTSMALLPIC28K:
14753 case UNSPEC_GOTTINYPIC:
14754 return 0;
14755 default:
14756 break;
14757 }
14758
14759 return default_unspec_may_trap_p (x, flags);
14760 }
14761
14762
14763 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14764 return the log2 of that value. Otherwise return -1. */
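/* For example, a CONST_DOUBLE of 4.0 yields 2, while 0.75, 3.0 and -2.0 all
   yield -1.  */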
14765
14766 int
14767 aarch64_fpconst_pow_of_2 (rtx x)
14768 {
14769 const REAL_VALUE_TYPE *r;
14770
14771 if (!CONST_DOUBLE_P (x))
14772 return -1;
14773
14774 r = CONST_DOUBLE_REAL_VALUE (x);
14775
14776 if (REAL_VALUE_NEGATIVE (*r)
14777 || REAL_VALUE_ISNAN (*r)
14778 || REAL_VALUE_ISINF (*r)
14779 || !real_isinteger (r, DFmode))
14780 return -1;
14781
14782 return exact_log2 (real_to_integer (r));
14783 }
14784
14785 /* If X is a vector of equal CONST_DOUBLE values and that value is
14786 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14787
14788 int
14789 aarch64_vec_fpconst_pow_of_2 (rtx x)
14790 {
14791 if (GET_CODE (x) != CONST_VECTOR)
14792 return -1;
14793
14794 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14795 return -1;
14796
14797 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14798 if (firstval <= 0)
14799 return -1;
14800
14801 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14802 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14803 return -1;
14804
14805 return firstval;
14806 }
14807
14808 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14809 to float.
14810
14811 __fp16 always promotes through this hook.
14812 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14813 through the generic excess precision logic rather than here. */
14814
14815 static tree
14816 aarch64_promoted_type (const_tree t)
14817 {
14818 if (SCALAR_FLOAT_TYPE_P (t)
14819 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14820 return float_type_node;
14821
14822 return NULL_TREE;
14823 }
14824
14825 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14826
14827 static bool
14828 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14829 optimization_type opt_type)
14830 {
14831 switch (op)
14832 {
14833 case rsqrt_optab:
14834 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14835
14836 default:
14837 return true;
14838 }
14839 }
14840
14841 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
14842 if MODE is HFmode, and punt to the generic implementation otherwise. */
14843
14844 static bool
14845 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14846 {
14847 return (mode == HFmode
14848 ? true
14849 : default_libgcc_floating_mode_supported_p (mode));
14850 }
14851
14852 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14853 if MODE is HFmode, and punt to the generic implementation otherwise. */
14854
14855 static bool
14856 aarch64_scalar_mode_supported_p (machine_mode mode)
14857 {
14858 return (mode == HFmode
14859 ? true
14860 : default_scalar_mode_supported_p (mode));
14861 }
14862
14863 /* Set the value of FLT_EVAL_METHOD.
14864 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14865
14866 0: evaluate all operations and constants, whose semantic type has at
14867 most the range and precision of type float, to the range and
14868 precision of float; evaluate all other operations and constants to
14869 the range and precision of the semantic type;
14870
14871 N, where _FloatN is a supported interchange floating type:
14872 evaluate all operations and constants, whose semantic type has at
14873 most the range and precision of the _FloatN type, to the range and
14874 precision of the _FloatN type; evaluate all other operations and
14875 constants to the range and precision of the semantic type;
14876
14877 If we have the ARMv8.2-A extensions then we support _Float16 in native
14878 precision, so we should set this to 16. Otherwise, we support the type,
14879 but want to evaluate expressions in float precision, so set this to
14880 0. */
14881
14882 static enum flt_eval_method
14883 aarch64_excess_precision (enum excess_precision_type type)
14884 {
14885 switch (type)
14886 {
14887 case EXCESS_PRECISION_TYPE_FAST:
14888 case EXCESS_PRECISION_TYPE_STANDARD:
14889 /* We can calculate either in 16-bit range and precision or
14890 32-bit range and precision. Make that decision based on whether
14891 we have native support for the ARMv8.2-A 16-bit floating-point
14892 instructions or not. */
14893 return (TARGET_FP_F16INST
14894 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14895 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14896 case EXCESS_PRECISION_TYPE_IMPLICIT:
14897 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14898 default:
14899 gcc_unreachable ();
14900 }
14901 return FLT_EVAL_METHOD_UNPREDICTABLE;
14902 }
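/* Illustrative summary of the hook above (added commentary): when the
   ARMv8.2-A half-precision instructions are available (TARGET_FP_F16INST),
   the standard and fast cases report FLT_EVAL_METHOD == 16 and _Float16
   arithmetic is evaluated in half precision; otherwise they report 0 and
   _Float16 operations are evaluated in float.  */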
14903
14904 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
14905 scheduled for speculative execution. Reject the long-running division
14906 and square-root instructions. */
14907
14908 static bool
14909 aarch64_sched_can_speculate_insn (rtx_insn *insn)
14910 {
14911 switch (get_attr_type (insn))
14912 {
14913 case TYPE_SDIV:
14914 case TYPE_UDIV:
14915 case TYPE_FDIVS:
14916 case TYPE_FDIVD:
14917 case TYPE_FSQRTS:
14918 case TYPE_FSQRTD:
14919 case TYPE_NEON_FP_SQRT_S:
14920 case TYPE_NEON_FP_SQRT_D:
14921 case TYPE_NEON_FP_SQRT_S_Q:
14922 case TYPE_NEON_FP_SQRT_D_Q:
14923 case TYPE_NEON_FP_DIV_S:
14924 case TYPE_NEON_FP_DIV_D:
14925 case TYPE_NEON_FP_DIV_S_Q:
14926 case TYPE_NEON_FP_DIV_D_Q:
14927 return false;
14928 default:
14929 return true;
14930 }
14931 }
14932
14933 /* Target-specific selftests. */
14934
14935 #if CHECKING_P
14936
14937 namespace selftest {
14938
14939 /* Selftest for the RTL loader.
14940 Verify that the RTL loader copes with a dump from
14941 print_rtx_function. This is essentially just a test that class
14942 function_reader can handle a real dump, but it also verifies
14943 that lookup_reg_by_dump_name correctly handles hard regs.
14944 The presence of hard reg names in the dump means that the test is
14945 target-specific, hence it is in this file. */
14946
14947 static void
14948 aarch64_test_loading_full_dump ()
14949 {
14950 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14951
14952 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14953
14954 rtx_insn *insn_1 = get_insn_by_uid (1);
14955 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14956
14957 rtx_insn *insn_15 = get_insn_by_uid (15);
14958 ASSERT_EQ (INSN, GET_CODE (insn_15));
14959 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14960
14961 /* Verify crtl->return_rtx. */
14962 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14963 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14964 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14965 }
14966
14967 /* Run all target-specific selftests. */
14968
14969 static void
14970 aarch64_run_selftests (void)
14971 {
14972 aarch64_test_loading_full_dump ();
14973 }
14974
14975 } // namespace selftest
14976
14977 #endif /* #if CHECKING_P */
14978
14979 #undef TARGET_ADDRESS_COST
14980 #define TARGET_ADDRESS_COST aarch64_address_cost
14981
14982 /* This hook determines whether unnamed bitfields affect the alignment
14983 of the containing structure. The hook returns true if the structure
14984 should inherit the alignment requirements of an unnamed bitfield's
14985 type. */
14986 #undef TARGET_ALIGN_ANON_BITFIELD
14987 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14988
14989 #undef TARGET_ASM_ALIGNED_DI_OP
14990 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14991
14992 #undef TARGET_ASM_ALIGNED_HI_OP
14993 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14994
14995 #undef TARGET_ASM_ALIGNED_SI_OP
14996 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14997
14998 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14999 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15000 hook_bool_const_tree_hwi_hwi_const_tree_true
15001
15002 #undef TARGET_ASM_FILE_START
15003 #define TARGET_ASM_FILE_START aarch64_start_file
15004
15005 #undef TARGET_ASM_OUTPUT_MI_THUNK
15006 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15007
15008 #undef TARGET_ASM_SELECT_RTX_SECTION
15009 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15010
15011 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15012 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15013
15014 #undef TARGET_BUILD_BUILTIN_VA_LIST
15015 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15016
15017 #undef TARGET_CALLEE_COPIES
15018 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15019
15020 #undef TARGET_CAN_ELIMINATE
15021 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15022
15023 #undef TARGET_CAN_INLINE_P
15024 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15025
15026 #undef TARGET_CANNOT_FORCE_CONST_MEM
15027 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15028
15029 #undef TARGET_CASE_VALUES_THRESHOLD
15030 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15031
15032 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15033 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15034
15035 /* Only the least significant bit is used for initialization guard
15036 variables. */
15037 #undef TARGET_CXX_GUARD_MASK_BIT
15038 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15039
15040 #undef TARGET_C_MODE_FOR_SUFFIX
15041 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15042
15043 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15044 #undef TARGET_DEFAULT_TARGET_FLAGS
15045 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15046 #endif
15047
15048 #undef TARGET_CLASS_MAX_NREGS
15049 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15050
15051 #undef TARGET_BUILTIN_DECL
15052 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15053
15054 #undef TARGET_BUILTIN_RECIPROCAL
15055 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15056
15057 #undef TARGET_C_EXCESS_PRECISION
15058 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15059
15060 #undef TARGET_EXPAND_BUILTIN
15061 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15062
15063 #undef TARGET_EXPAND_BUILTIN_VA_START
15064 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15065
15066 #undef TARGET_FOLD_BUILTIN
15067 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15068
15069 #undef TARGET_FUNCTION_ARG
15070 #define TARGET_FUNCTION_ARG aarch64_function_arg
15071
15072 #undef TARGET_FUNCTION_ARG_ADVANCE
15073 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15074
15075 #undef TARGET_FUNCTION_ARG_BOUNDARY
15076 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15077
15078 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15079 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15080
15081 #undef TARGET_FUNCTION_VALUE
15082 #define TARGET_FUNCTION_VALUE aarch64_function_value
15083
15084 #undef TARGET_FUNCTION_VALUE_REGNO_P
15085 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15086
15087 #undef TARGET_FRAME_POINTER_REQUIRED
15088 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15089
15090 #undef TARGET_GIMPLE_FOLD_BUILTIN
15091 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15092
15093 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15094 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15095
15096 #undef TARGET_INIT_BUILTINS
15097 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15098
15099 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15100 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15101 aarch64_ira_change_pseudo_allocno_class
15102
15103 #undef TARGET_LEGITIMATE_ADDRESS_P
15104 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15105
15106 #undef TARGET_LEGITIMATE_CONSTANT_P
15107 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15108
15109 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15110 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15111 aarch64_legitimize_address_displacement
15112
15113 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15114 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15115
15116 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15117 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15118 aarch64_libgcc_floating_mode_supported_p
15119
15120 #undef TARGET_MANGLE_TYPE
15121 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15122
15123 #undef TARGET_MEMORY_MOVE_COST
15124 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15125
15126 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15127 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15128
15129 #undef TARGET_MUST_PASS_IN_STACK
15130 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15131
15132 /* This target hook should return true if accesses to volatile bitfields
15133 should use the narrowest mode possible. It should return false if these
15134 accesses should use the bitfield container type. */
15135 #undef TARGET_NARROW_VOLATILE_BITFIELD
15136 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15137
15138 #undef TARGET_OPTION_OVERRIDE
15139 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15140
15141 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15142 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15143 aarch64_override_options_after_change
15144
15145 #undef TARGET_OPTION_SAVE
15146 #define TARGET_OPTION_SAVE aarch64_option_save
15147
15148 #undef TARGET_OPTION_RESTORE
15149 #define TARGET_OPTION_RESTORE aarch64_option_restore
15150
15151 #undef TARGET_OPTION_PRINT
15152 #define TARGET_OPTION_PRINT aarch64_option_print
15153
15154 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15155 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15156
15157 #undef TARGET_SET_CURRENT_FUNCTION
15158 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15159
15160 #undef TARGET_PASS_BY_REFERENCE
15161 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15162
15163 #undef TARGET_PREFERRED_RELOAD_CLASS
15164 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15165
15166 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15167 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15168
15169 #undef TARGET_PROMOTED_TYPE
15170 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15171
15172 #undef TARGET_SECONDARY_RELOAD
15173 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15174
15175 #undef TARGET_SHIFT_TRUNCATION_MASK
15176 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15177
15178 #undef TARGET_SETUP_INCOMING_VARARGS
15179 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15180
15181 #undef TARGET_STRUCT_VALUE_RTX
15182 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15183
15184 #undef TARGET_REGISTER_MOVE_COST
15185 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15186
15187 #undef TARGET_RETURN_IN_MEMORY
15188 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15189
15190 #undef TARGET_RETURN_IN_MSB
15191 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15192
15193 #undef TARGET_RTX_COSTS
15194 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15195
15196 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15197 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15198
15199 #undef TARGET_SCHED_ISSUE_RATE
15200 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15201
15202 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15203 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15204 aarch64_sched_first_cycle_multipass_dfa_lookahead
15205
15206 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15207 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15208 aarch64_first_cycle_multipass_dfa_lookahead_guard
15209
15210 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15211 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15212 aarch64_get_separate_components
15213
15214 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15215 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15216 aarch64_components_for_bb
15217
15218 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15219 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15220 aarch64_disqualify_components
15221
15222 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15223 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15224 aarch64_emit_prologue_components
15225
15226 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15227 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15228 aarch64_emit_epilogue_components
15229
15230 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15231 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15232 aarch64_set_handled_components
15233
15234 #undef TARGET_TRAMPOLINE_INIT
15235 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15236
15237 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15238 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15239
15240 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15241 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15242
15243 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15244 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15245 aarch64_builtin_support_vector_misalignment
15246
15247 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15248 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15249
15250 #undef TARGET_VECTORIZE_ADD_STMT_COST
15251 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15252
15253 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15254 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15255 aarch64_builtin_vectorization_cost
15256
15257 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15258 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15259
15260 #undef TARGET_VECTORIZE_BUILTINS
15261 #define TARGET_VECTORIZE_BUILTINS
15262
15263 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15264 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15265 aarch64_builtin_vectorized_function
15266
15267 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15268 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15269 aarch64_autovectorize_vector_sizes
15270
15271 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15272 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15273 aarch64_atomic_assign_expand_fenv
15274
15275 /* Section anchor support. */
15276
15277 #undef TARGET_MIN_ANCHOR_OFFSET
15278 #define TARGET_MIN_ANCHOR_OFFSET -256
15279
15280 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15281 byte offset; we can do much more for larger data types, but have no way
15282 to determine the size of the access. We assume accesses are aligned. */
15283 #undef TARGET_MAX_ANCHOR_OFFSET
15284 #define TARGET_MAX_ANCHOR_OFFSET 4095
15285
15286 #undef TARGET_VECTOR_ALIGNMENT
15287 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15288
15289 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15290 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15291 aarch64_simd_vector_alignment_reachable
15292
15293 /* vec_perm support. */
15294
15295 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15296 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15297 aarch64_vectorize_vec_perm_const_ok
15298
15299 #undef TARGET_INIT_LIBFUNCS
15300 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15301
15302 #undef TARGET_FIXED_CONDITION_CODE_REGS
15303 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15304
15305 #undef TARGET_FLAGS_REGNUM
15306 #define TARGET_FLAGS_REGNUM CC_REGNUM
15307
15308 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15309 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15310
15311 #undef TARGET_ASAN_SHADOW_OFFSET
15312 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15313
15314 #undef TARGET_LEGITIMIZE_ADDRESS
15315 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15316
15317 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15318 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15319 aarch64_use_by_pieces_infrastructure_p
15320
15321 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15322 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15323
15324 #undef TARGET_CAN_USE_DOLOOP_P
15325 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15326
15327 #undef TARGET_SCHED_ADJUST_PRIORITY
15328 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15329
15330 #undef TARGET_SCHED_MACRO_FUSION_P
15331 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15332
15333 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15334 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15335
15336 #undef TARGET_SCHED_FUSION_PRIORITY
15337 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15338
15339 #undef TARGET_UNSPEC_MAY_TRAP_P
15340 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15341
15342 #undef TARGET_USE_PSEUDO_PIC_REG
15343 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15344
15345 #undef TARGET_PRINT_OPERAND
15346 #define TARGET_PRINT_OPERAND aarch64_print_operand
15347
15348 #undef TARGET_PRINT_OPERAND_ADDRESS
15349 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15350
15351 #undef TARGET_OPTAB_SUPPORTED_P
15352 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15353
15354 #undef TARGET_OMIT_STRUCT_RETURN_REG
15355 #define TARGET_OMIT_STRUCT_RETURN_REG true
15356
15357 /* The architecture reserves bits 0 and 1, so use bit 2 (value 4) for descriptors. */
15358 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15359 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15360
15361 #if CHECKING_P
15362 #undef TARGET_RUN_TARGET_SELFTESTS
15363 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15364 #endif /* #if CHECKING_P */
15365
15366 struct gcc_target targetm = TARGET_INITIALIZER;
15367
15368 #include "gt-aarch64.h"